In [7]:
from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from utils.string_clean import CleanStrings

In [3]:
# Raw articles CSV, given relative to the user's home directory
# (it is joined onto HOME_PATH when the data is loaded).
PATH_TO_RAW_DATA = 'art_data/train_data/bbc_articles.csv'

# Current user's home directory, resolved at run time so no absolute,
# machine-specific path is baked into the notebook.
HOME_PATH = Path.home()

In [4]:
# Build the full location of the raw CSV (home directory + relative data path)
# and read it into a DataFrame.
raw_csv_path = HOME_PATH / PATH_TO_RAW_DATA
df = pd.read_csv(raw_csv_path)

Split the data into training and test sets

In [6]:
# Features: the article column plus its length column. The .copy() detaches X
# from df so later changes to X cannot touch the loaded frame.
feature_columns = ['article', 'article_len']
X = df[feature_columns].copy()

# Target: the label column.
y = df['label']

# Hold out 20% of the rows for testing. Stratifying on y keeps the label
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

Create a pipeline to classify each article

In [9]:
# Ordered (name, estimator) stages; each stage feeds its output to the next.
#   1. Clean_Articles      - project-local CleanStrings transformer
#                            (presumably normalises the raw text; see utils.string_clean)
#   2. TF_IDF              - turns the text into TF-IDF feature vectors
#   3. Logistic_Regression - the classifier fitted on those vectors
pipeline_steps = [
    ('Clean_Articles', CleanStrings()),
    ('TF_IDF', TfidfVectorizer()),
    ('Logistic_Regression', LogisticRegression()),
]

model_v1 = Pipeline(steps=pipeline_steps)

Fit the model

In [11]:
# Only the article column is passed to the pipeline; the other feature column
# in X_train is not used by this model.
train_articles = X_train['article']

# Left as the last expression so the fitted pipeline is displayed by the cell.
model_v1.fit(train_articles, y_train)



Pipeline(memory=None,
     steps=[('Clean_Articles', CleanStrings(alphanumeric={'e', 'Y', "'", 'J', '`', '\t', '0', '[', 'D', 'q', '!', '$', ')', 'c', 'A', '7', 'm', 'X', ',', 'G', 'p', 'w', '^', 'l', 'd', 'h', '1', 'C', 'W', 'x', '|', '-', 'B', 'f', '>', 'a', '#', '+', '\x0c', 's', '\\', 'j', 'u', 'E', '6', ' ', '\x0b', 'b',...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

What is the training accuracy?

In [27]:
# Accuracy on the same rows the model was fitted on. This is a sanity check,
# not an estimate of how the model performs on unseen articles.
train_acc = accuracy_score(y_train, model_v1.predict(X_train['article']))

# Use the built-in round() instead of the `.round()` method: the method exists
# only on NumPy scalars, whereas round() also accepts a plain Python float,
# and the printed text is the same.
print(f"Train Acc: {round(train_acc, 3)}")

Train Acc: 0.997


In [28]:
# Predict labels for the held-out articles; only the article column is fed to
# the pipeline, mirroring how it was fitted.
test_articles = X_test['article']
y_pred = model_v1.predict(test_articles)

In [29]:
# Accuracy of the predictions on the held-out test split.
test_acc = accuracy_score(y_test, y_pred)

# Use the built-in round() instead of the `.round()` method: the method exists
# only on NumPy scalars, whereas round() also accepts a plain Python float,
# and the printed text is the same.
print(f"Test Acc: {round(test_acc, 3)}")

Test Acc: 0.973
