In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Reading the dataset

data = pd.read_csv('data/bbc-text.csv')
data.head(5)

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [3]:
# Size of the dataset

data.size

4450

In [4]:
# Checking for null value

data.isna().sum()

category    0
text        0
dtype: int64

In [9]:
# Total categories

cat_lookup = dict(zip(data['category'].unique(), [0, 1, 2, 3, 4]))
cat_lookup

{'tech': 0, 'business': 1, 'sport': 2, 'entertainment': 3, 'politics': 4}

In [13]:
# Total number of words

num_words = data['text'].apply(lambda x: len(x.split(' '))).sum()

print(f"Total # of words: {num_words}")

Total # of words: 933960


### Model development

In [14]:
# Train Test Split

from sklearn.model_selection import train_test_split

X = data['text']
y = data['category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

#### Logistics Regression with TfidfTransformer

In [16]:
from sklearn.linear_model import LogisticRegression

lr = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
lr.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(C=100000.0, n_jobs=1))])

In [17]:
from sklearn.metrics import classification_report, accuracy_score
y_pred = lr.predict(X_test)

print(f'accuracy {accuracy_score(y_pred, y_test)}')
print(classification_report(y_test, y_pred,target_names=data.category.unique()))

accuracy 0.9707865168539326
               precision    recall  f1-score   support

         tech       0.97      0.91      0.94       101
     business       0.98      0.98      0.98        81
        sport       0.94      0.99      0.96        83
entertainment       0.99      1.00      0.99        98
     politics       0.98      0.99      0.98        82

     accuracy                           0.97       445
    macro avg       0.97      0.97      0.97       445
 weighted avg       0.97      0.97      0.97       445



#### SGDClassifier with TfidfTransformer

In [18]:
from sklearn.linear_model import SGDClassifier

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf',
                 SGDClassifier(alpha=0.001, max_iter=5, random_state=42,
                               tol=None))])

In [19]:
from sklearn.metrics import classification_report, accuracy_score
y_pred = sgd.predict(X_test)

print(f'accuracy {accuracy_score(y_pred, y_test)}')
print(classification_report(y_test, y_pred,target_names=data.category.unique()))

accuracy 0.9730337078651685
               precision    recall  f1-score   support

         tech       0.96      0.93      0.94       101
     business       0.99      0.96      0.97        81
        sport       0.95      0.99      0.97        83
entertainment       0.99      1.00      0.99        98
     politics       0.98      0.99      0.98        82

     accuracy                           0.97       445
    macro avg       0.97      0.97      0.97       445
 weighted avg       0.97      0.97      0.97       445

