<h1>Classifying news based on their content</h1>

<ol><b>
    <li>Vectorizing text data</li>
    <li>Dimensiality reduction</li>
    <li>Training classifiers</li>
</b></ol> 

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score
import razdel
import pymorphy2

In [2]:
np.random.seed(42)
pd.set_option('max_colwidth', 120)

<h1>Data</h1>

In [3]:
df = pd.read_csv('fakenews_dataset.csv')

In [None]:
df.head()

In [None]:
df.info()

In [None]:
print(df['fake'].value_counts(), '\n')
print(df['fake'].value_counts(normalize=True))

In [None]:
print('Fake')
print((df[df['fake'] == 1]['content'].apply(len)).agg([np.min, np.max, np.mean, np.median]), '\n')

print('Not fake')
print((df[df['fake'] == 0]['content'].apply(len)).agg([np.min, np.max, np.mean, np.median]))

<h1>Vectorization</h1>

In [4]:
morph = pymorphy2.MorphAnalyzer(result_type=None)
def lemmatize(word):
    return morph.normal_forms(word)[0]

def tokenize(text):
    return [lemmatize(token.text) for token in razdel.tokenize(text)]
        
# class DenseTransformer(TransformerMixin):
#     def fit(self, X, y=None, **fit_params):
#         return self

#     def transform(self, X, y=None, **fit_params):
#         return X.todense()

In [None]:
vectorizer = TfidfVectorizer(tokenizer=tokenize, min_df=0.0005, max_df=1.0)  # with min_df = 0.001: 9647 terms;;; 0.0005: 15256;;; 1: 59767
tfidf_docs = vectorizer.fit_transform(df['content'])

In [None]:
len(vectorizer.vocabulary_) + len(vectorizer.stop_words_)

In [None]:
tfidf_docs.shape

<h1>Dimensiality reduction (LSA)</h1>

In [5]:
from sklearn.decomposition import TruncatedSVD

In [None]:
def get_MSE(reconstructed_tfidf, tfidf_docs):
    return np.sqrt(((tfidf_docs.toarray() - reconstructed_tfidf).flatten()**2).mean()).round(4)

In [None]:
errors = []

params = [1, 2, 5, 10, 20, 50, 100, 150, 200]

for param in params:
    lsa = TruncatedSVD(n_components=param, n_iter=5, random_state=42)
    dtm = lsa.fit_transform(tfidf_docs)
    reconstructed_tfidf = lsa.inverse_transform(dtm)
    error = get_MSE(reconstructed_tfidf, tfidf_docs)
    errors.append(error)

In [None]:
from matplotlib import pyplot as plt

plt.ylabel('MSE')
plt.xlabel('n_components')
plt.scatter(params, errors)

In [None]:
lsa = TruncatedSVD(n_components=100, n_iter=5, random_state=42)

<h1>Classification</h1>

<h1>Baseline Model</h1>

In [None]:
# Baseline model - Zero rule classifier
# has accuracy 0.91
# but 0 precision

In [14]:
X_train, X_test, y_train, y_test = train_test_split(dtm, df['fake'], 
                                                    test_size=0.4, random_state=42)

NameError: name 'dtm' is not defined

<h1>Logistic regression</h1>

In [6]:
from sklearn.linear_model import LogisticRegression

In [None]:
lr_clf = LogisticRegression(class_weight={1: 10})
lr_clf.fit(X_train, y_train)

In [None]:
lr_clf.score(X_test, y_test)

In [None]:
prediction_lr = lr_clf.predict(X_test)
prediction_lr[prediction_lr < 0] = 0
prediction_lr[prediction_lr > 0] = 1

print(f'Recall: {recall_score(y_test, prediction_lr)}')
print(f'F1: {f1_score(y_test, prediction_lr)}')

<h1>SVC</h1>

In [7]:
from sklearn.svm import SVC

In [None]:
svc_clf = SVC(kernel='linear', random_state=42, class_weight={1: 10})

In [None]:
svc_clf.fit(X_train, y_train)

In [None]:
svc_clf.score(X_test, y_test)

In [None]:
prediction_svc = svc_clf.predict(X_test)

print(f'Recall: {recall_score(y_test, prediction_svc)}')
print(f'F1: {f1_score(y_test, prediction_svc)}')

<h1>Random Forest</h1>

In [8]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier(n_estimators=100, class_weight={1: 10})

In [None]:
rf.fit(X_train, y_train)

In [None]:
rf.score(X_test, y_test)

In [None]:
prediction_rf = rf.predict(X_test)
print(f'Recall: {recall_score(y_test, prediction_rf)}')
print(f'F1: {f1_score(y_test, prediction_rf)}')

<h1>Pipelines</h1>

In [9]:
from sklearn.pipeline import Pipeline

In [112]:
X_train, X_test, y_train, y_test = train_test_split(df['content'], df['fake'], test_size=0.4, random_state=42)
X_full = np.hstack((X_train, X_test))

In [107]:
class TrainTestTransformer(TransformerMixin):
    def __init__(self) -> None:
        super().__init__()
        self.train = True

    def fit(self, X, y=None, rs=42, train=True, **fit_params):
        if train:
            self.train = True
        self.X_train, self.X_test = train_test_split(X, test_size=0.4, shuffle=False, random_state=rs)
        return self

    def transform(self, X, y=None, **fit_params):
        if self.train:
            self.train = False
            return self.X_train
        else:
            return self.X_test

In [113]:
estimators = [('vectorize', TfidfVectorizer(tokenizer=tokenize, min_df=1, max_df=1.0)), 
              ('reduce_dim', TruncatedSVD(n_components=25, random_state=42)),
              ('train_test_split', TrainTestTransformer()), 
              ('clf', SVC(kernel='linear', class_weight={1: 10}))]

pipe = Pipeline(estimators, verbose=True)

In [114]:
pipe.fit(X_full, y_train)

[Pipeline] ......... (step 1 of 4) Processing vectorize, total= 8.4min
[Pipeline] ........ (step 2 of 4) Processing reduce_dim, total=   1.9s
[Pipeline] .. (step 3 of 4) Processing train_test_split, total=   0.0s
[Pipeline] ............... (step 4 of 4) Processing clf, total=   0.1s


Pipeline(steps=[('vectorize',
                 TfidfVectorizer(tokenizer=<function tokenize at 0x0000026FD5C91DC8>)),
                ('reduce_dim', TruncatedSVD(n_components=25, random_state=42)),
                ('train_test_split',
                 <__main__.TrainTestTransformer object at 0x0000026FDC308808>),
                ('clf', SVC(class_weight={1: 10}))],
         verbose=True)

In [115]:
pred = pipe.predict(X_test)

In [116]:
print(f'Accuracy: {accuracy_score(y_test, pred)}')
print(f'Recall: {recall_score(y_test, pred)}')
print(f'F1: {f1_score(y_test, pred)}')

Accuracy: 0.9975476839237057
Recall: 0.9753424657534246
F1: 0.9875173370319001


<h1>Model selection with Grid Search</h1>

In [10]:
from sklearn.model_selection import GridSearchCV

In [11]:
param_grid = dict(reduce_dim__n_components=[25, 50, 100, 200],
                  clf=[LogisticRegression(class_weight={1: 10}), SVC(kernel='rbf', class_weight={1: 10}), 
                       SVC(kernel='linear', class_weight={1: 10}), 
                       SVC(kernel='sigmoid', class_weight={1: 10}), 
                       RandomForestClassifier(n_estimators=100, class_weight={1: 10}), 
                       RandomForestClassifier(n_estimators=150, class_weight={1: 10}), 
                       RandomForestClassifier(n_estimators=50, class_weight={1: 10})])

estimators = [('reduce_dim', TruncatedSVD(random_state=42)),
              ('clf', None)]

In [12]:
pipe = Pipeline(estimators)

grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring=['accuracy', 'recall', 'f1'], 
                           refit='recall', verbose=0)

In [13]:
for min_df in [0.0001, 0.0005, 0.001, 0.002]:
    vectorizer = TfidfVectorizer(tokenizer=tokenize, min_df=min_df, max_df=1.0)
    tfidf_docs = vectorizer.fit_transform(df['content'])
    X_train, X_test, y_train, y_test = train_test_split(tfidf_docs, df['fake'], test_size=0.4, random_state=42)
    grid_search.fit(X_train, y_train)
    print(f'\n\nVectorization with min_df = {min_df}')
    print(grid_search.best_estimator_.get_params())
    pred = grid_search.best_estimator_.predict(X_test)

    print(f'Accuracy: {accuracy_score(y_test, pred)}')
    print(f'Recall: {recall_score(y_test, pred)}')
    print(f'F1: {f1_score(y_test, pred)}')
    print('-'*100, '\n')



Vectorization with min_df = 0.0001
{'memory': None, 'steps': [('reduce_dim', TruncatedSVD(n_components=25, random_state=42)), ('clf', SVC(class_weight={1: 10}))], 'verbose': False, 'reduce_dim': TruncatedSVD(n_components=25, random_state=42), 'clf': SVC(class_weight={1: 10}), 'reduce_dim__algorithm': 'randomized', 'reduce_dim__n_components': 25, 'reduce_dim__n_iter': 5, 'reduce_dim__random_state': 42, 'reduce_dim__tol': 0.0, 'clf__C': 1.0, 'clf__break_ties': False, 'clf__cache_size': 200, 'clf__class_weight': {1: 10}, 'clf__coef0': 0.0, 'clf__decision_function_shape': 'ovr', 'clf__degree': 3, 'clf__gamma': 'scale', 'clf__kernel': 'rbf', 'clf__max_iter': -1, 'clf__probability': False, 'clf__random_state': None, 'clf__shrinking': True, 'clf__tol': 0.001, 'clf__verbose': False}
Accuracy: 0.9978201634877384
Recall: 0.9780821917808219
F1: 0.9889196675900276
---------------------------------------------------------------------------------------------------- 



Vectorization with min_df = 

Vectorization with min_df = 0.0001
{'memory': None, 'steps': [('reduce_dim', TruncatedSVD(n_components=25, random_state=42)), ('clf', SVC(class_weight={1: 10}))], 'verbose': False, 'reduce_dim': TruncatedSVD(n_components=25, random_state=42), 'clf': SVC(class_weight={1: 10}), 'reduce_dim__algorithm': 'randomized', 'reduce_dim__n_components': 25, 'reduce_dim__n_iter': 5, 'reduce_dim__random_state': 42, 'reduce_dim__tol': 0.0, 'clf__C': 1.0, 'clf__break_ties': False, 'clf__cache_size': 200, 'clf__class_weight': {1: 10}, 'clf__coef0': 0.0, 'clf__decision_function_shape': 'ovr', 'clf__degree': 3, 'clf__gamma': 'scale', 'clf__kernel': 'rbf', 'clf__max_iter': -1, 'clf__probability': False, 'clf__random_state': None, 'clf__shrinking': True, 'clf__tol': 0.001, 'clf__verbose': False}
Accuracy: 0.9978201634877384
Recall: 0.9780821917808219
F1: 0.9889196675900276