In [1]:
import pandas as pd
import numpy as np
from pprint import pprint

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

import scripts.data_pull as dp
import scripts.preprocess_text as ppt
import scripts.blosc_interface as bi

https://stackoverflow.com/questions/26826002/adding-words-to-stop-words-list-in-tfidfvectorizer-in-sklearn
https://stackoverflow.com/questions/50756085/how-to-print-the-progress-of-a-list-comprehension-in-python
https://stackoverflow.com/questions/57983431/whats-the-most-space-efficient-way-to-compress-serialized-python-data

In [2]:
full_corpus = bi.blosc_read("./data/tokenized_corpus.dat")

In [4]:
full_corpus.head()

Unnamed: 0,art_title,art_author,art_date,art_topic,art_link,art_source,text,art_bias,stemmed_tokens,text_tokens,tokens,stems,text_lem
0,The emerging markets e-commerce opportunity,Christoph Ungerer,2021-03-26,future-development,https://www.brookings.edu/blog/future-developm...,Brookings Institute,while e-commerce giants such as amazon dominat...,left-wing,"[e-commerc, giant, amazon, domin, headlin, cov...","[e-commerce, giants, amazon, dominate, headlin...","e-commerce, giants, amazon, dominate, headline...","e-commerc, giant, amazon, domin, headlin, covi...","e-commerce, giants, amazon, dominate, headline..."
1,"""1619"" Pulitzer Will Boost Socialist Teaching ...",Mike Gonzalez,2020-05-11,education,https://www.heritage.org/education/commentary/...,Heritage Commentary,the pulitzer prize board this week awarded its...,right-wing,"[pulitz, prize, board, week, award, commentari...","[pulitzer, prize, board, week, awarded, commen...","pulitzer, prize, board, week, awarded, comment...","pulitz, prize, board, week, award, commentari,...","pulitzer, prize, board, week, awarded, comment..."
2,"""1983"" and ""The Brink"" Review: The Most Danger...","Lee Edwards, Ph.D.",2018-08-10,defense,https://www.heritage.org/defense/commentary/19...,Heritage Commentary,most historians agree that the world came clos...,right-wing,"[historian, agre, world, came, closest, nuclea...","[historians, agree, world, came, closest, nucl...","historians, agree, world, came, closest, nucle...","historian, agre, world, came, closest, nuclear...","historians, agree, world, came, closest, nucle..."
3,"""60 Minutes"" Snubs the Facts on Education",Jonathan Butcher,2018-03-13,education,https://www.heritage.org/education/commentary/...,Heritage Commentary,beth richardson is committed to her son’s succ...,right-wing,"[beth, richardson, commit, son, success, expec...","[beth, richardson, committed, son, success, ex...","beth, richardson, committed, son, success, exp...","beth, richardson, commit, son, success, expect...","beth, richardson, committed, son, success, exp..."
4,"""As Israel and the Arabs Battle, Moscow Collec...",James Phillips,1983-09-20,europe,https://www.heritage.org/europe/report/israel-...,Heritage Report,"i i 291 september 20, 1983 as israel and the a...",right-wing,"[septemb, israel, arab, ban4, moscow, collect,...","[september, israel, arabs, ban4, moscow, colle...","september, israel, arabs, ban4, moscow, collec...","septemb, israel, arab, ban4, moscow, collect, ...","september, israel, arabs, ban4, moscow, collec..."


In [6]:
x_text = full_corpus.text_lem
y_bias = full_corpus.art_bias

x_train, x_test, y_train, y_test = train_test_split(
    x_text, y_bias,
    test_size=.2,
    random_state=42
)


In [None]:
nb_clf = Pipeline([
    ('vect', CountVectorizer(max_features=200)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [None]:
nb_clf.fit(x_train, y_train)

In [None]:
predicted = nb_clf.predict(x_test)
np.mean(predicted == y_test)

print(metrics.classification_report(y_test, predicted))

metrics.confusion_matrix(y_test, predicted)

In [None]:
coef_values = pd.DataFrame(nb_clf[2].coef_, columns=nb_clf[0].get_feature_names_out())

In [None]:
nb_clf.get_params()

In [None]:
nb_clf[0].get_feature_names_out()

In [8]:
log_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ("scaler", StandardScaler()),
    ("pca", PCA()),
    ('log_clf', LogisticRegression(penalty="l1")),
])

log_param_grid = {
    "vect__ngram_range": [(1, 1), (1, 2)],
    "vect__max_features": [200, 300, 400],
    "pca__n_components": [5, 15, 30, 45, 60],
    "log_clf__C": np.logspace(-4, 4, 4),
}

log_search = GridSearchCV(log_clf, log_param_grid, cv=5, n_jobs=-1)

In [9]:
log_search.fit(x_train, y_train)

In [6]:
random_search = RandomizedSearchCV(
    estimator=log_clf,
    param_distributions=log_param_grid,
    n_iter=40,
    random_state=42,
    cv=5,
    n_jobs=-1,
    verbose=10,
)

print("Performing grid search...")
print("Hyperparameters to be evaluated:")
pprint(log_param_grid)

Performing grid search...
Hyperparameters to be evaluated:
{'log_clf__C': array([1.00000000e-04, 4.64158883e-02, 2.15443469e+01, 1.00000000e+04]),
 'pca__n_components': [5, 15, 30, 45, 60],
 'vect__max_features': [200, 300, 400],
 'vect__ngram_range': [(1, 1), (1, 2)]}


In [None]:
from time import time

t0 = time()
random_search.fit(x_train, y_train)
print(f"Done in {time() - t0:.3f}s")

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV 1/5; 1/40] START log_clf__C=0.046415888336127774, pca__n_components=30, vect__max_features=300, vect__ngram_range=(1, 1)
[CV 1/5; 1/40] END log_clf__C=0.046415888336127774, pca__n_components=30, vect__max_features=300, vect__ngram_range=(1, 1);, score=nan total time=   8.3s
[CV 2/5; 1/40] START log_clf__C=0.046415888336127774, pca__n_components=30, vect__max_features=300, vect__ngram_range=(1, 1)
[CV 2/5; 1/40] END log_clf__C=0.046415888336127774, pca__n_components=30, vect__max_features=300, vect__ngram_range=(1, 1);, score=nan total time=   8.0s
[CV 3/5; 1/40] START log_clf__C=0.046415888336127774, pca__n_components=30, vect__max_features=300, vect__ngram_range=(1, 1)
[CV 3/5; 1/40] END log_clf__C=0.046415888336127774, pca__n_components=30, vect__max_features=300, vect__ngram_range=(1, 1);, score=nan total time=   8.1s
[CV 4/5; 1/40] START log_clf__C=0.046415888336127774, pca__n_components=30, vect__max_features=300, v

[CV 1/5; 7/40] END log_clf__C=21.54434690031882, pca__n_components=30, vect__max_features=200, vect__ngram_range=(1, 2);, score=nan total time=  48.5s
[CV 2/5; 7/40] START log_clf__C=21.54434690031882, pca__n_components=30, vect__max_features=200, vect__ngram_range=(1, 2)
[CV 2/5; 7/40] END log_clf__C=21.54434690031882, pca__n_components=30, vect__max_features=200, vect__ngram_range=(1, 2);, score=nan total time=  47.3s
[CV 3/5; 7/40] START log_clf__C=21.54434690031882, pca__n_components=30, vect__max_features=200, vect__ngram_range=(1, 2)
[CV 3/5; 7/40] END log_clf__C=21.54434690031882, pca__n_components=30, vect__max_features=200, vect__ngram_range=(1, 2);, score=nan total time=  48.9s
[CV 4/5; 7/40] START log_clf__C=21.54434690031882, pca__n_components=30, vect__max_features=200, vect__ngram_range=(1, 2)
[CV 4/5; 7/40] END log_clf__C=21.54434690031882, pca__n_components=30, vect__max_features=200, vect__ngram_range=(1, 2);, score=nan total time=  48.2s
[CV 5/5; 7/40] START log_clf__

In [None]:
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)