In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD

from sklearn import metrics

import scripts.blosc_interface as bi
import scripts.corpus_split as cs

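References:
- Adding words to the stop-words list in TfidfVectorizer: https://stackoverflow.com/questions/26826002/adding-words-to-stop-words-list-in-tfidfvectorizer-in-sklearn
- Printing the progress of a list comprehension: https://stackoverflow.com/questions/50756085/how-to-print-the-progress-of-a-list-comprehension-in-python
- Space-efficient compression of serialized Python data: https://stackoverflow.com/questions/57983431/whats-the-most-space-efficient-way-to-compress-serialized-python-data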

In [2]:
# Load the tokenized corpus through the project's blosc helper.
# NOTE(review): the on-disk format and the returned type are defined in
# scripts/blosc_interface, which is not visible here -- confirm there.
full_corpus = bi.blosc_read(
    "./data/tokenized_corpus.dat",
)

In [3]:
# Split the corpus into the four pieces used by every model cell below:
# x_* is what the pipelines are fitted on / predict from, y_* are the labels.
# NOTE(review): split ratio and seeding live in scripts/corpus_split -- confirm
# the split is deterministic, otherwise the reports below change on re-run.
(
    x_train,
    x_test,
    y_train,
    y_test,
) = cs.corpus_split(full_corpus)


Splitting the Data into Train and Test


In [4]:
# Read the saved logistic-regression search results and keep the rows whose
# rank_test_score is 5 or better.
log_reg_cv_results = bi.blosc_read("./data/log_grid_search_result.dat")
log_reg_top5 = log_reg_cv_results[log_reg_cv_results["rank_test_score"].le(5)]

# Last expression of the cell: rendered as the table below
log_reg_top5

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_vect__ngram_range,param_vect__max_features,param_svd__n_components,param_log_clf__C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,64.123026,3.714061,6.076129,1.974486,"(1, 2)",400,30,10000.0,"{'vect__ngram_range': (1, 2), 'vect__max_featu...",0.848319,0.839127,0.827636,0.83769,0.850043,0.840563,0.008093,5
14,58.169902,0.920133,4.012292,0.1958,"(1, 2)",400,60,21.544347,"{'vect__ngram_range': (1, 2), 'vect__max_featu...",0.867567,0.857512,0.852054,0.846883,0.8736,0.859523,0.009823,1
18,12.827075,0.430348,2.242276,0.20586,"(1, 1)",400,60,21.544347,"{'vect__ngram_range': (1, 1), 'vect__max_featu...",0.863832,0.851767,0.846309,0.844872,0.865843,0.854525,0.008752,3
35,11.156855,0.467028,2.069154,0.159691,"(1, 1)",300,45,10000.0,"{'vect__ngram_range': (1, 1), 'vect__max_featu...",0.852054,0.844585,0.839989,0.833668,0.859523,0.845964,0.009052,4
38,12.062435,0.280006,2.068953,0.173405,"(1, 1)",400,60,10000.0,"{'vect__ngram_range': (1, 1), 'vect__max_featu...",0.864694,0.854352,0.850618,0.846596,0.86728,0.856708,0.008005,2


In [5]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic-regression pipeline: apart from the seed, every step keeps
# its scikit-learn defaults, so this report is the reference point for the
# tuned model.
# TruncatedSVD uses a randomized solver by default; pinning random_state makes
# the report reproducible across kernel restarts.
log_clf = Pipeline([
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("svd", TruncatedSVD(random_state=42)),
    ("log_clf", LogisticRegression()),
])

# Fit on the training split
log_clf.fit(x_train, y_train)

# Predict the held-out test split
predicted = log_clf.predict(x_test)

# Per-class precision / recall / F1; the report also contains the overall
# accuracy, so no separate accuracy expression is needed.
print(metrics.classification_report(y_test, predicted))

# Last expression of the cell: displayed as the cell output
metrics.confusion_matrix(y_test, predicted)


              precision    recall  f1-score   support

   left-wing       0.55      0.37      0.44      2097
  right-wing       0.55      0.72      0.62      2255

    accuracy                           0.55      4352
   macro avg       0.55      0.55      0.53      4352
weighted avg       0.55      0.55      0.54      4352



array([[ 777, 1320],
       [ 630, 1625]])

In [6]:
# Tuned logistic-regression pipeline, using the hyper-parameters of the
# rank-1 row of log_reg_cv_results: unigrams + bigrams, 400 features,
# 60 SVD components, C = 21.544347.
# random_state is pinned on the two stochastic steps (randomized SVD and the
# saga solver) so the report is reproducible across kernel restarts.
log_opt_clf = Pipeline([
    ("vect", CountVectorizer(max_features=400, ngram_range=(1, 2))),
    ("tfidf", TfidfTransformer()),
    ("svd", TruncatedSVD(n_components=60, random_state=42)),
    # The L1 (lasso) penalty needs a solver other than the default; saga supports it.
    ("log_clf", LogisticRegression(penalty="l1", solver="saga", C=21.544347,
                                   max_iter=1000, random_state=42)),
])

# Fit on the training split
log_opt_clf.fit(x_train, y_train)

# Predict the held-out test split
predicted = log_opt_clf.predict(x_test)

# Per-class precision / recall / F1; the report also contains the overall
# accuracy, so no separate accuracy expression is needed.
print(metrics.classification_report(y_test, predicted))

# Last expression of the cell: displayed as the cell output
metrics.confusion_matrix(y_test, predicted)

              precision    recall  f1-score   support

   left-wing       0.86      0.84      0.85      2097
  right-wing       0.86      0.88      0.87      2255

    accuracy                           0.86      4352
   macro avg       0.86      0.86      0.86      4352
weighted avg       0.86      0.86      0.86      4352



array([[1762,  335],
       [ 278, 1977]])

Source of the plotting code in the next cell (scikit-learn "plot_digits_pipe" pipeline example): https://scikit-learn.org/stable/auto_examples/compose/plot_digits_pipe.html#sphx-glr-auto-examples-compose-plot-digits-pipe-py

In [7]:
# Adapted from scikit-learn's "plot_digits_pipe" example.
# Top panel: explained-variance spectrum of the fitted Truncated SVD step.
# Bottom panel: best cross-validated score for each n_components value tried.
import matplotlib.pyplot as plt

# Use the SVD step of the *tuned* pipeline: the baseline pipeline keeps the
# default of 2 components, which would draw the "n_components chosen" marker
# at 2 instead of at the selected value. Looking the step up by name avoids
# depending on its position in the pipeline.
svd_step = log_opt_clf.named_steps["svd"]

fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(6, 6))

ax0.plot(
    np.arange(1, svd_step.n_components + 1),
    svd_step.explained_variance_ratio_,
    "+",
    linewidth=2,
)
# The decomposition is a TruncatedSVD, not a PCA, so label it as such.
ax0.set_ylabel("Truncated SVD explained variance ratio")

ax0.axvline(
    svd_step.n_components,
    linestyle=":",
    label="n_components chosen",
)
ax0.legend(prop=dict(size=12))

# For each number of components, find the best classifier results
results = log_reg_cv_results
components_col = "param_svd__n_components"
best_clfs = results.groupby(components_col).apply(
    lambda g: g.nlargest(1, "mean_test_score")
)

best_clfs.plot(
    x=components_col, y="mean_test_score", yerr="std_test_score", legend=False, ax=ax1
)
ax1.set_ylabel("Classification accuracy (val)")
ax1.set_xlabel("n_components")

# The x axis is shared, so this range applies to both panels.
ax1.set_xlim(-1, 70)

plt.tight_layout()
plt.show()


In [8]:
# Read the saved LinearSVC search results and keep the rows whose
# rank_test_score is 5 or better (tied candidates share a rank).
svc_reg_cv_results = bi.blosc_read("./data/svc_grid_search_result.dat")
svc_top5 = svc_reg_cv_results[svc_reg_cv_results["rank_test_score"].le(5)]

# Last expression of the cell: rendered as the table below
svc_top5

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_vect__ngram_range,param_vect__max_features,param_svd__n_components,param_svc_clf__penalty,param_svc_clf__C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
5,9.538747,0.252239,1.554701,0.083288,"(1, 1)",400,45,l2,21.544347,"{'vect__ngram_range': (1, 1), 'vect__max_featu...",0.857799,0.847745,0.841712,0.840563,0.862396,0.850043,0.008687,2
12,8.757007,0.317715,1.550521,0.073838,"(1, 1)",200,60,l2,0.046416,"{'vect__ngram_range': (1, 1), 'vect__max_featu...",0.846021,0.832232,0.829934,0.826774,0.843149,0.835622,0.007575,5
14,40.221779,1.873627,3.265838,0.243435,"(1, 2)",400,45,l2,0.046416,"{'vect__ngram_range': (1, 2), 'vect__max_featu...",0.860672,0.847745,0.841999,0.84085,0.860385,0.85033,0.008649,1
24,10.407523,0.389623,1.543918,0.103774,"(1, 1)",300,60,l2,21.544347,"{'vect__ngram_range': (1, 1), 'vect__max_featu...",0.853778,0.84717,0.849756,0.838839,0.860672,0.850043,0.007222,2
39,42.016662,1.504122,2.994632,0.116347,"(1, 2)",200,60,l2,21.544347,"{'vect__ngram_range': (1, 2), 'vect__max_featu...",0.842861,0.833668,0.832519,0.822465,0.848607,0.836024,0.009019,4


In [9]:
from sklearn.svm import LinearSVC

# Baseline linear-SVM pipeline: apart from the seed, every step keeps its
# scikit-learn defaults, so this report is the reference point for the tuned
# model. random_state is pinned on the randomized SVD and on the classifier
# so the report is reproducible across kernel restarts.
svc_clf = Pipeline([
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("svd", TruncatedSVD(random_state=42)),
    ("svc_clf", LinearSVC(random_state=42)),
])

# Fit on the training split
svc_clf.fit(x_train, y_train)

# Predict the held-out test split
predicted = svc_clf.predict(x_test)

# Per-class precision / recall / F1; the report also contains the overall
# accuracy, so no separate accuracy expression is needed.
print(metrics.classification_report(y_test, predicted))

# Last expression of the cell: displayed as the cell output
metrics.confusion_matrix(y_test, predicted)

              precision    recall  f1-score   support

   left-wing       0.55      0.37      0.45      2097
  right-wing       0.55      0.72      0.62      2255

    accuracy                           0.55      4352
   macro avg       0.55      0.55      0.54      4352
weighted avg       0.55      0.55      0.54      4352



array([[ 783, 1314],
       [ 633, 1622]])

In [12]:
# Tuned linear-SVM pipeline, using the hyper-parameters of the rank-1 row of
# svc_reg_cv_results: unigrams + bigrams, 400 features, 45 SVD components,
# L2 penalty, C = 0.046416 (the value listed in the search results).
# random_state is pinned on the randomized SVD and on the classifier so the
# report is reproducible across kernel restarts.
svc_opt_clf = Pipeline([
    ("vect", CountVectorizer(max_features=400, ngram_range=(1, 2))),
    ("tfidf", TfidfTransformer()),
    ("svd", TruncatedSVD(n_components=45, random_state=42)),
    ("svc_clf", LinearSVC(penalty="l2", C=0.046416, random_state=42)),
])

# Fit on the training split
svc_opt_clf.fit(x_train, y_train)

# Predict the held-out test split
predicted = svc_opt_clf.predict(x_test)

# Per-class precision / recall / F1; the report also contains the overall
# accuracy, so no separate accuracy expression is needed.
print(metrics.classification_report(y_test, predicted))

# Last expression of the cell: displayed as the cell output
metrics.confusion_matrix(y_test, predicted)

              precision    recall  f1-score   support

   left-wing       0.86      0.83      0.85      2097
  right-wing       0.85      0.88      0.86      2255

    accuracy                           0.85      4352
   macro avg       0.85      0.85      0.85      4352
weighted avg       0.85      0.85      0.85      4352



array([[1736,  361],
       [ 274, 1981]])

In [13]:
# Read the saved k-nearest-neighbours search results and keep the rows whose
# rank_test_score is 5 or better.
knn_reg_cv_results = bi.blosc_read("./data/knn_grid_search_result.dat")
knn_top5 = knn_reg_cv_results[knn_reg_cv_results["rank_test_score"].le(5)]

# Last expression of the cell: rendered as the table below
knn_top5

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_vect__ngram_range,param_vect__max_features,param_svd__n_components,param_knn_clf__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,7.708155,0.123026,2.061593,0.06522,"(1, 1)",400,60,5,"{'vect__ngram_range': (1, 1), 'vect__max_featu...",0.858661,0.861247,0.856076,0.850618,0.863545,0.858029,0.00447,1
7,38.017391,0.497011,3.561517,0.061605,"(1, 2)",300,45,6,"{'vect__ngram_range': (1, 2), 'vect__max_featu...",0.845734,0.855214,0.853203,0.838552,0.848319,0.848205,0.005889,4
10,8.183979,0.187956,2.308006,0.287278,"(1, 1)",300,45,5,"{'vect__ngram_range': (1, 1), 'vect__max_featu...",0.849181,0.852054,0.853778,0.843149,0.845159,0.848664,0.004015,3
27,38.280288,0.23509,3.726631,0.112006,"(1, 2)",400,45,6,"{'vect__ngram_range': (1, 2), 'vect__max_featu...",0.846021,0.851767,0.858949,0.851767,0.856938,0.853088,0.00453,2
35,8.004029,0.397554,2.160517,0.036752,"(1, 1)",300,60,6,"{'vect__ngram_range': (1, 1), 'vect__max_featu...",0.852629,0.849469,0.852341,0.836541,0.842861,0.846768,0.006206,5


In [14]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline k-nearest-neighbours pipeline: apart from the seed, every step
# keeps its scikit-learn defaults, so this report is the reference point for
# the tuned model. The randomized SVD is the only seeded step here;
# random_state is pinned so the report is reproducible across kernel restarts.
knn_clf = Pipeline([
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("svd", TruncatedSVD(random_state=42)),
    ("knn_clf", KNeighborsClassifier()),
])

# Fit on the training split
knn_clf.fit(x_train, y_train)

# Predict the held-out test split
predicted = knn_clf.predict(x_test)

# Per-class precision / recall / F1; the report also contains the overall
# accuracy, so no separate accuracy expression is needed.
print(metrics.classification_report(y_test, predicted))

# Last expression of the cell: displayed as the cell output
metrics.confusion_matrix(y_test, predicted)

              precision    recall  f1-score   support

   left-wing       0.54      0.52      0.53      2097
  right-wing       0.57      0.59      0.58      2255

    accuracy                           0.56      4352
   macro avg       0.55      0.55      0.55      4352
weighted avg       0.55      0.56      0.55      4352



array([[1092, 1005],
       [ 930, 1325]])

In [15]:
# Tuned k-nearest-neighbours pipeline, using the hyper-parameters of the
# rank-1 row of knn_reg_cv_results: unigrams only, 400 features,
# 60 SVD components, 5 neighbours.
# The randomized SVD is the only seeded step here; random_state is pinned so
# the report is reproducible across kernel restarts.
knn_opt_clf = Pipeline([
    ("vect", CountVectorizer(max_features=400, ngram_range=(1, 1))),
    ("tfidf", TfidfTransformer()),
    ("svd", TruncatedSVD(n_components=60, random_state=42)),
    ("knn_clf", KNeighborsClassifier(n_neighbors=5)),
])

# Fit on the training split
knn_opt_clf.fit(x_train, y_train)

# Predict the held-out test split
predicted = knn_opt_clf.predict(x_test)

# Per-class precision / recall / F1; the report also contains the overall
# accuracy, so no separate accuracy expression is needed.
print(metrics.classification_report(y_test, predicted))

# Last expression of the cell: displayed as the cell output
metrics.confusion_matrix(y_test, predicted)

              precision    recall  f1-score   support

   left-wing       0.87      0.82      0.85      2097
  right-wing       0.84      0.89      0.87      2255

    accuracy                           0.86      4352
   macro avg       0.86      0.86      0.86      4352
weighted avg       0.86      0.86      0.86      4352



array([[1724,  373],
       [ 248, 2007]])

In [16]:
# Read the saved random-forest search results and keep the rows whose
# rank_test_score is 5 or better.
forest_cv_results = bi.blosc_read("./data/forest_grid_search_result.dat")
forest_top5 = forest_cv_results[forest_cv_results["rank_test_score"].le(5)]

# Last expression of the cell: rendered as the table below
forest_top5

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_vect__ngram_range,param_vect__max_features,param_svd__n_components,param_forest_clf__max_features,param_forest_clf__max_depth,param_forest_clf__criterion,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,44.538229,0.430227,3.097426,0.044942,"(1, 2)",400,60,log2,15,entropy,"{'vect__ngram_range': (1, 2), 'vect__max_featu...",0.874749,0.860959,0.860959,0.851479,0.871876,0.864005,0.0084,3
11,11.257121,0.414312,1.563059,0.056807,"(1, 1)",400,60,log2,15,gini,"{'vect__ngram_range': (1, 1), 'vect__max_featu...",0.873025,0.863258,0.859523,0.851192,0.868716,0.863143,0.007545,5
25,44.256173,0.485781,3.049271,0.055533,"(1, 2)",400,60,log2,35,entropy,"{'vect__ngram_range': (1, 2), 'vect__max_featu...",0.871014,0.85981,0.862396,0.85665,0.87992,0.865958,0.008457,1
29,43.861682,0.323839,3.080161,0.09397,"(1, 2)",400,30,sqrt,25,entropy,"{'vect__ngram_range': (1, 2), 'vect__max_featu...",0.869578,0.859236,0.862683,0.856076,0.872738,0.864062,0.00624,2
32,12.100364,0.267698,1.614859,0.105144,"(1, 1)",400,45,sqrt,15,gini,"{'vect__ngram_range': (1, 1), 'vect__max_featu...",0.870152,0.860098,0.855214,0.856938,0.874174,0.863315,0.007501,4


In [17]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random-forest pipeline: apart from the seed, every step keeps its
# scikit-learn defaults, so this report is the reference point for the tuned
# model. random_state is pinned on the randomized SVD and on the forest
# so the report is reproducible across kernel restarts.
forest_clf = Pipeline([
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("svd", TruncatedSVD(random_state=42)),
    ("forest_clf", RandomForestClassifier(random_state=42)),
])

# Fit on the training split
forest_clf.fit(x_train, y_train)

# Predict the held-out test split
predicted = forest_clf.predict(x_test)

# Per-class precision / recall / F1; the report also contains the overall
# accuracy, so no separate accuracy expression is needed.
print(metrics.classification_report(y_test, predicted))

# Last expression of the cell: displayed as the cell output
metrics.confusion_matrix(y_test, predicted)

              precision    recall  f1-score   support

   left-wing       0.55      0.54      0.54      2097
  right-wing       0.58      0.58      0.58      2255

    accuracy                           0.56      4352
   macro avg       0.56      0.56      0.56      4352
weighted avg       0.56      0.56      0.56      4352



array([[1142,  955],
       [ 952, 1303]])

In [19]:
# Tuned random-forest pipeline, using the hyper-parameters of the rank-1 row
# of forest_cv_results: unigrams + bigrams, 400 features, 60 SVD components,
# max_features="log2", max_depth=35, entropy criterion.
# random_state is pinned on the randomized SVD and on the forest so the
# report is reproducible across kernel restarts.
forest_opt_clf = Pipeline([
    ("vect", CountVectorizer(max_features=400, ngram_range=(1, 2))),
    ("tfidf", TfidfTransformer()),
    ("svd", TruncatedSVD(n_components=60, random_state=42)),
    ("forest_clf", RandomForestClassifier(
        max_features="log2",
        max_depth=35,
        criterion="entropy",
        random_state=42,
    )),
])

# Fit on the training split
forest_opt_clf.fit(x_train, y_train)

# Predict the held-out test split
predicted = forest_opt_clf.predict(x_test)

# Per-class precision / recall / F1; the report also contains the overall
# accuracy, so no separate accuracy expression is needed.
print(metrics.classification_report(y_test, predicted))

# Last expression of the cell: displayed as the cell output
metrics.confusion_matrix(y_test, predicted)

              precision    recall  f1-score   support

   left-wing       0.88      0.86      0.87      2097
  right-wing       0.87      0.89      0.88      2255

    accuracy                           0.88      4352
   macro avg       0.88      0.88      0.88      4352
weighted avg       0.88      0.88      0.88      4352



array([[1796,  301],
       [ 239, 2016]])