In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD

from sklearn import metrics

import scripts.blosc_interface as bi
import scripts.corpus_split as cs

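References:

- Adding words to the stop-word list of sklearn's TfidfVectorizer: https://stackoverflow.com/questions/26826002/adding-words-to-stop-words-list-in-tfidfvectorizer-in-sklearn
- Printing the progress of a list comprehension in Python: https://stackoverflow.com/questions/50756085/how-to-print-the-progress-of-a-list-comprehension-in-python
- Most space-efficient way to compress serialized Python data: https://stackoverflow.com/questions/57983431/whats-the-most-space-efficient-way-to-compress-serialized-python-data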

In [2]:
# Read the corpus from disk through the project's blosc_interface helper.
# NOTE(review): the file name suggests the corpus is already tokenized — confirm.
full_corpus = bi.blosc_read("./data/tokenized_corpus.dat")

: 

: 

In [None]:
# Split the corpus into train/test inputs and labels with the project's corpus_split helper.
# NOTE(review): the split ratio and any shuffling/seed live in scripts.corpus_split —
# confirm the split is seeded so the metrics below are reproducible.
x_train, x_test, y_train, y_test = cs.corpus_split(full_corpus)

In [None]:
# Saved grid-search results for the logistic-regression pipeline;
# display only the rows whose test-score rank is 5 or better.
log_results_path = "./data/log_grid_search_result.dat"
log_reg_cv_results = bi.blosc_read(log_results_path)
log_reg_cv_results.query("rank_test_score <= 5")

In [None]:
from sklearn.linear_model import LogisticRegression

# Pipeline: unigram+bigram counts -> TF-IDF -> truncated SVD -> L1 logistic regression.
# random_state is pinned on the two stochastic steps (randomized SVD and the saga
# solver) so that Restart & Run All reproduces the same scores and SVD spectrum.
# NOTE(review): the hyper-parameters look like they were taken from the grid-search
# results loaded earlier in the notebook — confirm they match the top-ranked row.
log_clf = Pipeline([
    ("vect", CountVectorizer(max_features=400, ngram_range=(1, 2))),
    ("tfidf", TfidfTransformer()),
    ("svd", TruncatedSVD(n_components=60, random_state=42)),
    # saga is used because it supports the L1 (LASSO) penalty.
    (
        "log_clf",
        LogisticRegression(
            penalty="l1",
            solver="saga",
            C=21.544347,
            max_iter=1000,
            random_state=42,
        ),
    ),
])



In [None]:
# Fit the logistic-regression pipeline and evaluate it on the held-out split.
log_clf.fit(x_train, y_train)

predicted = log_clf.predict(x_test)

# Print the accuracy explicitly: a bare expression in the middle of a cell is
# evaluated and thrown away, so the original value was never shown.
print(f"Accuracy: {np.mean(predicted == y_test):.4f}")

print(metrics.classification_report(y_test, predicted))

# Last expression of the cell, so the confusion matrix is displayed as the output.
metrics.confusion_matrix(y_test, predicted)

In [None]:
# Number of input features seen by the SVD step when the pipeline was fitted.
log_clf.named_steps["svd"].n_features_in_

Source of the plotting code below (scikit-learn pipeline example): https://scikit-learn.org/stable/auto_examples/compose/plot_digits_pipe.html#sphx-glr-auto-examples-compose-plot-digits-pipe-py

In [None]:
# Taken from Sklearn's Auto Examples (plot_digits_pipe) and adapted to TruncatedSVD.
# Top panel: explained-variance spectrum of the fitted SVD step.
# Bottom panel: best mean CV score per n_components from the grid-search results.
import matplotlib.pyplot as plt

# Look the step up by name rather than by position so the cell keeps working
# if the pipeline steps are reordered.
svd_step = log_clf.named_steps["svd"]

fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(6, 6))
ax0.plot(
    np.arange(1, svd_step.n_components + 1),
    svd_step.explained_variance_ratio_,
    "+",
    linewidth=2,
)
# The decomposition here is TruncatedSVD, not PCA (label inherited from the example).
ax0.set_ylabel("Truncated SVD explained variance ratio")

ax0.axvline(
    svd_step.n_components,
    linestyle=":",
    label="n_components chosen",
)
ax0.legend(prop=dict(size=12))

# For each number of components, find the best classifier results.
# groupby(...).apply(nlargest) depends on the grouping column staying inside the
# applied frame, which newer pandas deprecates; selecting the arg-max row per
# group keeps every column (including components_col) on any pandas version.
results = log_reg_cv_results
components_col = "param_svd__n_components"
scored = results.dropna(subset=["mean_test_score"])  # failed fits have no score to plot
best_rows = scored.groupby(components_col)["mean_test_score"].idxmax()
best_clfs = scored.loc[best_rows]

best_clfs.plot(
    x=components_col, y="mean_test_score", yerr="std_test_score", legend=False, ax=ax1
)
ax1.set_ylabel("Classification accuracy (val)")
ax1.set_xlabel("n_components")

# The x axis is shared, so this limit applies to both panels.
plt.xlim(-1, 70)

plt.tight_layout()
plt.show()


In [None]:
# Saved grid-search results for the linear-SVC pipeline;
# display only the rows whose test-score rank is 5 or better.
svc_results_path = "./data/svc_grid_search_result.dat"
svc_reg_cv_results = bi.blosc_read(svc_results_path)
svc_reg_cv_results.query("rank_test_score <= 5")

In [None]:
from sklearn.svm import LinearSVC

# Pipeline: token counts -> TF-IDF -> truncated SVD -> linear SVC.
# random_state is pinned on the stochastic steps so re-running the notebook
# reproduces the same evaluation numbers.
# NOTE(review): every step uses library defaults; the grid-search results loaded
# earlier in the notebook are not applied here — confirm that is intentional.
svc_clf = Pipeline([
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("svd", TruncatedSVD(random_state=42)),
    ("svc_clf", LinearSVC(random_state=42)),
])

In [None]:
# Fit the linear-SVC pipeline and evaluate it on the held-out split.
svc_clf.fit(x_train, y_train)

predicted = svc_clf.predict(x_test)

# Print the accuracy explicitly: a bare expression in the middle of a cell is
# evaluated and thrown away, so the original value was never shown.
print(f"Accuracy: {np.mean(predicted == y_test):.4f}")

print(metrics.classification_report(y_test, predicted))

# Last expression of the cell, so the confusion matrix is displayed as the output.
metrics.confusion_matrix(y_test, predicted)

In [None]:
# Saved grid-search results for the k-nearest-neighbours pipeline;
# display only the rows whose test-score rank is 5 or better.
knn_results_path = "./data/knn_grid_search_result.dat"
knn_reg_cv_results = bi.blosc_read(knn_results_path)
knn_reg_cv_results.query("rank_test_score <= 5")

In [None]:
# KNeighborsClassifier was never imported anywhere in the notebook, so this cell
# raised NameError on a fresh kernel; import it here like the other model cells do.
from sklearn.neighbors import KNeighborsClassifier

# Pipeline: token counts -> TF-IDF -> truncated SVD -> k-nearest-neighbours.
# random_state is pinned on the randomized SVD so re-running the notebook
# reproduces the same evaluation numbers.
# NOTE(review): every step uses library defaults; the grid-search results loaded
# earlier in the notebook are not applied here — confirm that is intentional.
knn_clf = Pipeline([
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("svd", TruncatedSVD(random_state=42)),
    ("knn_clf", KNeighborsClassifier()),
])

In [None]:
# Fit the k-nearest-neighbours pipeline and evaluate it on the held-out split.
knn_clf.fit(x_train, y_train)

predicted = knn_clf.predict(x_test)

# Print the accuracy explicitly: a bare expression in the middle of a cell is
# evaluated and thrown away, so the original value was never shown.
print(f"Accuracy: {np.mean(predicted == y_test):.4f}")

print(metrics.classification_report(y_test, predicted))

# Last expression of the cell, so the confusion matrix is displayed as the output.
metrics.confusion_matrix(y_test, predicted)