In [1]:
import pandas as pd
import scripts.data_pull as dp
import numpy as np

In [None]:
# Load the dataset registered under the name "combined_corpus" through the
# project's data_pull helper.
# NOTE(review): presumably this returns a pandas DataFrame exposing `text`
# and `art_bias` columns — confirm against scripts.data_pull.
full_corpus = dp.data_pull("combined_corpus")

In [None]:
# Separate the model input (`text`) from the prediction target (`art_bias`).
x_text, y_bias = full_corpus.text, full_corpus.art_bias

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation. The fixed random_state makes
# the shuffle (and therefore every score below) repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_text,
    y_bias,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Baseline text classifier: TF-IDF features (English stop words removed,
# vocabulary capped at 200 terms) feeding a logistic regression that uses
# the library's default settings.
text_clf = Pipeline(
    steps=[
        (
            'vect',
            TfidfVectorizer(stop_words='english', max_features=200),
        ),
        (
            'clf',
            LogisticRegression(),
        ),
    ]
)

In [None]:
# Fit the vectorizer and the classifier on the training split only.
text_clf.fit(X=x_train, y=y_train)

In [None]:
# Predict labels for the held-out texts with the baseline pipeline.
predicted = text_clf.predict(x_test)

# Accuracy: fraction of held-out samples whose predicted label matches.
(predicted == y_test).mean()

In [None]:
from sklearn import metrics

# Per-class precision / recall / F1 for the baseline predictions.
print(metrics.classification_report(y_true=y_test, y_pred=predicted))

In [None]:
# Confusion matrix for the baseline pipeline (rows: true labels,
# columns: predicted labels).
metrics.confusion_matrix(y_true=y_test, y_pred=predicted)

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space over the vectorizer step: unigrams only vs. unigrams+bigrams,
# and an unrestricted vocabulary vs. the 200-term cap used by the baseline.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'vect__max_features': (None, 200),
}

# Exhaustive search with 5-fold cross-validation, using every available core.
# Nothing is fitted yet; this only configures the search.
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Run the grid search on the training split.
gs_clf.fit(X=x_train, y=y_train)

In [None]:
# Predict the held-out labels with the fitted grid search object.
gs_predicted = gs_clf.predict(x_test)

# Accuracy of the tuned model on the held-out split.
(gs_predicted == y_test).mean()

In [None]:
# Tabulate the grid search's cross-validation results for inspection.
gs_results = pd.DataFrame(data=gs_clf.cv_results_)

In [None]:
# Show the cross-validation results table built from gs_clf.cv_results_.
gs_results

In [None]:
# Confusion matrix for the grid-searched model (rows: true labels,
# columns: predicted labels).
metrics.confusion_matrix(y_true=y_test, y_pred=gs_predicted)

In [None]:
# Pair each learned logistic-regression coefficient with the vocabulary term
# it belongs to (one column per TF-IDF feature).
# NOTE(review): this inspects the baseline `text_clf`, not the tuned
# `gs_clf` — confirm that is the model intended for interpretation.
coef_values = pd.DataFrame(
    data=text_clf.named_steps['clf'].coef_,
    columns=text_clf.named_steps['vect'].get_feature_names_out(),
)