In [1]:
# Reference: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
# Following work I submitted in HW3 

In [13]:
import pandas as pd
import numpy as np
import matplotlib
import datetime
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

## Load data

In [3]:
# Read the train and test CSVs into DataFrames (train first, then test).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/svm_data/train.csv", "../data/svm_data/test.csv")
)

In [6]:
# Model inputs come from the 'defn' column; labels come from the 'cat' column.
X_train, y_train = train['defn'], train['cat']
X_test, y_test = test['defn'], test['cat']

## Logistic comparison loop

In [10]:
# Grid search over the CountVectorizer settings max_df, min_df and lowercase.
# For every combination: count-vectorize, apply TF-IDF, fit a logistic
# regression, and record micro/macro F1 on both the train and test sets.
def _log_step(prefix):
    # to print time: https://www.geeksforgeeks.org/get-current-date-and-time-using-python/
    print(prefix + str(datetime.datetime.now()))


result_columns = ["type", "max_df", "min_df", "lower", "vocab_size",
                  "train_micro_f1", "train_macro_f1", "test_micro_f1", "test_macro_f1"]

# Same iteration order as three nested loops: max_df outermost, lower innermost.
param_grid = [(mx, mn, lc)
              for mx in [1.0, .1, .01]
              for mn in [1, 3, 5]
              for lc in [True, False]]

results = []
for max_df, min_df, lower in param_grid:
    print("Working on: max_df = " + str(max_df) + " min_df = " + str(min_df) + " lower = " + str(lower))

    # raw counts
    _log_step("Count vectorizing at ")
    cv = CountVectorizer(max_df = max_df, min_df = min_df, lowercase = lower)
    X_train_counts = cv.fit_transform(X_train)
    X_test_counts = cv.transform(X_test)
    # number of columns in the count matrix, recorded for tracking
    vocab_size = X_train_counts.shape[1]

    # tf-idf version
    _log_step("TF-IDF vectorizing at ")
    tv = TfidfTransformer()
    X_train_tfidf = tv.fit_transform(X_train_counts)
    X_test_tfidf = tv.transform(X_test_counts)

    # fit logistic regression on tf-idf version
    _log_step("Fitting TF-IDF count model at ")
    model = LogisticRegression(random_state = 771, max_iter = 1000)
    model.fit(X_train_tfidf, y_train)

    # evaluate
    _log_step("Evaluating TF-IDF model at ")
    tf_y_train_pred = model.predict(X_train_tfidf)
    tf_y_test_pred = model.predict(X_test_tfidf)

    tf_train_f1_micro, tf_train_f1_macro = (
        f1_score(y_train, tf_y_train_pred, average = avg) for avg in ('micro', 'macro')
    )
    tf_test_f1_micro, tf_test_f1_macro = (
        f1_score(y_test, tf_y_test_pred, average = avg) for avg in ('micro', 'macro')
    )

    # record
    curr_tfidf_results = ("tfidf", max_df, min_df, lower, vocab_size,
                          tf_train_f1_micro, tf_train_f1_macro,
                          tf_test_f1_micro, tf_test_f1_macro)
    print(curr_tfidf_results)
    results.append(curr_tfidf_results)
    print("--------------------------------------------------------")


# One row per parameter combination, written out for later inspection.
full_results = pd.DataFrame(results, columns = result_columns)
full_results.to_csv("../models/logistic_results.csv", index = False)

Working on: max_df = 1.0 min_df = 1 lower = True
Count vectorizing at 2020-11-22 12:44:36.390843
TF-IDF vectorizing at 2020-11-22 12:44:56.463881
Fitting TF-IDF count SVM at 2020-11-22 12:44:58.065212
Evaluating TF-IDF SVM at 2020-11-22 12:53:15.500618
('tfidf', 1.0, 1, True, 314448, 0.551372681132607, 0.5512513477651412, 0.4040835261695817, 0.4038787504594819)
--------------------------------------------------------
Working on: max_df = 1.0 min_df = 1 lower = False
Count vectorizing at 2020-11-22 12:53:26.104429
TF-IDF vectorizing at 2020-11-22 12:53:47.326347
Fitting TF-IDF count SVM at 2020-11-22 12:53:49.165296
Evaluating TF-IDF SVM at 2020-11-22 13:01:16.730051
('tfidf', 1.0, 1, False, 403015, 0.572992892212361, 0.5728954083306359, 0.40474073419525153, 0.4046529784400566)
--------------------------------------------------------
Working on: max_df = 1.0 min_df = 3 lower = True
Count vectorizing at 2020-11-22 13:01:27.576179
TF-IDF vectorizing at 2020-11-22 13:01:46.428601
Fitting T

In [38]:
# 'of' = train micro-F1 minus test micro-F1 (presumably short for "overfit" -- confirm).
full_results['of'] = full_results['train_micro_f1'].sub(full_results['test_micro_f1'])

In [39]:
# Display the results ordered by the 'of' column, largest value first.
full_results.sort_values(by = "of", ascending = False)

Unnamed: 0,type,max_df,min_df,lower,vocab_size,train_micro_f1,train_macro_f1,test_micro_f1,test_macro_f1,of
13,tfidf,0.01,1,False,402745,0.583787,0.583717,0.400626,0.400641,0.183161
7,tfidf,0.1,1,False,402993,0.575869,0.575789,0.404039,0.40401,0.17183
1,tfidf,1.0,1,False,403015,0.572993,0.572895,0.404741,0.404653,0.168252
12,tfidf,0.01,1,True,314181,0.56084,0.560731,0.399792,0.399652,0.161048
6,tfidf,0.1,1,True,314425,0.554127,0.554027,0.40389,0.403711,0.150237
0,tfidf,1.0,1,True,314448,0.551373,0.551251,0.404084,0.403879,0.147289
15,tfidf,0.01,3,False,112898,0.527651,0.527552,0.399489,0.399392,0.128162
9,tfidf,0.1,3,False,113146,0.525875,0.525757,0.403918,0.40379,0.121958
3,tfidf,1.0,3,False,113168,0.525105,0.524962,0.404315,0.404164,0.12079
17,tfidf,0.01,5,False,78756,0.512861,0.512732,0.399235,0.399116,0.113627


In [14]:
# best params are max_df = 1.0, min_df = 1, lowercase = False
cv = CountVectorizer(max_df = 1.0, min_df = 1, lowercase = False)
X_train_counts, X_test_counts = cv.fit_transform(X_train), cv.transform(X_test)

# tf-idf version
print("TF-IDF vectorizing at " + str(datetime.datetime.now()))
tv = TfidfTransformer()
X_train_tfidf, X_test_tfidf = tv.fit_transform(X_train_counts), tv.transform(X_test_counts)

# fit logistic regression on tf-idf version
print("Fitting TF-IDF count model at " + str(datetime.datetime.now()))
model = LogisticRegression(random_state = 771, max_iter = 1000).fit(X_train_tfidf, y_train)

# evaluate: predictions first, then micro/macro F1 for train and test
print("Evaluating TF-IDF model at " + str(datetime.datetime.now()))
tf_y_train_pred = model.predict(X_train_tfidf)
tf_y_test_pred = model.predict(X_test_tfidf)

tf_train_f1_micro, tf_train_f1_macro = (
    f1_score(y_train, tf_y_train_pred, average = avg) for avg in ('micro', 'macro')
)
tf_test_f1_micro, tf_test_f1_macro = (
    f1_score(y_test, tf_y_test_pred, average = avg) for avg in ('micro', 'macro')
)

# persist the fitted classifier
with open('../models/best_logistic.p', 'wb') as model_file:
    pickle.dump(model, model_file)



TF-IDF vectorizing at 2020-11-22 18:12:16.869536
Fitting TF-IDF count model at 2020-11-22 18:12:18.981698
Evaluating TF-IDF model at 2020-11-22 18:20:08.319788


In [23]:
# Persist the fitted count vectorizer and TF-IDF transformer, in that order.
for artifact_path, artifact in (('../models/logistic_cv.p', cv),
                                ('../models/logistic_tfidf.p', tv)):
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

## Look at accuracy by class

In [15]:
# Pair each test-set prediction with its true label, one row per test example.
by_class = pd.DataFrame({"pred": tf_y_test_pred, "true": y_test})

In [16]:
# Flag the rows where the predicted label equals the true label.
by_class["correct"] = by_class["pred"].eq(by_class["true"])

In [18]:
# https://stackoverflow.com/questions/23377108/pandas-percentage-of-total-with-groupby
# Percentage of correct / incorrect predictions within each true class.
# The per-class totals come from a transform, which keeps the (true, correct)
# index unchanged on every pandas version. groupby(level=0).apply(...) with a
# like-indexed result prepends the group key a second time on pandas >= 2.0.
class_counts = by_class.groupby(["true", "correct"]).size()
100 * class_counts / class_counts.groupby(level=0).transform("sum")

true           correct
bad            False      55.153074
               True       44.846926
controversial  False      65.569661
               True       34.430339
good           False      57.604056
               True       42.395944
dtype: float64

## Evaluate on the BERT test set

In [42]:
# Load the evaluation set; index_col = 0 uses the first CSV column as the row index.
bert_test = pd.read_csv("../data/final_eval_set.csv", index_col = 0)

In [43]:
# Prefix the existing prediction columns with 'bert_' so the logistic
# columns added later cannot be confused with them.
bert_test = bert_test.rename(columns = {'pred': 'bert_pred', 'match': 'bert_match'})

In [44]:
# Preview the first rows after the column rename.
bert_test.head()

Unnamed: 0,bert_pred,true,text,bert_match
0,0,0,"One nickname for the city of Columbus, Ohio. C...",True
1,1,1,-adjective ;; used to denote an ignorant or fo...,True
2,2,0,someone that is really stupid.,False
3,2,2,Nasty pieces of poopoo that just will not come...,True
4,0,2,A hardedend clump of fecal matter attached to ...,False


In [45]:
# Map each string category to its integer code: good -> 0, controversial -> 1, bad -> 2.
int_cats = {label: code for code, label in enumerate(["good", "controversial", "bad"])}

In [47]:
# Run the evaluation-set text through the fitted count -> TF-IDF pipeline and
# predict with the logistic model.
X_bert_test = tv.transform(cv.transform(bert_test['text']))
y_pred_bert_test = model.predict(X_bert_test)
# The predictions are positional, but column assignment aligns on index labels.
# Build the Series on bert_test's own index (read from the CSV) so each
# prediction lands on its own row even if that index is not 0..n-1.
bert_test['log_pred'] = pd.Series(y_pred_bert_test, index = bert_test.index).map(int_cats)
# String labels are mapped to integer codes above so they compare against 'true'.
bert_test['log_match'] = bert_test['log_pred'] == bert_test['true']

In [48]:
# Fraction of rows per 'log_match' value, rounded to 3 decimals.
match_counts = bert_test.groupby('log_match').size()
(match_counts / match_counts.sum()).round(3)

log_match
False    0.6
True     0.4
dtype: float64

In [50]:
# Preview the first rows, now including the 'log_pred' and 'log_match' columns.
bert_test.head()

Unnamed: 0,bert_pred,true,text,bert_match,log_pred,log_match
0,0,0,"One nickname for the city of Columbus, Ohio. C...",True,1,False
1,1,1,-adjective ;; used to denote an ignorant or fo...,True,1,True
2,2,0,someone that is really stupid.,False,2,False
3,2,2,Nasty pieces of poopoo that just will not come...,True,2,True
4,0,2,A hardedend clump of fecal matter attached to ...,False,2,True


In [49]:
# NOTE(review): this writes to the same path the frame was read from, so the
# original file is overwritten with the renamed and added columns.
bert_test.to_csv("../data/final_eval_set.csv")

## Example prediction 

In [39]:
# Display the array of predicted labels for the test set.
tf_y_test_pred

array(['bad', 'controversial', 'controversial', ..., 'controversial',
       'good', 'bad'], dtype=object)

In [36]:
def pred_ex(text, count_vectorizer, tfidf_vectorizer, model):
    """Print the label the model predicts for a single piece of text.

    The text is wrapped in a one-element list and passed through
    ``count_vectorizer.transform`` and then ``tfidf_vectorizer.transform``.
    The first element of ``model.predict`` on that result is printed.
    Nothing is returned.
    """
    term_counts = count_vectorizer.transform([text])
    features = tfidf_vectorizer.transform(term_counts)
    label = model.predict(features)[0]
    print(label)


In [41]:
# Spot-check the fitted pipeline on a few hand-written definitions.
# The classifier fitted in this notebook is bound to `model`. The name
# `svm_tf` used here before is never defined in this notebook, so on a fresh
# kernel these calls raised NameError.
example_definitions = [
    "this could be a very good definition",
    "nonsense nonsense yada yada",
    "this is an example of a definition with interesting words and content like a car and a house",
    "merrily we roll along",
    "rudolph the red-nosed reindeer had a very shiny nose",
]
for example_text in example_definitions:
    pred_ex(example_text, cv, tv, model)

bad
bad
bad
controversial
bad
