In [1]:
# Following: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
# Adapted from homework 3 for Text Analytics course

In [46]:
import pandas as pd
import numpy as np
import pickle
import matplotlib
import datetime

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

## Load and split data

In [3]:
# load data
data = pd.read_csv('../data/clean_data.csv')

In [4]:
data.columns

Index(['word_id', 'word', 'up_votes', 'down_votes', 'author', 'definition',
       'total_votes', 'good_ratio', 'cat', 'char_len', 'word_len',
       'word_len_bucket'],
      dtype='object')

In [5]:
# split into train/test - need to use tokenizer trained on train only
defn = data[['definition','cat']]
train = defn.sample(frac = .8, random_state = 771)
test = defn[~defn.index.isin(train.index)]
X_train = train['definition']
y_train = train['cat']
X_test = test['definition']
y_test = test['cat']

## Examine raw counts
Get a sense of word frequencies

In [6]:
# first examine the raw counts
# count_vect = CountVectorizer()
# X_train_counts = count_vect.fit_transform(X_train)

In [7]:
# follow: https://medium.com/@cristhianboujon/how-to-list-the-most-common-words-from-text-corpus-using-scikit-learn-dad4d0cab41d
# to get most common words
# sum_words = X_train_counts.sum(axis=0)
# words_freq = [(word, sum_words[0, idx]) for word, idx in count_vect.vocabulary_.items()]

In [8]:
# X_train_counts.shape[1]

In [9]:
# len(train)

In [10]:
# freq = pd.DataFrame(words_freq, columns = ["word","freq"])

In [11]:
# freq.sort_values(by = "freq", ascending = False).head(50)

## SVM comparison loop

In [13]:
# test combinations of: min_df, max_df, lower for tokenizer
results = []
for max_df in [.1, .05, .025, .01]:
    for min_df in [3, 5]:
        for lower in [True, False]:
            for dual in [True, False]:
                print("Working on: max_df = " + str(max_df) + " min_df = " + str(min_df) + " lower = " + str(lower) + " dual = " + str(dual))
                # raw counts
                # to print time: https://www.geeksforgeeks.org/get-current-date-and-time-using-python/
                print("Count vectorizing at " + str(datetime.datetime.now()))
                cv = CountVectorizer(max_df = max_df, min_df = min_df, lowercase = lower)
                X_train_counts = cv.fit_transform(X_train)
                X_test_counts = cv.transform(X_test)
                
                # record vocab size for tracking
                vocab_size = X_train_counts.shape[1]
                
                # tf-idf version
                print("TF-IDF vectorizing at " + str(datetime.datetime.now()))
                tv = TfidfTransformer()
                X_train_tfidf = tv.fit_transform(X_train_counts)
                X_test_tfidf = tv.transform(X_test_counts)
                
                # fit an SVM on tf-idf version
                print("Fitting TF-IDF count SVM at " + str(datetime.datetime.now()))
                svm_tf = LinearSVC(dual = dual, random_state = 771, max_iter = 5000)
                svm_tf.fit(X_train_tfidf, y_train)
                
                # evaluate 
                print("Evaluating TF-IDF SVM at " + str(datetime.datetime.now()))
                tf_y_train_pred = svm_tf.predict(X_train_tfidf)
                tf_y_test_pred = svm_tf.predict(X_test_tfidf)
                
                tf_train_f1_micro = f1_score(y_train, tf_y_train_pred, average = 'micro')
                tf_train_f1_macro = f1_score(y_train, tf_y_train_pred, average = 'macro')
                
                tf_test_f1_micro = f1_score(y_test, tf_y_test_pred, average = 'micro')
                tf_test_f1_macro = f1_score(y_test, tf_y_test_pred, average = 'macro')
                
                # record
                curr_tfidf_results = ("tfidf", max_df, min_df, lower, dual, vocab_size, tf_train_f1_micro, 
                                tf_train_f1_macro, tf_test_f1_micro, tf_test_f1_macro)
                print(curr_tfidf_results)
                results.append(curr_tfidf_results)
                print("--------------------------------------------------------")
                
                
full_results = pd.DataFrame(results, columns = ["type", "max_df", "min_df", "lower", "dual", "vocab_size",
                                               "train_micro_f1", "train_macro_f1", "test_micro_f1", "test_macro_f1"])
full_results.to_csv("../models/svm_results.csv", index = False)

Working on: max_df = 0.1 min_df = 3 lower = True dual = True
Count vectorizing at 2020-11-20 20:49:57.796538
TF-IDF vectorizing at 2020-11-20 20:50:16.362684
Fitting TF-IDF count SVM at 2020-11-20 20:50:17.240393
Evaluating TF-IDF SVM at 2020-11-20 20:51:01.483598
('tfidf', 0.1, 3, True, True, 89202, 0.5303295981090081, 0.5300672188390038, 0.39561161767061176, 0.39517904621245187)
--------------------------------------------------------
Working on: max_df = 0.1 min_df = 3 lower = True dual = False
Count vectorizing at 2020-11-20 20:51:12.356171
TF-IDF vectorizing at 2020-11-20 20:51:29.324423
Fitting TF-IDF count SVM at 2020-11-20 20:51:30.224547
Evaluating TF-IDF SVM at 2020-11-20 20:53:17.414606
('tfidf', 0.1, 3, True, False, 89202, 0.5302964615698987, 0.5300339871642753, 0.3956337086966847, 0.3952014867151332)
--------------------------------------------------------
Working on: max_df = 0.1 min_df = 3 lower = False dual = True
Count vectorizing at 2020-11-20 20:53:28.668812
TF-IDF v

('tfidf', 0.025, 3, False, True, 113063, 0.552522795177529, 0.5523998047247449, 0.393938222445587, 0.39363843987049024)
--------------------------------------------------------
Working on: max_df = 0.025 min_df = 3 lower = False dual = False
Count vectorizing at 2020-11-20 21:30:47.625920
TF-IDF vectorizing at 2020-11-20 21:31:05.082967
Fitting TF-IDF count SVM at 2020-11-20 21:31:05.832623
Evaluating TF-IDF SVM at 2020-11-20 21:32:17.237671
('tfidf', 0.025, 3, False, False, 113063, 0.5525214144883994, 0.5523994603646217, 0.3939161314195141, 0.3936170129318292)
--------------------------------------------------------
Working on: max_df = 0.025 min_df = 5 lower = True dual = True
Count vectorizing at 2020-11-20 21:32:28.727224
TF-IDF vectorizing at 2020-11-20 21:32:45.673848
Fitting TF-IDF count SVM at 2020-11-20 21:32:46.377317
Evaluating TF-IDF SVM at 2020-11-20 21:33:25.856149
('tfidf', 0.025, 5, True, True, 62881, 0.5086914380705698, 0.5083425597251457, 0.3952360702273719, 0.3946910

In [14]:
full_results.head()

Unnamed: 0,type,max_df,min_df,lower,dual,vocab_size,train_micro_f1,train_macro_f1,test_micro_f1,test_macro_f1
0,tfidf,0.1,3,True,True,89202,0.53033,0.530067,0.395612,0.395179
1,tfidf,0.1,3,True,False,89202,0.530296,0.530034,0.395634,0.395201
2,tfidf,0.1,3,False,True,113146,0.55269,0.552546,0.394927,0.394605
3,tfidf,0.1,3,False,False,113146,0.552614,0.552471,0.394988,0.394668
4,tfidf,0.1,5,True,True,62963,0.509327,0.508951,0.396782,0.39619


In [15]:
full_results.sort_values(by = "test_micro_f1", ascending = False)

Unnamed: 0,type,max_df,min_df,lower,dual,vocab_size,train_micro_f1,train_macro_f1,test_micro_f1,test_macro_f1
4,tfidf,0.1,5,True,True,62963,0.509327,0.508951,0.396782,0.39619
5,tfidf,0.1,5,True,False,62963,0.509204,0.508826,0.396749,0.396156
13,tfidf,0.05,5,True,False,62937,0.509095,0.508735,0.396004,0.395433
12,tfidf,0.05,5,True,True,62937,0.509113,0.508753,0.395949,0.395374
1,tfidf,0.1,3,True,False,89202,0.530296,0.530034,0.395634,0.395201
0,tfidf,0.1,3,True,True,89202,0.53033,0.530067,0.395612,0.395179
15,tfidf,0.05,5,False,False,78980,0.527481,0.527272,0.395253,0.394863
14,tfidf,0.05,5,False,True,78980,0.527454,0.527244,0.395253,0.394862
21,tfidf,0.025,5,True,False,62881,0.508709,0.508358,0.395247,0.394704
20,tfidf,0.025,5,True,True,62881,0.508691,0.508343,0.395236,0.394691


In [16]:
# add cases where we don't drop any of the most common words

# test combinations of: min_df, max_df, lower for tokenizer
results2 = []
for max_df in [1.0]:
    for min_df in [1, 3, 5]:
        for lower in [True, False]:
            for dual in [True, False]:
                print("Working on: max_df = " + str(max_df) + " min_df = " + str(min_df) + " lower = " + str(lower) + " dual = " + str(dual))
                # raw counts
                # to print time: https://www.geeksforgeeks.org/get-current-date-and-time-using-python/
                print("Count vectorizing at " + str(datetime.datetime.now()))
                cv = CountVectorizer(max_df = max_df, min_df = min_df, lowercase = lower)
                X_train_counts = cv.fit_transform(X_train)
                X_test_counts = cv.transform(X_test)
                # record vocab size for tracking
                vocab_size = X_train_counts.shape[1]
                # tf-idf version
                print("TF-IDF vectorizing at " + str(datetime.datetime.now()))
                tv = TfidfTransformer()
                X_train_tfidf = tv.fit_transform(X_train_counts)
                X_test_tfidf = tv.transform(X_test_counts)
                
                # fit an SVM on tf-idf version
                print("Fitting TF-IDF count SVM at " + str(datetime.datetime.now()))
                svm_tf = LinearSVC(dual = dual, random_state = 771, max_iter = 5000)
                svm_tf.fit(X_train_tfidf, y_train)
                
                # evaluate 
                print("Evaluating TF-IDF SVM at " + str(datetime.datetime.now()))
                tf_y_train_pred = svm_tf.predict(X_train_tfidf)
                tf_y_test_pred = svm_tf.predict(X_test_tfidf)
                
                tf_train_f1_micro = f1_score(y_train, tf_y_train_pred, average = 'micro')
                tf_train_f1_macro = f1_score(y_train, tf_y_train_pred, average = 'macro')
                
                tf_test_f1_micro = f1_score(y_test, tf_y_test_pred, average = 'micro')
                tf_test_f1_macro = f1_score(y_test, tf_y_test_pred, average = 'macro')
                
                # record
                curr_tfidf_results = ("tfidf", max_df, min_df, lower, dual, vocab_size, tf_train_f1_micro, 
                                tf_train_f1_macro, tf_test_f1_micro, tf_test_f1_macro)
                print(curr_tfidf_results)
                results2.append(curr_tfidf_results)
                print("--------------------------------------------------------")
                
full_results2 = full_results.append(pd.DataFrame(results2, columns = ["type", "max_df", "min_df", "lower", "dual", "vocab_size",
                                               "train_micro_f1", "train_macro_f1", "test_micro_f1", "test_macro_f1"]))
full_results2.to_csv("../models/svm_results_with_all_common.csv", index = False)

Working on: max_df = 1.0 min_df = 1 lower = True dual = True
Count vectorizing at 2020-11-20 22:24:29.936490
TF-IDF vectorizing at 2020-11-20 22:24:48.836587
Fitting TF-IDF count SVM at 2020-11-20 22:24:50.413288
Evaluating TF-IDF SVM at 2020-11-20 22:25:44.319926
('tfidf', 1.0, 1, True, True, 314448, 0.6183402459835753, 0.6183218188913892, 0.39657257730478435, 0.3963829330648622)
--------------------------------------------------------
Working on: max_df = 1.0 min_df = 1 lower = True dual = False
Count vectorizing at 2020-11-20 22:25:55.162515
TF-IDF vectorizing at 2020-11-20 22:26:12.331852
Fitting TF-IDF count SVM at 2020-11-20 22:26:13.783596
Evaluating TF-IDF SVM at 2020-11-20 22:29:24.483202
('tfidf', 1.0, 1, True, False, 314448, 0.61826292739232, 0.618244227019647, 0.396517349739602, 0.39632601256600736)
--------------------------------------------------------
Working on: max_df = 1.0 min_df = 1 lower = False dual = True
Count vectorizing at 2020-11-20 22:29:35.474998
TF-IDF vec

In [18]:
full_results2['test_macro_f1'].describe()

count    44.000000
mean      0.394349
std       0.001668
min       0.390676
25%       0.393933
50%       0.394729
75%       0.395371
max       0.396439
Name: test_macro_f1, dtype: float64

In [43]:
full_results2['test_micro_f1'].describe()

count    44.000000
mean      0.394744
std       0.001689
min       0.390939
25%       0.394296
50%       0.395203
75%       0.395786
max       0.397042
Name: test_micro_f1, dtype: float64

In [50]:
full_results2.sort_values(by = 'of', ascending = False)

Unnamed: 0,type,max_df,min_df,lower,dual,vocab_size,train_micro_f1,train_macro_f1,test_micro_f1,test_macro_f1,of
2,tfidf,1.0,1,False,True,403015,0.653397,0.653427,0.395418,0.395348,0.257979
3,tfidf,1.0,1,False,False,403015,0.653291,0.653321,0.395341,0.395272,0.25795
0,tfidf,1.0,1,True,True,314448,0.61834,0.618322,0.396573,0.396383,0.221768
1,tfidf,1.0,1,True,False,314448,0.618263,0.618244,0.396517,0.396326,0.221746
26,tfidf,0.01,3,False,True,112898,0.552001,0.551873,0.390984,0.390717,0.161017
27,tfidf,0.01,3,False,False,112898,0.551957,0.551831,0.390939,0.390676,0.161017
19,tfidf,0.025,3,False,False,113063,0.552521,0.552399,0.393916,0.393617,0.158605
18,tfidf,0.025,3,False,True,113063,0.552523,0.5524,0.393938,0.393638,0.158585
10,tfidf,0.05,3,False,True,113122,0.552878,0.552746,0.394485,0.394181,0.158393
11,tfidf,0.05,3,False,False,113122,0.552851,0.55272,0.39449,0.394188,0.158361


In [49]:
full_results2['of'] = full_results2['train_micro_f1'] - full_results2['test_micro_f1']

In [45]:
# best is 1.0, 5, true, false

cv = CountVectorizer(max_df = 1.0, min_df = 5, lowercase = True)
X_train_counts = cv.fit_transform(X_train)
X_test_counts = cv.transform(X_test)

# record vocab size for tracking
vocab_size = X_train_counts.shape[1]

# tf-idf version
print("TF-IDF vectorizing at " + str(datetime.datetime.now()))
tv = TfidfTransformer()
X_train_tfidf = tv.fit_transform(X_train_counts)
X_test_tfidf = tv.transform(X_test_counts)

# fit an SVM on tf-idf version
print("Fitting TF-IDF count SVM at " + str(datetime.datetime.now()))
model = LinearSVC(dual = False, random_state = 771, max_iter = 5000)
model.fit(X_train_tfidf, y_train)

# evaluate 
print("Evaluating TF-IDF SVM at " + str(datetime.datetime.now()))
tf_y_train_pred = model.predict(X_train_tfidf)
tf_y_test_pred = model.predict(X_test_tfidf)

tf_train_f1_micro = f1_score(y_train, tf_y_train_pred, average = 'micro')
tf_train_f1_macro = f1_score(y_train, tf_y_train_pred, average = 'macro')

tf_test_f1_micro = f1_score(y_test, tf_y_test_pred, average = 'micro')
tf_test_f1_macro = f1_score(y_test, tf_y_test_pred, average = 'macro')



TF-IDF vectorizing at 2020-11-22 19:35:20.515322
Fitting TF-IDF count SVM at 2020-11-22 19:35:22.018314
Evaluating TF-IDF SVM at 2020-11-22 19:38:07.356068


NameError: name 'pickle' is not defined

In [48]:
# save artefacts to file for reuse
with open('../models/best_svm.p', 'wb') as f:
    pickle.dump(model, f)

with open('../models/svm_tfidf.p', 'wb') as f:
    pickle.dump(tv, f)

with open('../models/svm_cv.p', 'wb') as f:
    pickle.dump(cv, f)

## Evaluate on BERT test set

In [55]:
bert_test = pd.read_csv("../data/final_eval_set.csv", index_col = 0)
int_cats = {"good": 0, "controversial": 1, "bad": 2}
X_bert_test = tv.transform(cv.transform(bert_test['text']))
y_pred_bert_test = model.predict(X_bert_test)
bert_test['svm_pred'] = pd.Series(y_pred_bert_test).map(int_cats)
bert_test['svm_match'] = bert_test['svm_pred'] == bert_test['true']

In [56]:
bert_test.groupby('svm_match').size().transform(lambda x: round(x/sum(x),3))

svm_match
False    0.609
True     0.391
dtype: float64

In [57]:
bert_test.head()

Unnamed: 0,bert_pred,true,text,bert_match,log_pred,log_match,svm_pred,svm_match
0,0,0,"One nickname for the city of Columbus, Ohio. C...",True,1,False,1,False
1,1,1,-adjective ;; used to denote an ignorant or fo...,True,1,True,1,True
2,2,0,someone that is really stupid.,False,2,False,2,False
3,2,2,Nasty pieces of poopoo that just will not come...,True,2,True,2,True
4,0,2,A hardedend clump of fecal matter attached to ...,False,2,True,1,False


In [58]:
bert_test.to_csv("../data/final_eval_set.csv")

## Example prediction 

In [39]:
tf_y_test_pred

array(['bad', 'controversial', 'controversial', ..., 'controversial',
       'good', 'bad'], dtype=object)

In [36]:
def pred_ex(text, count_vectorizer, tfidf_vectorizer, model):
    text = tfidf_vectorizer.transform(count_vectorizer.transform([text]))
    pred = model.predict(text)
    print(pred[0])


In [41]:
pred_ex("this could be a very good definition", cv, tv, svm_tf)
pred_ex("nonsense nonsense yada yada", cv, tv, svm_tf)
pred_ex("this is an example of a definition with interesting words and content like a car and a house", cv, tv, svm_tf)
pred_ex("merrily we roll along", cv, tv, svm_tf)
pred_ex("rudolph the red-nosed reindeer had a very shiny nose", cv, tv, svm_tf)

bad
bad
bad
controversial
bad


## Write train and test to file

In [29]:
train_rejoin = pd.DataFrame()
train_rejoin['defn'] = X_train
train_rejoin['cat'] = y_train

In [30]:
test_rejoin = pd.DataFrame()
test_rejoin['defn'] = X_test
test_rejoin['cat'] = y_test

In [27]:
train_rejoin.groupby('cat').size().transform(lambda x: x/sum(x))

cat
bad              0.333563
controversial    0.344133
good             0.322304
dtype: float64

In [42]:
train_rejoin.to_csv("../data/svm_data/train.csv", index = False)
test_rejoin.to_csv("../data/svm_data/test.csv", index = False)