## Regressions


In [21]:
## load packages

import pandas as pd
import datetime

#SKLEARN
from sklearn.model_selection import train_test_split

# for vectorization 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

#For word handeling
import string 
import nltk
nltk.download('punkt') # you will probably need to do this
nltk.download('wordnet') # and this
nltk.download('stopwords') # aand this

# for classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline


import numpy as np


import time

[nltk_data] Downloading package punkt to /Users/mieharder/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mieharder/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mieharder/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
def preprocess(text):
    low_text= text.lower()
    low_text = low_text.translate(str.maketrans('', '', string.punctuation))
    tokens = nltk.word_tokenize(low_text)
    porter = nltk.WordNetLemmatizer()
    lemmatizer=[porter.lemmatize(t) for t in tokens]
    stop_words_list = stoppelop
    sent_sw_removed = [i for i in lemmatizer if i not in stop_words_list]
    lemmas=[i for i in sent_sw_removed if i!='br']
    return lemmas # return a list of stems/lemmas

In [26]:
## Get data
df_c = pd.read_csv('df_comments_final.csv')
df_c = df_c.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
df_c=df_c[df_c['Comments']!='No Comments']

df_c['year'] = df_c['Dates'].str[2:4]

#Divide into target and features
y= df_c['status']

X = df_c['Comments']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=161193)

In [23]:
with open('STOPlist.txt') as f:
    stoppelop = f.read().splitlines()
#stoppelop


In [27]:
#Vectorize - both using count and tfidf

start = time.time()
#using count and own preprocesser
vectorizerc = CountVectorizer(tokenizer=preprocess)

# The top N most frequent features:
#vectorizerc_most = CountVectorizer(max_features=80, tokenizer=preprocess)

#using tfidf vectorizer
vectorizertfidf = TfidfVectorizer(tokenizer=preprocess)


#Using tfidf vectorizer w highest features
#vectorizertfidf = Tfidf_mostVectorizer(tokenizer=preprocess)

# fit and transform train and test set for each vectorizer:
X_train_c = vectorizerc.fit_transform(X_train)
#X_train_c_most = vectorizerc_most.fit_transform(X_train)
#X_train_tf = vectorizertfidf.fit_transform(X_train)


# Only tranform test set: never fit your vectorizer on the test set (it is cheating). Out-of-Vocabulary words are handled automatically be sklearn's vectorizer.
X_test_c = vectorizerc.transform(X_test)
#X_test_bow = vectorizerc_most.transform(X_test)
#X_test_tf = vectorizertfidf.transform(X_test)

end = time.time()
print(end-start)


442.80586218833923


### Check difference in precision of count and tf_idf vectorization (using lasso)

In [None]:
start = time.time()

# classifier
lr = LogisticRegression(random_state=0, penalty = 'l1', solver = 'saga', max_iter=4000)

#training
lr.fit(X_train_c,y_train)

#testing
train_preds = lr.predict(X_train_c)
test_preds = lr.predict(X_test_c)
print("training accuracy:", np.mean([(train_preds==y_train)]))
print("testing accuracy:", np.mean([(test_preds==y_test)]))

end = time.time()
print(end-start)

In [20]:
features = ['_'.join(s.split()) for s in vectorizerc.get_feature_names()]
coefs_df = pd.DataFrame.from_records(lr.coef_, columns=features)
coefs_df
print(coefs_df.T.sort_values(by=[0], ascending=False).head(20))
print()
print(coefs_df.T.sort_values(by=[0], ascending=True).head(20))

                      0
d2l            5.717260
backtests      5.192373
eco            3.917293
aap            3.824813
std            3.766660
cssw           3.506478
declared       3.469832
psets          3.195103
inferential    3.194954
remarkably     3.151806
fanis          3.102304
materialstudy  3.098247
redemption     2.975836
eloquent       2.964848
nobel          2.892067
recommed       2.857539
mat            2.833813
researeasch    2.752742
recycled       2.711949
sucess         2.610963

                    0
pertaining  -2.932618
lawyer      -2.627402
wac         -2.518342
preaches    -2.041941
cm          -2.039950
uh          -1.952922
acct        -1.917501
gi          -1.909742
ae          -1.893546
hybrid      -1.868063
snack       -1.845940
csus        -1.804335
kumu        -1.776401
gov         -1.759752
purchase    -1.757204
smallest    -1.710018
breath      -1.696679
trig        -1.616466
blog        -1.606236
resourceful -1.601962


In [44]:
# classifier
lr = LogisticRegression(random_state=0, penalty = 'l1', solver = 'saga')

#training
lr.fit(X_train_tf,y_train)

#testing
train_preds = lr.predict(X_train_tf)
test_preds = lr.predict(X_test_tf)
print("training accuracy:", np.mean([(train_preds==y_train)]))
print("testing accuracy:", np.mean([(test_preds==y_test)]))

training accuracy: 0.8096923023554307
testing accuracy: 0.7964408367155792


### Check difference between Lasso and Ridge

In [8]:
start = time.time()

# classifier
lr = LogisticRegression(random_state=0, penalty = 'l2', solver = 'saga', max_iter=4000)

#training
lr.fit(X_train_c,y_train)

#testing
train_preds = lr.predict(X_train_c)
test_preds = lr.predict(X_test_c)
print("training accuracy:", np.mean([(train_preds==y_train)]))
print("testing accuracy:", np.mean([(test_preds==y_test)]))

end = time.time()

training accuracy: 0.8515801158636
testing accuracy: 0.7936830055156624
