## Regressions


In [1]:
## load packages

import pandas as pd
import datetime

#SKLEARN
from sklearn.model_selection import train_test_split

# for vectorization 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# For word handling
import string 
import nltk
nltk.download('punkt') # you will probably need to do this
nltk.download('wordnet') # and this
nltk.download('stopwords') # aand this

# for classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline


import numpy as np


import time

[nltk_data] Downloading package punkt to C:\Users\Liv
[nltk_data]     Nøhr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Liv
[nltk_data]     Nøhr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Liv
[nltk_data]     Nøhr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Translation table that deletes every ASCII punctuation character. Built once
# here instead of on every call, since preprocess runs once per document.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess(text):
    """Turn one raw comment into a list of cleaned lemmas.

    Steps: lowercase, strip ASCII punctuation, tokenize with NLTK,
    lemmatize each token with the WordNet lemmatizer, then drop stop words
    and the literal token 'br'.

    The stop words are read from the global ``stoppelop`` at call time, so
    that name must be defined before this function is first called.

    Parameters
    ----------
    text : str
        The raw document text.

    Returns
    -------
    list of str
        The remaining lemmas, in their original order.
    """
    cleaned_text = text.lower().translate(_PUNCTUATION_TABLE)
    tokens = nltk.word_tokenize(cleaned_text)

    lemmatizer = nltk.WordNetLemmatizer()

    # A set gives O(1) membership tests; scanning the stop-word list for every
    # token is what made this function slow on a large corpus.
    excluded = set(stoppelop)
    # 'br' is filtered as well -- presumably residue of HTML <br> tags once the
    # angle brackets are stripped; TODO confirm against the raw comments.
    excluded.add('br')

    return [
        lemma
        for lemma in (lemmatizer.lemmatize(token) for token in tokens)
        if lemma not in excluded
    ]

In [3]:
## Get data
# Drop every row that has any missing value. A bare dropna() already means
# axis=0, how='any'; passing `how` and `thresh` together (even thresh=None)
# raises TypeError in pandas >= 2.0.
df_c = pd.read_csv('df_comments_final.csv').dropna()

# Remove placeholder rows that carry no comment text. The explicit copy makes
# df_c an independent frame, so adding a column below does not trigger
# SettingWithCopyWarning.
df_c = df_c[df_c['Comments'] != 'No Comments'].copy()

# Characters 2-3 of the 'Dates' string -- presumably the two-digit year;
# TODO confirm against the actual date format in the CSV.
df_c['year'] = df_c['Dates'].str[2:4]

# Divide into target and features
y = df_c['status']
X = df_c['Comments']

# Fixed random_state keeps the train/test partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=161193)

In [4]:
# Read the custom stop-word list: one entry per line of STOPlist.txt.
# `stoppelop` is the global that preprocess() consults when filtering tokens.
with open('STOPlist.txt') as stop_file:
    stop_file_contents = stop_file.read()
    stoppelop = stop_file_contents.splitlines()
#stoppelop


In [5]:
# Vectorize - both using count and tf-idf

start = time.time()

# Bag-of-words counts, tokenized with our own preprocess function.
# (To keep only the N most frequent terms, pass max_features=N.)
vectorizerc = CountVectorizer(tokenizer=preprocess)

# Fit the vocabulary on the training set only, then transform the test set.
# Never fit the vectorizer on the test set (that leaks test information);
# out-of-vocabulary words are handled automatically by sklearn's vectorizer.
X_train_c = vectorizerc.fit_transform(X_train)
X_test_c = vectorizerc.transform(X_test)

# tf-idf features, derived from the count matrices above. TfidfVectorizer is
# defined as CountVectorizer followed by TfidfTransformer, so this gives the
# same features without running the slow tokenizer a second time.
# The idf weights are likewise fitted on the training set only.
tfidf_transformer = TfidfTransformer()
X_train_tf = tfidf_transformer.fit_transform(X_train_c)
X_test_tf = tfidf_transformer.transform(X_test_c)

end = time.time()
# Elapsed wall-clock time of the whole vectorization step, in seconds.
print(end-start)


655.4979617595673


### Compare the accuracy of count and tf-idf vectorization (using lasso)

In [6]:
# Lasso (L1-penalized) logistic regression on the count features.
# fit() returns the estimator itself, so construction and training are chained.
lr = LogisticRegression(
    random_state=0,
    penalty='l1',
    solver='saga',
    max_iter=4000,
).fit(X_train_c, y_train)

# Predict on the data the model was trained on and on the held-out data.
train_preds = lr.predict(X_train_c)
test_preds = lr.predict(X_test_c)

# Accuracy = share of predictions that equal the true label.
train_accuracy = np.mean([(train_preds == y_train)])
test_accuracy = np.mean([(test_preds == y_test)])
print("training accuracy:", train_accuracy)
print("testing accuracy:", test_accuracy)

training accuracy: 0.8260138065008499
testing accuracy: 0.7912373816213966


In [7]:
# Vocabulary terms in column order of the count matrix.
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on old versions that do not have it yet.
if hasattr(vectorizerc, "get_feature_names_out"):
    vocabulary_terms = vectorizerc.get_feature_names_out()
else:
    vocabulary_terms = vectorizerc.get_feature_names()

# Replace any internal whitespace in a term with underscores so that every
# term is a single token when used as a column name.
features = ['_'.join(s.split()) for s in vocabulary_terms]

# One row per row of lr.coef_, one column per vocabulary term.
coefs_df = pd.DataFrame.from_records(lr.coef_, columns=features)

# Transpose once so terms become the index, then rank by the first coefficient row.
coefs_by_term = coefs_df.T

# 50 largest coefficients
print(coefs_by_term.sort_values(by=[0], ascending=False).head(50))
print()
# 50 smallest coefficients
print(coefs_by_term.sort_values(by=[0], ascending=True).head(50))

                         0
backtests         5.118388
declared          3.449085
inferential       3.191841
remarkably        3.135088
eloquent          2.916783
redemption        2.870802
recommed          2.792675
nobel             2.791912
recycled          2.625961
researeasch       2.592687
testsassignments  2.555164
sucess            2.546583
pset              2.444369
australian        2.394399
superfluous       2.365215
9am               2.327386
neuroscience      2.322469
coursera          2.308388
persian           2.269973
mistaken          2.261013
xcredit           2.253713
catchup           2.244231
caltech           2.209440
mentorship        2.205322
legendary         2.183779
arrogent          2.177932
jam               2.175048
china             2.174193
ditsy             2.163920
900               2.159900
smoked            2.154484
relevance         2.146844
expo20            2.130280
gd                2.129583
quarter           2.127280
summarized        2.127201
o

In [8]:
# Persist the coefficient table (columns = vocabulary terms) to disk for later inspection.
# Written relative to the current working directory; an existing file is overwritten.
coefs_df.to_csv("coef_final.csv")

In [44]:
# classifier - lasso (L1-penalized) logistic regression on the tf-idf features.
# Requires X_train_tf / X_test_tf (the tf-idf feature matrices) to exist in the kernel.
# max_iter matches the count-based lasso model so the two runs differ only in
# their features; with saga's default of 100 iterations the solver may stop
# before converging, which would skew the comparison.
lr = LogisticRegression(random_state=0, penalty='l1', solver='saga', max_iter=4000)

# training
lr.fit(X_train_tf, y_train)

# testing
train_preds = lr.predict(X_train_tf)
test_preds = lr.predict(X_test_tf)

# Accuracy = share of predictions that equal the true label.
print("training accuracy:", np.mean([(train_preds==y_train)]))
print("testing accuracy:", np.mean([(test_preds==y_test)]))

training accuracy: 0.8096923023554307
testing accuracy: 0.7964408367155792


### Check difference between Lasso and Ridge

In [8]:
start = time.time()

# classifier - ridge (L2-penalized) logistic regression on the count features,
# with the same solver and iteration budget as the lasso model it is compared to.
lr = LogisticRegression(random_state=0, penalty='l2', solver='saga', max_iter=4000)

# training
lr.fit(X_train_c, y_train)

# testing
train_preds = lr.predict(X_train_c)
test_preds = lr.predict(X_test_c)

# Accuracy = share of predictions that equal the true label.
print("training accuracy:", np.mean([(train_preds==y_train)]))
print("testing accuracy:", np.mean([(test_preds==y_test)]))

end = time.time()
# The cell was timed but never reported the result; print the elapsed seconds.
print(end-start)

training accuracy: 0.8515801158636
testing accuracy: 0.7936830055156624
