# Model building

In [2]:
import warnings

import numpy as np
import pandas as pd
import pylab as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, confusion_matrix, make_scorer,
                             precision_recall_fscore_support,
                             precision_score, recall_score)
from sklearn.model_selection import (GridSearchCV, StratifiedKFold,
                                     train_test_split)
warnings.filterwarnings("ignore")

In [3]:
# Load the pre-processed posts; every missing value becomes the
# placeholder string "no_text".
df_orig = pd.read_csv("df_processed.csv").replace(np.nan, "no_text")

In [4]:
df_orig.head(2)

Unnamed: 0.1,Unnamed: 0,id,subreddit,title,author,url,created,ext_link,id.1,body,score,upvote_ratio,body_processed,title_processed,author_processed,body_and_title,body_and_author,title_and_author,body_title_and_author
0,0,koaw8g,finance,Tech's top seven companies added $3.4 trillion...,Obrocheetah,https://www.reddit.com/r/finance/comments/koaw...,1609505868,no_text,koaw8g,[deleted],1,1.0,deleted,top seven companies added trillion value,obrocheetah,deleted top seven companies added trillion value,deleted obrocheetah,top seven companies added trillion value obroc...,deleted top seven companies added trillion val...
1,1,kobbka,finance,Mogo: Establishes ATM Equity Program Mogo a fi...,fintechinshorts,https://www.reddit.com/r/finance/comments/kobb...,1609507913,https://www.fintechinshorts.com/mogo-establish...,kobbka,no_text,1,1.0,no_text,mogo establishes atm equity program mogo finan...,fintechinshorts,no_text mogo establishes atm equity program mo...,no_text fintechinshorts,mogo establishes atm equity program mogo finan...,no_text mogo establishes atm equity program mo...


In [5]:
df_orig.columns

Index(['Unnamed: 0', 'id', 'subreddit', 'title', 'author', 'url', 'created',
       'ext_link', 'id.1', 'body', 'score', 'upvote_ratio', 'body_processed',
       'title_processed', 'author_processed', 'body_and_title',
       'body_and_author', 'title_and_author', 'body_title_and_author'],
      dtype='object')

In [16]:
# Keep only the subreddit, the raw score and the pre-processed text columns.
# `.copy()` makes `df_` an independent frame, so columns added to it later
# are not written into a slice of `df_orig` (SettingWithCopy).
df_ = df_orig[["subreddit", "score",
          'body_processed','title_processed', 'author_processed',
          'body_and_title', 'body_and_author', 'title_and_author',
          'body_title_and_author'
         ]].copy()

df_.head(2)

Unnamed: 0,subreddit,score,body_processed,title_processed,author_processed,body_and_title,body_and_author,title_and_author,body_title_and_author
0,finance,1,deleted,top seven companies added trillion value,obrocheetah,deleted top seven companies added trillion value,deleted obrocheetah,top seven companies added trillion value obroc...,deleted top seven companies added trillion val...
1,finance,1,no_text,mogo establishes atm equity program mogo finan...,fintechinshorts,no_text mogo establishes atm equity program mo...,no_text fintechinshorts,mogo establishes atm equity program mogo finan...,no_text mogo establishes atm equity program mo...


In [17]:
# Count posts on each side of the mean score (ties go to the "bigger" side).
mean_score = df_['score'].mean()
score_values = np.array(df_['score'].values)
print ("Posts with score bigger than average score:", (score_values >= mean_score).sum() )
print ("Posts with score smaller than average score:", (score_values < mean_score).sum() )

Posts with score bigger than average score: 26702
Posts with score smaller than average score: 514736


# Binarize the Score

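- Looking at the score distribution, a threshold barely above 0 gives us a 60/40 split.
- So, convert the score into a binary label: less than or equal to 1, or bigger than 1. (Note: the call below actually passes a threshold of 1000, which gives a far more imbalanced split.)
- Name that column `score_label`.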

In [18]:
def binarize_score(df, threshold):
    """Return a copy of ``df`` with a binary ``score_label`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric ``score`` column.
    threshold : number
        Scores strictly greater than this value are labelled 1; all other
        scores are labelled 0.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns plus ``score_label``.
        The input frame is left untouched, so passing a slice of another
        frame no longer triggers SettingWithCopy writes.
    """
    labelled = df.copy()
    # Vectorised comparison instead of a per-row Python lambda.
    labelled["score_label"] = (labelled["score"] > threshold).astype(int)
    return labelled

In [19]:
df_ = binarize_score(df_, 1000)

In [20]:
df_.head(2)

Unnamed: 0,subreddit,score,body_processed,title_processed,author_processed,body_and_title,body_and_author,title_and_author,body_title_and_author,score_label
0,finance,1,deleted,top seven companies added trillion value,obrocheetah,deleted top seven companies added trillion value,deleted obrocheetah,top seven companies added trillion value obroc...,deleted top seven companies added trillion val...,0
1,finance,1,no_text,mogo establishes atm equity program mogo finan...,fintechinshorts,no_text mogo establishes atm equity program mo...,no_text fintechinshorts,mogo establishes atm equity program mogo finan...,no_text mogo establishes atm equity program mo...,0


## Only GME Data

In [21]:
# Restrict the data to posts from the GME subreddit.
is_gme = df_["subreddit"] == 'GME'
df_GME = df_.loc[is_gme]
print ( df_GME.shape )
df_GME.head(2)

(145597, 10)


Unnamed: 0,subreddit,score,body_processed,title_processed,author_processed,body_and_title,body_and_author,title_and_author,body_title_and_author,score_label
68190,GME,6,watching took position right away https youtu ...,need see gme,titsdownonly,watching took position right away https youtu ...,watching took position right away https youtu ...,need see gme titsdownonly,watching took position right away https youtu ...,0
68191,GME,11,deleted,short squeeze incoming,zoomermoney,deleted short squeeze incoming,deleted zoomermoney,short squeeze incoming zoomermoney,deleted short squeeze incoming zoomermoney,0


In [22]:
print ("Label Distribution for df_GME")
# Rows per label (groupby sorts the labels, so index 0 -> class 0, 1 -> class 1),
# counted via the non-null `subreddit` entries.
count_0_1 = df_GME.groupby("score_label")["subreddit"].count().values
n_total = np.sum(count_0_1)
for label in (0, 1):
    print (f"Class {label} {np.round(100*count_0_1[label]/n_total, 2)} %")


Label Distribution for df_GME
Class 0 97.66 %
Class 1 2.34 %


In [23]:
# Drop posts whose processed body is only a placeholder:
# "deleted", "removed", or the "no_text" fill value.
placeholder_bodies = ['deleted', 'removed', 'no_text']
df_GME2 = df_GME[~df_GME.body_processed.isin(placeholder_bodies)]


In [24]:
print ("Label Distribution for df_GME after removing deleted posts")
# Per-label row counts (class 0 first, then class 1) after the placeholder filter.
count_0_1 = df_GME2.groupby("score_label")["subreddit"].count().values
n_total = np.sum(count_0_1)
for label in (0, 1):
    print (f"Class {label} {np.round(100*count_0_1[label]/n_total, 2)} %")


Label Distribution for df_GME after removing deleted posts
Class 0 97.19 %
Class 1 2.81 %


In [26]:
print (df_GME.shape[0], df_GME2.shape[0])

145597 49794


## Saving the test data

In [28]:
from sklearn.model_selection import train_test_split
# Hold out a stratified 20% of the filtered GME posts (`df_test_`);
# the remaining 80% (`df`) is what the modelling cells below work on.
df, df_test_ = train_test_split(
    df_GME2,
    test_size=0.2,
    random_state=8848,
    stratify=df_GME2['score_label'],
)


In [29]:
# Label distribution of the training portion (class 0 first, then class 1).
count_0_1 = df.groupby("score_label")["subreddit"].count().values
n_total = np.sum(count_0_1)
for label in (0, 1):
    print (f"Class {label} {np.round(100*count_0_1[label]/n_total, 2)} %")


Class 0 97.19 %
Class 1 2.81 %


## TFIDF Vectorizer

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

def prepare_TFIDF(df, feature_col, target_col, test_size=0.2, random_state=8848):
    """Split ``df`` into train/test parts and TF-IDF encode the text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``feature_col`` (text) and ``target_col`` (labels).
    feature_col : str
        Name of the text column to vectorise.
    target_col : str
        Name of the label column; also used to stratify the split.
    test_size : float, default 0.2
        Fraction of rows placed in the test part.
    random_state : int, default 8848
        Seed for the split, so repeated calls give the same partition.

    Returns
    -------
    X_train, X_test : sparse matrices
        TF-IDF features of the train and test text.
    y_train, y_test : numpy.ndarray
        Label values of the train and test rows.
    vectorizer : TfidfVectorizer
        The vectoriser fitted on the training text.
    """
    df = df[[feature_col, target_col]]
    df_train, df_test = train_test_split(df, test_size=test_size,
                                         stratify=df[target_col],
                                         random_state=random_state)
    vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95)

    # Fit on the training text only and keep the matrix produced by the same
    # pass, instead of discarding it and transforming the text a second time.
    X_train = vectorizer.fit_transform(df_train[feature_col].values)
    X_test  = vectorizer.transform(df_test[feature_col].values)

    y_train = df_train[target_col].values
    y_test  = df_test[target_col].values

    return X_train, X_test, y_train, y_test, vectorizer

## Logistic Regression models

In [94]:
columns = ["Accuracy", "Pr_0", "Pr_1", "Re_0", "Re_1", "F1_0", "F1_1"]
df_res = pd.DataFrame([], columns = columns)

In [95]:
# A dataframe to store results
def update_results(df_res, model, X, y, model_name):
    """Write the metrics of ``model`` on ``(X, y)`` into row ``model_name``.

    Parameters
    ----------
    df_res : pandas.DataFrame
        Results table with the columns "Accuracy", "Pr_0", "Pr_1", "Re_0",
        "Re_1", "F1_0" and "F1_1". It is modified in place; the row is
        created if it does not exist yet.
    model : fitted estimator exposing ``predict``.
    X : feature matrix passed to ``model.predict``.
    y : array of true binary labels (0/1).
    model_name : str
        Row label under which the metrics are stored.

    Returns
    -------
    pandas.DataFrame
        The same ``df_res`` object, for convenient reassignment.
    """
    ypred = model.predict(X)
    ac = accuracy_score(y.ravel(), ypred.ravel() )
    # Pin the label order to (0, 1): the metric arrays then always have two
    # entries matching the *_0 / *_1 columns, even if a class happens to be
    # absent from both ``y`` and the predictions.
    pr, re, f1, _ = precision_recall_fscore_support(y.ravel(), ypred.ravel(),
                                                    labels=[0, 1])
    # All metrics are stored as percentages rounded to two decimals.
    df_res.loc[model_name, "Accuracy"] = np.round(100*ac, 2)
    df_res.loc[model_name, ["Pr_0", "Pr_1"]] = np.round(100*pr, 2)
    df_res.loc[model_name, ["Re_0", "Re_1"]] = np.round(100*re, 2)
    df_res.loc[model_name, ["F1_0", "F1_1"]] = np.round(100*f1, 2)
    return df_res

In [34]:
# Baseline features: TF-IDF of the combined body + title text,
# predicting the binarised score.
xcol = "body_and_title"
ycol = "score_label"

X_train, X_test, y_train, y_test, vectorizer = prepare_TFIDF(df, xcol, ycol)

In [35]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((31868, 58935), (7967, 58935), (31868,), (7967,))

In [36]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [37]:
pred = model.predict(X_test)

In [38]:
accuracy_score(y_test, pred)

0.9732647169574494

In [41]:
confusion_matrix(y_test, pred)

array([[7742,    1],
       [ 212,   12]])

In [43]:
sum(y_test==1), sum(pred==1)

(224, 13)

In [44]:
pr, re, f1, _ = precision_recall_fscore_support(y_test, pred)

In [45]:
pr

array([0.97334674, 0.92307692])

In [49]:
7742/(7742+212), 12/(12+1)

(0.9733467437767162, 0.9230769230769231)

In [92]:
# Per-sample weights: 1 for class 0, wt_factor / (share of class 1) for class 1.
wt_factor = 5.
positive_weight = wt_factor / y_train.mean()
w = np.where(y_train == 1, positive_weight, 1.0)
w[20:50]

array([  1.        ,   1.        ,   1.        ,   1.        ,
       178.03351955,   1.        ,   1.        ,   1.        ,
         1.        ,   1.        ,   1.        , 178.03351955,
         1.        ,   1.        ,   1.        ,   1.        ,
         1.        ,   1.        ,   1.        ,   1.        ,
         1.        ,   1.        ,   1.        ,   1.        ,
         1.        ,   1.        ,   1.        ,   1.        ,
         1.        ,   1.        ])

### Grid search focusing on precision and recall

In [None]:

# NOTE(review): scratch snippet. This cell has no execution count, and neither
# `weight_factor` nor `gridModel` is defined in any cell visible in this
# notebook, so running it as-is is expected to raise NameError - confirm
# whether it can be removed. grid_search_CV_LR builds the same kind of
# class-1 weights from its `wt_factor` argument.
weights = np.ones(y_train.shape)
# Class-1 samples get weight_factor / (fraction of class-1 labels); others stay at 1.
weights[y_train == 1] = weight_factor/np.mean(y_train)
gridModel.fit(X_train, y_train, sample_weight = weights)

In [84]:
def print_metrics(model, X, y):
    """Print evaluation metrics of ``model`` on ``(X, y)``.

    Shows the confusion matrix, then accuracy and the per-class precision,
    recall and F1 arrays, each as a percentage rounded to two decimals.
    Nothing is returned.
    """
    y_hat = model.predict(X)
    cm = confusion_matrix(y, y_hat)
    accuracy = accuracy_score(y, y_hat)
    precision, recall, fscore, _ = precision_recall_fscore_support(y, y_hat)

    print ("Confusion Matrix:\n", cm)
    report = (("Accuracy", accuracy),
              ("Precision", precision),
              ("Recall", recall),
              ("F1", fscore))
    for title, value in report:
        print (f"{title} Score: {np.round(100*value,2)}")


In [77]:
def grid_search_CV_LR(params, df=df,
                      xcol="body_processed",
                      ycol="score_label",
                      N_cv=5,
                      wt_factor=1,
                      scorer = "precision",
                      verbose = 0,
                     ):
    """Grid-search a LogisticRegression on TF-IDF features of ``df[xcol]``.

    Parameters
    ----------
    params : dict or list of dict
        Parameter grid handed to ``GridSearchCV``.
    df : pandas.DataFrame
        Data to split and vectorise. The default is whatever the global
        ``df`` was when this ``def`` cell was executed - re-run the cell
        after changing ``df``.
    xcol, ycol : str
        Text column and label column passed to ``prepare_TFIDF``.
    N_cv : int
        Number of stratified, shuffled CV folds (fixed seed 8848).
    wt_factor : number
        Class-1 training samples are weighted by
        ``wt_factor / y_train.mean()``; all other samples get weight 1.
    scorer : {"precision", "recall", "accuracy"}
        Metric used to rank the candidates; any other value raises KeyError.
    verbose : int
        Verbosity forwarded to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(model_grid, X_train, X_test, y_train, y_test, grid_res)`` where
        ``grid_res`` is the value returned by ``model_grid.fit``.
    """
    X_train, X_test, y_train, y_test, vectorizer = prepare_TFIDF(df, xcol, ycol)

    # Map the metric name onto its scikit-learn function and wrap it as a scorer.
    metric_by_name = dict(precision=precision_score,
                          recall=recall_score,
                          accuracy=accuracy_score)
    scoring = make_scorer(metric_by_name[scorer])

    folds = StratifiedKFold(n_splits=N_cv, shuffle=True, random_state=8848)
    search = GridSearchCV(estimator=LogisticRegression(max_iter=1000),
                          param_grid=params,
                          scoring=scoring,
                          cv=folds,
                          verbose=verbose)

    # Up-weight the class-1 samples; everything else keeps weight 1.
    sample_weights = np.where(y_train == 1, wt_factor / y_train.mean(), 1.0)

    grid_res = search.fit(X_train, y_train, sample_weight=sample_weights)
    print ( "Best Score:", grid_res.best_score_, grid_res.best_params_)
    return search, X_train, X_test, y_train, y_test, grid_res


In [73]:
# L1 is preferable with this many features/weights, but LogisticRegression's
# default solver (lbfgs) does not support it, so the 'l1' candidates get the
# liblinear solver; otherwise those fits fail and are never really evaluated.
# The capitalised string 'None' is not a valid penalty either: the
# unpenalised model is requested with None (scikit-learn < 1.2 spells it
# 'none'), and C is irrelevant for it.
params = [{'penalty': ['l1'], 'solver': ['liblinear'], 'C': [1, 10, 20]},
          {'penalty': ['l2'], 'C': [1, 10, 20]},
          {'penalty': [None]}]

model_grid, X_train, X_test, y_train, y_test, grid_res = grid_search_CV_LR(params)


Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END ..............................C=0.5, penalty=l1; total time=   0.0s
[CV 2/5] END ..............................C=0.5, penalty=l1; total time=   0.0s
[CV 3/5] END ..............................C=0.5, penalty=l1; total time=   0.0s
[CV 4/5] END ..............................C=0.5, penalty=l1; total time=   0.0s
[CV 5/5] END ..............................C=0.5, penalty=l1; total time=   0.0s
[CV 1/5] END ..............................C=0.5, penalty=l2; total time=   1.1s
[CV 2/5] END ..............................C=0.5, penalty=l2; total time=   1.2s
[CV 3/5] END ..............................C=0.5, penalty=l2; total time=   1.2s
[CV 4/5] END ..............................C=0.5, penalty=l2; total time=   1.1s
[CV 5/5] END ..............................C=0.5, penalty=l2; total time=   1.2s
[CV 1/5] END ............................C=0.5, penalty=None; total time=   0.0s
[CV 2/5] END ............................C=0.5, 

In [74]:
print_metrics(model_grid, X_train, y_train)

For Training Data
[[30491   482]
 [    5   890]]
0.9847182126270867 [0.99983604 0.64868805] [0.98443806 0.99441341] [0.99207731 0.78517865]


In [78]:
# L1 is preferable with this many features/weights, but LogisticRegression's
# default solver (lbfgs) does not support it, so the 'l1' candidates get the
# liblinear solver; otherwise those fits fail and only 'l2' is really compared.
params = [{'penalty': ['l1'], 'solver': ['liblinear'], 'C': [1, 10, 20]},
          {'penalty': ['l2'], 'C': [1, 10, 20]}]

model_grid, X_train, X_test, y_train, y_test, grid_res = grid_search_CV_LR(params,
                                                                           scorer = "precision")

print_metrics(model_grid, X_train, y_train)

Best Score: 0.1557259861698394 {'C': 10, 'penalty': 'l2'}
[[30491   482]
 [    5   890]]
0.9847182126270867 [0.99983604 0.64868805] [0.98443806 0.99441341] [0.99207731 0.78517865]


In [85]:
print_metrics(model_grid, X_test, y_test)


Confusion Matrix:
 [[7398  345]
 [ 157   67]]
Accuracy Score: 93.7
Precision Score: [97.92 16.26]
Recall Score: [95.54 29.91]
F1 Score: [96.72 21.07]


In [93]:
# L1 is preferable with this many features/weights, but LogisticRegression's
# default solver (lbfgs) does not support it, so the 'l1' candidates get the
# liblinear solver; otherwise those fits fail and only 'l2' is really compared.
params = [{'penalty': ['l1'], 'solver': ['liblinear'], 'C': [5, 10, 15]},
          {'penalty': ['l2'], 'C': [5, 10, 15]}]

model_grid, X_train, X_test, y_train, y_test, grid_res = grid_search_CV_LR(params,
                                                                           scorer = "precision",
                                                                           wt_factor=5.)

print_metrics(model_grid, X_test, y_test)


Best Score: 0.12396957833725034 {'C': 15, 'penalty': 'l2'}
Confusion Matrix:
 [[7213  530]
 [ 152   72]]
Accuracy Score: 91.44
Precision Score: [97.94 11.96]
Recall Score: [93.16 32.14]
F1 Score: [95.49 17.43]


In [96]:
df_res = update_results(df_res, model_grid, X_test, y_test, "scr:Precision, w:.5")
df_res

Unnamed: 0,Accuracy,Pr_0,Pr_1,Re_0,Re_1,F1_0,F1_1
"scr:Precision, w:.5",91.44,97.94,11.96,93.16,32.14,95.49,17.43


In [97]:
# L1 is preferable with this many features/weights, but LogisticRegression's
# default solver (lbfgs) does not support it, so the 'l1' candidates get the
# liblinear solver; otherwise those fits fail and only 'l2' is really compared.
params = [{'penalty': ['l1'], 'solver': ['liblinear'], 'C': [5, 10, 15]},
          {'penalty': ['l2'], 'C': [5, 10, 15]}]

model_grid, X_train, X_test, y_train, y_test, grid_res = grid_search_CV_LR(params,
                                                                           scorer = "recall",
                                                                           wt_factor=5.)

print ("Confusion Matrix:\n", confusion_matrix(y_test, model_grid.predict(X_test)) )
df_res = update_results(df_res, model_grid, X_test, y_test, "scr:Recall, w:.5")
df_res

Best Score: 0.3620111731843575 {'C': 5, 'penalty': 'l2'}
Confusion Matrix:
 [[7050  693]
 [ 135   89]]


Unnamed: 0,Accuracy,Pr_0,Pr_1,Re_0,Re_1,F1_0,F1_1
"scr:Precision, w:.5",91.44,97.94,11.96,93.16,32.14,95.49,17.43
"scr:Recall, w:.5",89.61,98.12,11.38,91.05,39.73,94.45,17.69


In [100]:
# Re-label the recall row in the "scr=..., w=..." style. The row was stored
# above as "scr:Recall, w:.5", so dropping only "scr:Recall, w:5." raised a
# KeyError on a fresh run; drop whichever spelling exists and ignore the other.
df_res = df_res.drop(["scr:Recall, w:.5", "scr:Recall, w:5."], axis=0, errors="ignore")
df_res = update_results(df_res, model_grid, X_test, y_test, "scr=Recall, w=5.")
df_res


Unnamed: 0,Accuracy,Pr_0,Pr_1,Re_0,Re_1,F1_0,F1_1
"scr:Precision, w:.5",91.44,97.94,11.96,93.16,32.14,95.49,17.43
"scr=Recall, w=5.",89.61,98.12,11.38,91.05,39.73,94.45,17.69


## LSTM