# Model building

In [1]:
import sqlite3
import numpy as np
import pandas as pd
import pylab as plt
import seaborn as sns

In [2]:
conn = sqlite3.connect('../redditPosts.sqlite')
c = conn.cursor()

In [3]:
cmd = 'SELECT * FROM Posts P JOIN PostBodyAndScore B ON P.id = B.id'
df_ = pd.read_sql(sql=cmd, con=conn)

In [4]:
print (df_.shape)
df_.head(2)

(76141, 11)


Unnamed: 0,id,subreddit,title,author,url,created,ext_link,id.1,body,score,upvote_ratio
0,koaw8g,finance,Tech's top seven companies added $3.4 trillion...,Obrocheetah,https://www.reddit.com/r/finance/comments/koaw...,1609505868,,koaw8g,[deleted],1,1.0
1,kobbka,finance,Mogo: Establishes ATM Equity Program Mogo a fi...,fintechinshorts,https://www.reddit.com/r/finance/comments/kobb...,1609507913,https://www.fintechinshorts.com/mogo-establish...,kobbka,,1,1.0


## Prototype Model: 
- Since Pennystocks has the most number of posts with body let's use it for a base model.

In [5]:
df1 = df_[ (df_.body != '[deleted]') & (df_.body != '[removed]') & (df_.body != '')   ]
print (df1.shape)
df1.head(2)

(16950, 11)


Unnamed: 0,id,subreddit,title,author,url,created,ext_link,id.1,body,score,upvote_ratio
97,ko6pbd,options,$STX $69 call dropped from .60 premium to .01 ...,Faizen22,https://www.reddit.com/r/options/comments/ko6p...,1609482915,,ko6pbd,This morning I created a new scanner that aler...,35,0.85
103,koaa9c,options,SPY 380/379 Dec 31 Bear Call Spread Trading at...,casserolejoe9599,https://www.reddit.com/r/options/comments/koaa...,1609502828,https://www.reddit.com/r/options/comments/koaa...,koaa9c,I had a Bear Call Spread 380/379 that was expi...,2,0.63


In [7]:
df = df_[["title", "author", "subreddit", "body", "upvote_ratio"]]
df.head(2)

Unnamed: 0,title,author,subreddit,body,upvote_ratio
0,Tech's top seven companies added $3.4 trillion...,Obrocheetah,finance,[deleted],1.0
1,Mogo: Establishes ATM Equity Program Mogo a fi...,fintechinshorts,finance,,1.0


## Text Preprocessing

In [8]:
import re
txt = "Something Some ABCD $ABCD and $ZZZ $acd and also ZZZ to the Moon a32 %#Q@*()"
re.sub("[^a-zA-Z]", " ", txt)

'Something Some ABCD  ABCD and  ZZZ  acd and also ZZZ to the Moon a     Q    '

In [10]:
import re
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
words_to_keep = ['above','all', 'below', 'further', 'until', 'under']

for word in words_to_keep:
    STOPWORDS.remove(word)

In [12]:
def prepare_text(text):
    if not text:
        return ""
    text = text.lower()
    # remove \n
    text = re.sub("\\n", " ", text)
    # remove 've, 're
    text = re.sub("[a-z]*\'[r,v]e", "", text)
    # remove 's, 't, 'r, 'v
    text = re.sub("[a-z]*\'[s,t,r,v]", "", text) 
    # Replace everything not a letter with a space
    text = re.sub("[^a-zA-Z]", " ", text)
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in STOPWORDS])
    return text

In [15]:
df['body_processed'] = df['body'].apply(prepare_text)
df.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,title,author,subreddit,body,upvote_ratio,body_processed
0,Tech's top seven companies added $3.4 trillion...,Obrocheetah,finance,[deleted],1.0,deleted
1,Mogo: Establishes ATM Equity Program Mogo a fi...,fintechinshorts,finance,,1.0,


In [16]:
df['title_processed'] = df['title'].apply(prepare_text)
df.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,title,author,subreddit,body,upvote_ratio,body_processed,title_processed
0,Tech's top seven companies added $3.4 trillion...,Obrocheetah,finance,[deleted],1.0,deleted,top seven companies added trillion value
1,Mogo: Establishes ATM Equity Program Mogo a fi...,fintechinshorts,finance,,1.0,,mogo establishes atm equity program mogo finan...


In [18]:
df['body_and_title'] = df['body_processed']+' '+df['title_processed']
df.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,title,author,subreddit,body,upvote_ratio,body_processed,title_processed,body_and_title
0,Tech's top seven companies added $3.4 trillion...,Obrocheetah,finance,[deleted],1.0,deleted,top seven companies added trillion value,deleted top seven companies added trillion value
1,Mogo: Establishes ATM Equity Program Mogo a fi...,fintechinshorts,finance,,1.0,,mogo establishes atm equity program mogo finan...,mogo establishes atm equity program mogo fina...


# Binarize the labels

- convert the upvote_ratio into 1s and not-ones
- name that column label

In [20]:
df["label"] = df["upvote_ratio"].apply(lambda x: int(float(x)==1.0))
df.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,title,author,subreddit,body,upvote_ratio,body_processed,title_processed,body_and_title,label
0,Tech's top seven companies added $3.4 trillion...,Obrocheetah,finance,[deleted],1.0,deleted,top seven companies added trillion value,deleted top seven companies added trillion value,1
1,Mogo: Establishes ATM Equity Program Mogo a fi...,fintechinshorts,finance,,1.0,,mogo establishes atm equity program mogo finan...,mogo establishes atm equity program mogo fina...,1


In [21]:
# Label distribution
df.groupby("label").count()

Unnamed: 0_level_0,title,author,subreddit,body,upvote_ratio,body_processed,title_processed,body_and_title
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,25196,25196,25196,25196,25196,25196,25196,25196
1,50945,50945,50945,50945,50945,50945,50945,50945


## TFIDF Vectorizer

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

def prepare_TFIDF(df, feature_col, target_column):
    df_train, df_test = train_test_split(df, test_size=0.2,
                                         stratify=df["label"],
                                         random_state = 8848)
    vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95)
    
    vectorizer.fit_transform(df_train[feature_col].values)
    
    X_train = vectorizer.transform(df_train[feature_col].values)
    X_test  = vectorizer.transform(df_test[feature_col].values)

    y_train = df_train[target_column].values
    y_test  = df_test[target_column].values

    return X_train, X_test, y_train, y_test, vectorizer

## Logistic Regression with Text Body only

In [23]:
X_train, X_test, y_train, y_test, vectorizer= prepare_TFIDF(df,
                                                            "body_processed",
                                                            "label")

print ( f"Shapes: \nX_train:\t{ X_train.shape}\nX_test :\t{X_test.shape},\
       \ny_train:\t{y_train.shape}, \ny_test :\t{y_test.shape}")

Shapes: 
X_train:	(60912, 50121)
X_test :	(15229, 50121),       
y_train:	(60912,), 
y_test :	(15229,)


In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [32]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [40]:
columns = ["Accuracy", "Pr_0", "Pr_1", "Re_0", "Re_1", "F1_0", "F1_1"]
df_res = pd.DataFrame([], columns = columns)

In [41]:
def update_results(df_res, model, X, y, model_name):
    ypred = model.predict(X)
    ac = accuracy_score(y.ravel(), ypred.ravel() )
    pr, re, f1, _ = precision_recall_fscore_support(y.ravel(), ypred.ravel() )
    df_res.loc[model_name, "Accuracy"] = np.round(100*ac, 2)
    df_res.loc[model_name, ["Pr_0", "Pr_1"]] = np.round(100*pr, 2)
    df_res.loc[model_name, ["Re_0", "Re_1"]] = np.round(100*re, 2)
    df_res.loc[model_name, ["F1_0", "F1_1"]] = np.round(100*f1, 2)
    return df_res

df_res = update_results(df_res, model, X_train, y_train,'LR : Body Only')
df_res.head()

Unnamed: 0,Accuracy,Pr_0,Pr_1,Re_0,Re_1,F1_0,F1_1
LR : Body Only,80.19,78.0,80.87,55.92,92.2,65.14,86.17


## Logistic Regression with Title only

In [43]:
X_train, X_test, y_train, y_test, vectorizer= prepare_TFIDF(df,
                                                            "title_processed",
                                                            "label")

In [44]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
df_res = update_results(df_res, model, X_train, y_train,'LR : Title Only')
df_res.head()

Unnamed: 0,Accuracy,Pr_0,Pr_1,Re_0,Re_1,F1_0,F1_1
LR : Body Only,80.19,78.0,80.87,55.92,92.2,65.14,86.17
LR : Title Only,71.29,72.73,71.14,21.21,96.07,32.84,81.75


## Logistic Regression with Title + Body

In [45]:
X_train, X_test, y_train, y_test, vectorizer= prepare_TFIDF(df,
                                                            "body_and_title",
                                                            "label")

In [46]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
df_res = update_results(df_res, model, X_train, y_train,'LR : Title + Body')
df_res.head()

Unnamed: 0,Accuracy,Pr_0,Pr_1,Re_0,Re_1,F1_0,F1_1
LR : Body Only,80.19,78.0,80.87,55.92,92.2,65.14,86.17
LR : Title Only,71.29,72.73,71.14,21.21,96.07,32.84,81.75
LR : Title + Body,80.19,78.0,80.87,55.92,92.2,65.14,86.17


# Decision Trees

In [39]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

In [49]:
# Body only
X_train, X_test, y_train, y_test, vectorizer= prepare_TFIDF(df,
                                                            "body_processed",
                                                            "label")

model = DecisionTreeClassifier(random_state = 8848)
model.fit(X_train, y_train)
df_res = update_results(df_res, model, X_train, y_train,'DT : Body Only')
df_res

Unnamed: 0,Accuracy,Pr_0,Pr_1,Re_0,Re_1,F1_0,F1_1
LR : Body Only,80.19,78.0,80.87,55.92,92.2,65.14,86.17
LR : Title Only,71.29,72.73,71.14,21.21,96.07,32.84,81.75
LR : Title + Body,80.19,78.0,80.87,55.92,92.2,65.14,86.17
DT : Body Only,83.53,99.77,80.27,50.34,99.94,66.92,89.04


In [50]:
# Title only
X_train, X_test, y_train, y_test, vectorizer= prepare_TFIDF(df,
                                                            "title_processed",
                                                            "label")

model = DecisionTreeClassifier(random_state = 8848)
model.fit(X_train, y_train)
df_res = update_results(df_res, model, X_train, y_train,'DT : Title Only')
df_res

Unnamed: 0,Accuracy,Pr_0,Pr_1,Re_0,Re_1,F1_0,F1_1
LR : Body Only,80.19,78.0,80.87,55.92,92.2,65.14,86.17
LR : Title Only,71.29,72.73,71.14,21.21,96.07,32.84,81.75
LR : Title + Body,80.19,78.0,80.87,55.92,92.2,65.14,86.17
DT : Body Only,83.53,99.77,80.27,50.34,99.94,66.92,89.04
DT : Title Only,97.24,95.08,98.33,96.65,97.53,95.86,97.93


In [52]:
# Body + Title only
X_train, X_test, y_train, y_test, vectorizer= prepare_TFIDF(df,
                                                            "body_and_title",
                                                            "label")

model = DecisionTreeClassifier(random_state = 8848)
model.fit(X_train, y_train)
df_res = update_results(df_res, model, X_train, y_train,'DT : Title + Body')
df_res

Unnamed: 0,Accuracy,Pr_0,Pr_1,Re_0,Re_1,F1_0,F1_1
LR : Body Only,80.19,78.0,80.87,55.92,92.2,65.14,86.17
LR : Title Only,71.29,72.73,71.14,21.21,96.07,32.84,81.75
LR : Title + Body,80.19,78.0,80.87,55.92,92.2,65.14,86.17
DT : Body Only,83.53,99.77,80.27,50.34,99.94,66.92,89.04
DT : Title Only,97.24,95.08,98.33,96.65,97.53,95.86,97.93


# Decision Tree: Grid Search CV

In [94]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import StratifiedKFold


In [79]:
def grid_search_CV(model, params, X_train, y_train):
    #max_depths = 
    #
    model_grid = GridSearchCV(estimator = model,
                              param_grid = params,
                              cv = StratifiedKFold(n_splits=5, random_state=8848, shuffle=True), 
                              verbose=2)
    grid_res = model_grid.fit(X_train, y_train)
    print ( "Besy Score:", grid_res.best_score_, grid_res.best_params_)
    return grid_res

In [87]:
model = DecisionTreeClassifier(random_state = 8848)
params = {"max_depth": [5, 6, 7, 8] }
X_train, X_test, y_train, y_test, vectorizer= prepare_TFIDF(df,
                                                            "title_processed",
                                                            "label")

grid_res = grid_search_CV(model, params, X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END ........................................max_depth=5; total time=   0.8s
[CV] END ........................................max_depth=5; total time=   0.7s
[CV] END ........................................max_depth=5; total time=   0.8s
[CV] END ........................................max_depth=5; total time=   0.9s
[CV] END ........................................max_depth=5; total time=   0.8s
[CV] END ........................................max_depth=6; total time=   0.8s
[CV] END ........................................max_depth=6; total time=   1.0s
[CV] END ........................................max_depth=6; total time=   1.3s
[CV] END ........................................max_depth=6; total time=   1.1s
[CV] END ........................................max_depth=6; total time=   1.0s
[CV] END ........................................max_depth=7; total time=   1.1s
[CV] END ........................................

In [88]:
model = DecisionTreeClassifier(random_state = 8848, max_depth=6)
model.fit(X_train, y_train)
df_res = update_results(df_res, model, X_train, y_train,'DT : Title Only [CV]')
df_res

Unnamed: 0,Accuracy,Pr_0,Pr_1,Re_0,Re_1,F1_0,F1_1
LR : Body Only,80.19,78.0,80.87,55.92,92.2,65.14,86.17
LR : Title Only,71.29,72.73,71.14,21.21,96.07,32.84,81.75
LR : Title + Body,80.19,78.0,80.87,55.92,92.2,65.14,86.17
DT : Body Only,83.53,99.77,80.27,50.34,99.94,66.92,89.04
DT : Title Only,97.24,95.08,98.33,96.65,97.53,95.86,97.93
DT : Title + Body,98.97,98.18,99.36,98.71,99.09,98.44,99.23
DT : Title Only [CV],67.14,96.69,67.07,0.72,99.99,1.44,80.28
DT : Title + Body [CV],76.03,74.83,76.29,41.51,93.1,53.4,83.86


## Decision Tree: Grid Search CV Title + Body

In [89]:
model = DecisionTreeClassifier(random_state = 8848)
params = {"max_depth": [7, 8, 9, 10] }
X_train, X_test, y_train, y_test, vectorizer= prepare_TFIDF(df,
                                                            "body_and_title",
                                                            "label")

grid_res = grid_search_CV(model, params, X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END ........................................max_depth=7; total time=   3.0s
[CV] END ........................................max_depth=7; total time=   3.5s
[CV] END ........................................max_depth=7; total time=   2.6s
[CV] END ........................................max_depth=7; total time=   2.5s
[CV] END ........................................max_depth=7; total time=   2.6s
[CV] END ........................................max_depth=8; total time=   2.8s
[CV] END ........................................max_depth=8; total time=   2.7s
[CV] END ........................................max_depth=8; total time=   2.8s
[CV] END ........................................max_depth=8; total time=   2.8s
[CV] END ........................................max_depth=8; total time=   2.8s
[CV] END ........................................max_depth=9; total time=   3.1s
[CV] END ........................................

In [90]:
model = DecisionTreeClassifier(random_state = 8848, max_depth=9)
model.fit(X_train, y_train)
df_res = update_results(df_res, model, X_train, y_train,'DT : Title + Body [CV]')
df_res

Unnamed: 0,Accuracy,Pr_0,Pr_1,Re_0,Re_1,F1_0,F1_1
LR : Body Only,80.19,78.0,80.87,55.92,92.2,65.14,86.17
LR : Title Only,71.29,72.73,71.14,21.21,96.07,32.84,81.75
LR : Title + Body,80.19,78.0,80.87,55.92,92.2,65.14,86.17
DT : Body Only,83.53,99.77,80.27,50.34,99.94,66.92,89.04
DT : Title Only,97.24,95.08,98.33,96.65,97.53,95.86,97.93
DT : Title + Body,98.97,98.18,99.36,98.71,99.09,98.44,99.23
DT : Title Only [CV],67.14,96.69,67.07,0.72,99.99,1.44,80.28
DT : Title + Body [CV],76.03,74.83,76.29,41.51,93.1,53.4,83.86


In [91]:
model = DecisionTreeClassifier(random_state = 8848)
params = {"max_depth": [7, 8, 9, 10] }
X_train, X_test, y_train, y_test, vectorizer= prepare_TFIDF(df,
                                                            "body_processed",
                                                            "label")
grid_res = grid_search_CV(model, params, X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END ........................................max_depth=7; total time=   2.0s
[CV] END ........................................max_depth=7; total time=   3.0s
[CV] END ........................................max_depth=7; total time=   3.9s
[CV] END ........................................max_depth=7; total time=   2.8s
[CV] END ........................................max_depth=7; total time=   3.1s
[CV] END ........................................max_depth=8; total time=   2.8s
[CV] END ........................................max_depth=8; total time=   2.8s
[CV] END ........................................max_depth=8; total time=   3.3s
[CV] END ........................................max_depth=8; total time=   3.2s
[CV] END ........................................max_depth=8; total time=   2.9s
[CV] END ........................................max_depth=9; total time=   3.2s
[CV] END ........................................

In [92]:
model = DecisionTreeClassifier(random_state = 8848, max_depth=9)
model.fit(X_train, y_train)
df_res = update_results(df_res, model, X_train, y_train,'DT : Body Only [CV]')
df_res

Unnamed: 0,Accuracy,Pr_0,Pr_1,Re_0,Re_1,F1_0,F1_1
LR : Body Only,80.19,78.0,80.87,55.92,92.2,65.14,86.17
LR : Title Only,71.29,72.73,71.14,21.21,96.07,32.84,81.75
LR : Title + Body,80.19,78.0,80.87,55.92,92.2,65.14,86.17
DT : Body Only,83.53,99.77,80.27,50.34,99.94,66.92,89.04
DT : Title Only,97.24,95.08,98.33,96.65,97.53,95.86,97.93
DT : Title + Body,98.97,98.18,99.36,98.71,99.09,98.44,99.23
DT : Title Only [CV],67.14,96.69,67.07,0.72,99.99,1.44,80.28
DT : Title + Body [CV],76.03,74.83,76.29,41.51,93.1,53.4,83.86
DT : Body Only [CV],77.28,81.86,76.39,40.27,95.59,53.98,84.92


## Random Forest

In [95]:
from sklearn.ensemble import RandomForestClassifier

In [96]:
# Body only
X_train, X_test, y_train, y_test, vectorizer= prepare_TFIDF(df,
                                                            "body_processed",
                                                            "label")

model = RandomForestClassifier(random_state = 8848)
model.fit(X_train, y_train)
df_res = update_results(df_res, model, X_train, y_train,'RF : Body Only')
df_res

Unnamed: 0,Accuracy,Pr_0,Pr_1,Re_0,Re_1,F1_0,F1_1
LR : Body Only,80.19,78.0,80.87,55.92,92.2,65.14,86.17
LR : Title Only,71.29,72.73,71.14,21.21,96.07,32.84,81.75
LR : Title + Body,80.19,78.0,80.87,55.92,92.2,65.14,86.17
DT : Body Only,83.53,99.77,80.27,50.34,99.94,66.92,89.04
DT : Title Only,97.24,95.08,98.33,96.65,97.53,95.86,97.93
DT : Title + Body,98.97,98.18,99.36,98.71,99.09,98.44,99.23
DT : Title Only [CV],67.14,96.69,67.07,0.72,99.99,1.44,80.28
DT : Title + Body [CV],76.03,74.83,76.29,41.51,93.1,53.4,83.86
DT : Body Only [CV],77.28,81.86,76.39,40.27,95.59,53.98,84.92
RF : Body Only,83.53,99.83,80.27,50.32,99.96,66.91,89.04


## Random Forest : Grid Search CV

In [98]:
model = RandomForestClassifier(random_state = 8848)
params = {"max_depth": [7, 8], "n_estimators":[100, 200] }
X_train, X_test, y_train, y_test, vectorizer= prepare_TFIDF(df,
                                                            "body_processed",
                                                            "label")

grid_res = grid_search_CV(model, params, X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END ......................max_depth=7, n_estimators=100; total time=   4.8s
[CV] END ......................max_depth=7, n_estimators=100; total time=   4.6s
[CV] END ......................max_depth=7, n_estimators=100; total time=   4.6s
[CV] END ......................max_depth=7, n_estimators=100; total time=   4.5s
[CV] END ......................max_depth=7, n_estimators=100; total time=   4.5s
[CV] END ......................max_depth=7, n_estimators=200; total time=   9.2s
[CV] END ......................max_depth=7, n_estimators=200; total time=   9.2s
[CV] END ......................max_depth=7, n_estimators=200; total time=   9.6s
[CV] END ......................max_depth=7, n_estimators=200; total time=   9.2s
[CV] END ......................max_depth=7, n_estimators=200; total time=   9.1s
[CV] END ......................max_depth=8, n_estimators=100; total time=   5.2s
[CV] END ......................max_depth=8, n_est

In [99]:
model = RandomForestClassifier(random_state = 8848)
params = {"max_depth": [8, 9, 10], "n_estimators":[200, 300] }
X_train, X_test, y_train, y_test, vectorizer= prepare_TFIDF(df,
                                                            "body_processed",
                                                            "label")

grid_res = grid_search_CV(model, params, X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ......................max_depth=8, n_estimators=200; total time=  10.6s
[CV] END ......................max_depth=8, n_estimators=200; total time=  10.3s
[CV] END ......................max_depth=8, n_estimators=200; total time=  11.5s
[CV] END ......................max_depth=8, n_estimators=200; total time=  10.9s
[CV] END ......................max_depth=8, n_estimators=200; total time=  10.2s
[CV] END ......................max_depth=8, n_estimators=300; total time=  16.1s
[CV] END ......................max_depth=8, n_estimators=300; total time=  16.0s
[CV] END ......................max_depth=8, n_estimators=300; total time=  15.5s
[CV] END ......................max_depth=8, n_estimators=300; total time=  15.6s
[CV] END ......................max_depth=8, n_estimators=300; total time=  15.8s
[CV] END ......................max_depth=9, n_estimators=200; total time=  11.6s
[CV] END ......................max_depth=9, n_est

In [100]:
# Body only
model = RandomForestClassifier(random_state = 8848, max_depth= 10, n_estimators=300)
model.fit(X_train, y_train)
df_res = update_results(df_res, model, X_train, y_train,'RF : Body Only [CV]')
df_res

Unnamed: 0,Accuracy,Pr_0,Pr_1,Re_0,Re_1,F1_0,F1_1
LR : Body Only,80.19,78.0,80.87,55.92,92.2,65.14,86.17
LR : Title Only,71.29,72.73,71.14,21.21,96.07,32.84,81.75
LR : Title + Body,80.19,78.0,80.87,55.92,92.2,65.14,86.17
DT : Body Only,83.53,99.77,80.27,50.34,99.94,66.92,89.04
DT : Title Only,97.24,95.08,98.33,96.65,97.53,95.86,97.93
DT : Title + Body,98.97,98.18,99.36,98.71,99.09,98.44,99.23
DT : Title Only [CV],67.14,96.69,67.07,0.72,99.99,1.44,80.28
DT : Title + Body [CV],76.03,74.83,76.29,41.51,93.1,53.4,83.86
DT : Body Only [CV],77.28,81.86,76.39,40.27,95.59,53.98,84.92
RF : Body Only,83.53,99.83,80.27,50.32,99.96,66.91,89.04


**Few words:**
- The GridSearchCV takes the default score as *accuracy* I guess.
- We may have to use a differnt scoring as well.