# Model building with `Score` as target variable

In [1]:
import numpy as np
import pandas as pd
import pylab as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the pre-processed posts; every missing cell becomes the literal
# placeholder string "no_text".
raw_posts_path = "df_processed.csv"
df_orig = pd.read_csv(raw_posts_path)
df_orig = df_orig.replace(np.nan, "no_text")

In [3]:
# Keep only the subreddit, the raw score and the text feature columns used for
# modelling.  `.copy()` makes df_ an independent frame: a later cell adds a
# column to it, and without the copy that write targets a slice of df_orig
# (the resulting SettingWithCopyWarning is hidden by the global warnings filter).
df_ = df_orig[["subreddit", "score",
          'body_processed','title_processed', 'author_processed',
          'body_and_title', 'body_and_author', 'title_and_author',
          'body_title_and_author'
         ]].copy()

df_.head(2)

Unnamed: 0,subreddit,score,body_processed,title_processed,author_processed,body_and_title,body_and_author,title_and_author,body_title_and_author
0,finance,1,deleted,top seven companies added trillion value,obrocheetah,deleted top seven companies added trillion value,deleted obrocheetah,top seven companies added trillion value obroc...,deleted top seven companies added trillion val...
1,finance,1,no_text,mogo establishes atm equity program mogo finan...,fintechinshorts,no_text mogo establishes atm equity program mo...,no_text fintechinshorts,mogo establishes atm equity program mogo finan...,no_text mogo establishes atm equity program mo...


In [5]:
# Count posts at/above versus strictly below the mean score.
score_values = np.array(df_['score'].values)
mean_score = df_['score'].mean()
n_at_or_above_mean = (score_values >= mean_score).sum()
n_below_mean = (score_values < mean_score).sum()
print ("Posts with score bigger than average score:", n_at_or_above_mean )
print ("Posts with score smaller than average score:", n_below_mean )

Posts with score bigger than average score: 26702
Posts with score smaller than average score: 514736


# Binarize the Score

- Looking at the score distribution, a threshold barely above 0 gives us roughly a 60/40 split.
- So, convert the score into a binary label: 1 if the score is less than or equal to 1, 0 if it is bigger than 1.
- Name that column `score_label`.

In [6]:
# Binarize the score: 1 when score <= 1, else 0.
# Vectorized comparison instead of a per-row Python lambda, and `assign`
# returns a new frame instead of writing a column into a frame that was sliced
# out of df_orig (chained-assignment warning), so the cell is safe to re-run.
df_ = df_.assign(score_label=(df_["score"] <= 1.0).astype(int))
df_.head(2)

Unnamed: 0,subreddit,score,body_processed,title_processed,author_processed,body_and_title,body_and_author,title_and_author,body_title_and_author,score_label
0,finance,1,deleted,top seven companies added trillion value,obrocheetah,deleted top seven companies added trillion value,deleted obrocheetah,top seven companies added trillion value obroc...,deleted top seven companies added trillion val...,1
1,finance,1,no_text,mogo establishes atm equity program mogo finan...,fintechinshorts,no_text mogo establishes atm equity program mo...,no_text fintechinshorts,mogo establishes atm equity program mogo finan...,no_text mogo establishes atm equity program mo...,1


## Saving the test data

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the rows (stratified on the binary label) as df_test_;
# the remaining 80 % (df) is what the modelling helpers below work on.
holdout_labels = df_['score_label']
df, df_test_ = train_test_split(
    df_,
    test_size=0.2,
    stratify=holdout_labels,
    random_state=8848,
)


In [10]:
# Share of each class (in percent) within the modelling frame df
count_0_1 = df.groupby("score_label")["subreddit"].count().values
for class_id in (0, 1):
    print (f"Class {class_id} {np.round(100*count_0_1[class_id]/np.sum(count_0_1), 2)} %")


Class 0 40.81 %
Class 1 59.19 %


## TFIDF Vectorizer

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

def prepare_TFIDF(df, feature_col, target_col):
    """Split `df` into train/test parts and build TF-IDF features for one text column.

    Parameters
    ----------
    df : DataFrame containing at least `feature_col` and `target_col`.
    feature_col : name of the text column to vectorize.
    target_col : name of the label column; also used to stratify the split.

    Returns
    -------
    X_train, X_test : TF-IDF features of the train / test rows.
    y_train, y_test : label arrays of the train / test rows.
    vectorizer : the TfidfVectorizer fitted on the training text.
    """
    df = df[[feature_col, target_col]]
    df_train, df_test = train_test_split(df, test_size=0.2,
                                         stratify=df[target_col],
                                         random_state = 8848)
    vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95)

    # Fit on the training text only and keep the matrix produced by that pass.
    # Previously the fit_transform result was discarded and the same text was
    # vectorized a second time with transform().
    X_train = vectorizer.fit_transform(df_train[feature_col].values)
    X_test  = vectorizer.transform(df_test[feature_col].values)

    y_train = df_train[target_col].values
    y_test  = df_test[target_col].values

    return X_train, X_test, y_train, y_test, vectorizer

## Logistic Regression models

In [38]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

In [43]:
# One accuracy column plus precision / recall / F1 for class 0 and class 1.
columns = ["Accuracy"] + [
    f"{metric}_{label}" for metric in ("Pr", "Re", "F1") for label in (0, 1)
]
# Empty results table; update_results() adds one row per model.
df_res = pd.DataFrame([], columns=columns)

In [45]:
def update_results(df_res, model, X, y, model_name):
    """Score `model` on (X, y) and store the metrics in row `model_name` of `df_res`.

    Accuracy and the per-class precision / recall / F1 are written as
    percentages rounded to two decimals.  `df_res` is modified in place and
    also returned.
    """
    y_true = y.ravel()
    y_hat = model.predict(X).ravel()

    accuracy = accuracy_score(y_true, y_hat)
    precision, recall, fscore, _ = precision_recall_fscore_support(y_true, y_hat)

    df_res.loc[model_name, "Accuracy"] = np.round(100*accuracy, 2)
    per_class_metrics = (("Pr", precision), ("Re", recall), ("F1", fscore))
    for prefix, values in per_class_metrics:
        df_res.loc[model_name, [f"{prefix}_0", f"{prefix}_1"]] = np.round(100*values, 2)
    return df_res

In [50]:
def logistic_regression(df=df, xcol="body_processed", ycol="score_label"):
    """Fit a logistic regression on TF-IDF features of the text column `xcol`.

    Note that the default for `df` is whatever the global `df` was when this
    cell was executed.

    Returns the fitted model followed by X_train, X_test, y_train, y_test as
    produced by prepare_TFIDF.
    """
    X_train, X_test, y_train, y_test, _ = prepare_TFIDF(df, xcol, ycol)
    model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    return model, X_train, X_test, y_train, y_test
    

## Logistic Regression with Body only

In [51]:
# Fit on the post body alone and record the test-split metrics.
lr_fit = logistic_regression(xcol="body_processed")
model, X_train, X_test, y_train, y_test = lr_fit
df_res = update_results(df_res, model, X_test, y_test, 'LR : Body Only')
df_res

Unnamed: 0_level_0,Accuracy,Pr_0,Pr_1,Re_0,Re_1,F1_0,F1_1
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
LR : Body Only,71.84,65.26,76.51,66.31,75.66,65.78,76.08
LR : Title Only,68.76,67.75,69.14,44.77,85.3,53.92,76.37
LR : Title + Body,75.52,73.54,76.58,62.52,84.49,67.58,80.34
LR : Title + Body + Author,81.21,80.59,81.56,71.08,88.19,75.54,84.75
SVM: Body only [CV],41.02,38.55,50.5,74.96,17.62,50.92,26.12
SVM: Title + Body [CV],59.19,49.84,59.26,0.9,99.38,1.77,74.24


In [52]:
# Confusion matrix of the most recently fitted model on its *training* split.
train_pred = model.predict(X_train)
conf_matrix = confusion_matrix(y_train, train_pred)
conf_matrix

array([[ 93647,  47775],
       [ 50063, 155035]])

## Logistic Regression with Title only

In [53]:
# Fit on the post title alone and record the test-split metrics.
lr_fit = logistic_regression(xcol="title_processed")
model, X_train, X_test, y_train, y_test = lr_fit
df_res = update_results(df_res, model, X_test, y_test, 'LR : Title Only')
df_res

Unnamed: 0_level_0,Accuracy,Pr_0,Pr_1,Re_0,Re_1,F1_0,F1_1
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
LR : Body Only,71.84,65.26,76.51,66.31,75.66,65.78,76.08
LR : Title Only,65.67,62.17,66.95,40.6,82.96,49.12,74.1
LR : Title + Body,75.52,73.54,76.58,62.52,84.49,67.58,80.34
LR : Title + Body + Author,81.21,80.59,81.56,71.08,88.19,75.54,84.75
SVM: Body only [CV],41.02,38.55,50.5,74.96,17.62,50.92,26.12
SVM: Title + Body [CV],59.19,49.84,59.26,0.9,99.38,1.77,74.24


## Logistic Regression with Title + Body

In [54]:
# Fit on the concatenated body + title text and record the test-split metrics.
lr_fit = logistic_regression(xcol="body_and_title")
model, X_train, X_test, y_train, y_test = lr_fit
df_res = update_results(df_res, model, X_test, y_test, 'LR : Title + Body')
df_res

Unnamed: 0_level_0,Accuracy,Pr_0,Pr_1,Re_0,Re_1,F1_0,F1_1
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
LR : Body Only,71.84,65.26,76.51,66.31,75.66,65.78,76.08
LR : Title Only,65.67,62.17,66.95,40.6,82.96,49.12,74.1
LR : Title + Body,73.07,69.65,74.94,60.29,81.88,64.63,78.26
LR : Title + Body + Author,81.21,80.59,81.56,71.08,88.19,75.54,84.75
SVM: Body only [CV],41.02,38.55,50.5,74.96,17.62,50.92,26.12
SVM: Title + Body [CV],59.19,49.84,59.26,0.9,99.38,1.77,74.24


## Logistic Regression with Title + Body + author

In [55]:
# Fit on body + title + author text and record the test-split metrics.
lr_fit = logistic_regression(xcol="body_title_and_author")
model, X_train, X_test, y_train, y_test = lr_fit
df_res = update_results(df_res, model, X_test, y_test, 'LR : Title + Body + Author')
df_res

Unnamed: 0_level_0,Accuracy,Pr_0,Pr_1,Re_0,Re_1,F1_0,F1_1
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
LR : Body Only,71.84,65.26,76.51,66.31,75.66,65.78,76.08
LR : Title Only,65.67,62.17,66.95,40.6,82.96,49.12,74.1
LR : Title + Body,73.07,69.65,74.94,60.29,81.88,64.63,78.26
LR : Title + Body + Author,75.9,73.31,77.34,64.38,83.84,68.56,80.46
SVM: Body only [CV],41.02,38.55,50.5,74.96,17.62,50.92,26.12
SVM: Title + Body [CV],59.19,49.84,59.26,0.9,99.38,1.77,74.24


In [56]:
# save LR results
# Keep only the logistic-regression rows.  On a top-to-bottom run the SVM rows
# have not been added yet at this point, so missing labels must not raise.
dftmp = df_res.drop(index=["SVM: Body only [CV]", "SVM: Title + Body [CV]"],
                    errors="ignore")

In [59]:
# Write the index (model names) under the header "Method" so the file can be
# reloaded with set_index("Method") instead of yielding an "Unnamed: 0" column.
dftmp.to_csv("df_res_LR.csv", index_label="Method")

## Logistic Regression Results
| Method | Accuracy | Pr_0 | Pr_1 | Re_0 | Re_1 | F1_0 | F1_1 | 
| --- | --- | --- | --- | --- | --- | --- | --- |
| LR : Body Only | 71.82 | 65.29 | 76.43 | 66.11 | 75.77 | 65.7 | 76.09 | 
| LR : Title Only | 68.76 | 67.75 | 69.14 | 44.77 | 85.3 | 53.92 | 76.37 | 
| LR : Title + Body | 75.52 | 73.54 | 76.58 | 62.52 | 84.49 | 67.58 | 80.34 | 
| LR : Title + Body + Author | 81.21 | 80.59 | 81.56 | 71.08 | 88.19 | 75.54 | 84.75 | 

## Support Vector Machine

In [44]:
# load the LR results
#df_res = pd.read_csv('df_res_LR.csv')
#df_res.set_index("Method", drop=True, inplace=True)
#df_res

Unnamed: 0_level_0,Accuracy,Pr_0,Pr_1,Re_0,Re_1,F1_0,F1_1
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
LR : Body Only,71.82,65.29,76.43,66.11,75.77,65.7,76.09
LR : Title Only,68.76,67.75,69.14,44.77,85.3,53.92,76.37
LR : Title + Body,75.52,73.54,76.58,62.52,84.49,67.58,80.34
LR : Title + Body + Author,81.21,80.59,81.56,71.08,88.19,75.54,84.75


In [25]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import StratifiedKFold

In [15]:
# TF-IDF features of the post body for the SVM experiments below.
xcol = "body_processed"
ycol = "score_label"
X_train, X_test, y_train, y_test, vectorizer = prepare_TFIDF(df, xcol, ycol)



In [26]:
def grid_search_CV_SVM(params, df=df, xcol="body_processed", ycol="score_label", N_cv=5):
    """Grid-search an RBF-kernel SVC over `params` on TF-IDF features of `xcol`.

    Uses a shuffled, seeded StratifiedKFold with `N_cv` splits on the training
    part returned by prepare_TFIDF.  The SVC is built with max_iter=1000.
    Prints the best CV score and parameters.

    Returns the GridSearchCV object, X_train, X_test, y_train, y_test and the
    object returned by fitting the grid search.
    """
    X_train, X_test, y_train, y_test, _ = prepare_TFIDF(df, xcol, ycol)

    folds = StratifiedKFold(n_splits=N_cv, shuffle=True, random_state=8848)
    svc = SVC(kernel='rbf', degree=3, max_iter=1000)
    model_grid = GridSearchCV(estimator=svc, param_grid=params, cv=folds, verbose=3)

    grid_res = model_grid.fit(X_train, y_train)
    print ( "Best Score:", grid_res.best_score_, grid_res.best_params_)
    return model_grid, X_train, X_test, y_train, y_test, grid_res

In [27]:
# Candidate values of C for the body-only SVM grid search.
params = {"C": [5, 10, 15]}

svm_search = grid_search_CV_SVM(params, xcol="body_processed")
model_grid, X_train, X_test, y_train, y_test, grid_res = svm_search

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END ...............................C=5;, score=0.688 total time=  36.6s
[CV 2/5] END ...............................C=5;, score=0.686 total time=  30.8s
[CV 3/5] END ...............................C=5;, score=0.665 total time=  27.2s
[CV 4/5] END ...............................C=5;, score=0.660 total time=  30.3s
[CV 5/5] END ...............................C=5;, score=0.690 total time=  25.5s
[CV 1/5] END ..............................C=10;, score=0.686 total time=  27.9s
[CV 2/5] END ..............................C=10;, score=0.377 total time=  42.8s
[CV 3/5] END ..............................C=10;, score=0.687 total time=  24.7s
[CV 4/5] END ..............................C=10;, score=0.701 total time=  21.7s
[CV 5/5] END ..............................C=10;, score=0.654 total time=  22.9s
[CV 1/5] END ..............................C=15;, score=0.652 total time=  34.6s
[CV 2/5] END ..............................C=15;,

In [46]:
# Record the test-split metrics of the body-only SVM grid search.
df_res = update_results(
    df_res, model_grid, X_test, y_test, 'SVM: Body only [CV]'
)
df_res

Unnamed: 0_level_0,Accuracy,Pr_0,Pr_1,Re_0,Re_1,F1_0,F1_1
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
LR : Body Only,71.82,65.29,76.43,66.11,75.77,65.7,76.09
LR : Title Only,68.76,67.75,69.14,44.77,85.3,53.92,76.37
LR : Title + Body,75.52,73.54,76.58,62.52,84.49,67.58,80.34
LR : Title + Body + Author,81.21,80.59,81.56,71.08,88.19,75.54,84.75
SVM: Body only [CV],41.02,38.55,50.5,74.96,17.62,50.92,26.12


In [47]:
# Grid search on body + title text, then record the test-split metrics.
params = {"C": [5, 10]}

svm_search = grid_search_CV_SVM(params, xcol="body_and_title")
model_grid, X_train, X_test, y_train, y_test, grid_res = svm_search

df_res = update_results(
    df_res, model_grid, X_test, y_test, 'SVM: Title + Body [CV]'
)
df_res

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5] END ...............................C=5;, score=0.517 total time=  38.6s
[CV 2/5] END ...............................C=5;, score=0.625 total time=  26.1s
[CV 3/5] END ...............................C=5;, score=0.591 total time=  34.3s
[CV 4/5] END ...............................C=5;, score=0.489 total time=  36.5s
[CV 5/5] END ...............................C=5;, score=0.595 total time=  33.4s
[CV 1/5] END ..............................C=10;, score=0.454 total time=  32.3s
[CV 2/5] END ..............................C=10;, score=0.495 total time=  33.7s
[CV 3/5] END ..............................C=10;, score=0.467 total time=  37.0s
[CV 4/5] END ..............................C=10;, score=0.411 total time=  31.4s
[CV 5/5] END ..............................C=10;, score=0.432 total time=  28.2s
Best Score: 0.5632113586517373 {'C': 5}


Unnamed: 0_level_0,Accuracy,Pr_0,Pr_1,Re_0,Re_1,F1_0,F1_1
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
LR : Body Only,71.82,65.29,76.43,66.11,75.77,65.7,76.09
LR : Title Only,68.76,67.75,69.14,44.77,85.3,53.92,76.37
LR : Title + Body,75.52,73.54,76.58,62.52,84.49,67.58,80.34
LR : Title + Body + Author,81.21,80.59,81.56,71.08,88.19,75.54,84.75
SVM: Body only [CV],41.02,38.55,50.5,74.96,17.62,50.92,26.12
SVM: Title + Body [CV],59.19,49.84,59.26,0.9,99.38,1.77,74.24


In [48]:
# Test-split predictions of the grid-searched SVM.
pred = model_grid.predict(X_test)

In [49]:
# Number of rows predicted as class 1 versus total number of predictions.
(sum(pred), len(pred))

(85992, 86630)

In [22]:
# Single RBF SVC with C=10, scored on the test split; also print how many
# rows are predicted as class 1 and how many truly are class 1.
model_svm = SVC(C=10.0, kernel='rbf', degree=3, gamma='scale', max_iter=1000)
model_svm.fit(X_train, y_train)
pred = model_svm.predict(X_test)
test_accuracy = accuracy_score(y_test, pred)
print ( test_accuracy, pred.sum(), y_test.sum() )

0.6852937781369041 54784 51275


In [23]:
# Accuracy of the same SVC on its training split.
train_accuracy = accuracy_score(y_train, model_svm.predict(X_train))
print ( train_accuracy )

0.6874379545192196


In [14]:
def train_SVM(df=df, xcol="body_processed", ycol="score_label"):
    """Fit an RBF-kernel SVC (C=1.0, max_iter=500) on TF-IDF features of `xcol`.

    Returns the fitted model followed by X_train, X_test, y_train, y_test as
    produced by prepare_TFIDF.
    """
    X_train, X_test, y_train, y_test, _ = prepare_TFIDF(df, xcol, ycol)
    model_svm = SVC(C=1.0, kernel='rbf', degree=3, gamma='scale', max_iter=500)
    model_svm = model_svm.fit(X_train, y_train)
    return model_svm, X_train, X_test, y_train, y_test


In [15]:
# Train the default SVM on the post body.
svm_fit = train_SVM(df=df, xcol="body_processed")
model_svm, X_train, X_test, y_train, y_test = svm_fit

ValueError: np.nan is an invalid document, expected byte or unicode string.

In [None]:
# fit() needs the training data; calling it with no arguments raises TypeError.
# Use the TF-IDF training split currently held in the notebook globals.
model_svm = SVC(C=1.0, kernel='rbf', degree=3, gamma='scale', max_iter=500)
model_svm.fit(X_train, y_train)

In [None]:
# Classifier - Algorithm - SVM
# NOTE(review): this cell refers to `svm`, `Train_X_Tfidf`, `Train_Y`,
# `Test_X_Tfidf` and `Test_Y`, none of which are defined in the visible part of
# this notebook (only `SVC` is imported, and the splits are named X_train etc.).
# It looks like a snippet pasted from elsewhere -- confirm before running.
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)


# Decision Trees

In [23]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

In [24]:
def decision_tree(df=df, xcol="body_processed", ycol="score_label"):
    """Fit a depth-2 decision tree (seeded) on TF-IDF features of `xcol`.

    Returns the fitted model followed by X_train, X_test, y_train, y_test as
    produced by prepare_TFIDF.
    """
    X_train, X_test, y_train, y_test, _ = prepare_TFIDF(df, xcol, ycol)
    model = DecisionTreeClassifier(max_depth=2, random_state=8848)
    model = model.fit(X_train, y_train)
    return model, X_train, X_test, y_train, y_test

## Decision Tree with Body + Title + Author

In [25]:
# Depth-2 tree on body + title + author text.
dt_fit = decision_tree(xcol="body_title_and_author")
model, X_train, X_test, y_train, y_test = dt_fit

In [None]:
model, X_train, X_test, y_train, y_test = decision_tree(xcol="body_title_and_author")
# Score on the held-out split: the LR and SVM rows of df_res are test-split
# metrics, so recording training metrics here would make rows incomparable.
df_res = update_results(df_res, model, X_test, y_test, 'DT : Title + Body + Author')
df_res

## Decision Tree : Cross Validation

In [None]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import StratifiedKFold

In [None]:
def grid_search_CV(params, df=df, xcol="body_processed", ycol="score_label", N_cv=5):
    """Grid-search a DecisionTreeClassifier over `params` on TF-IDF features of `xcol`.

    The default target is "score_label", matching the other helpers in this
    notebook; the previous default "label" is not a column of the modelling
    frame and made every call that relied on it fail in prepare_TFIDF.

    Parameters
    ----------
    params : parameter grid passed to GridSearchCV.
    df : frame holding the text and target columns.
    xcol : text column to vectorize.
    ycol : target column.
    N_cv : number of stratified folds.

    Returns
    -------
    model_grid, X_train, X_test, y_train, y_test, grid_res
    """
    X_train, X_test, y_train, y_test, vectorizer = prepare_TFIDF(df,
                                                                xcol,
                                                                ycol)

    model_grid = GridSearchCV(estimator = DecisionTreeClassifier(random_state = 8848),
                              param_grid = params,
                              cv = StratifiedKFold(n_splits=N_cv,
                                                   random_state=8848,
                                                   shuffle=True),
                              verbose=3)
    grid_res = model_grid.fit(X_train, y_train)
    print ( "Best Score:", grid_res.best_score_, grid_res.best_params_)
    return model_grid, X_train, X_test, y_train, y_test, grid_res

## Decision Tree : CV, Body Only

In [None]:
params = {"max_depth": [4, 5, 6] }
# Name the target explicitly: the frame has a "score_label" column, not "label".
model, X_train, X_test, y_train, y_test, grid_res = grid_search_CV(params,
                                                                   xcol="body_processed",
                                                                   ycol="score_label")
# Record held-out metrics so this row is comparable with the LR / SVM rows.
df_res = update_results(df_res, model, X_test, y_test, 'DT : Body Only [CV]')
df_res

In [None]:
params = {"max_depth": [4, 5, 6, 10, 20] }
# Name the target explicitly: the frame has a "score_label" column, not "label".
model, X_train, X_test, y_train, y_test, grid_res = grid_search_CV(params,
                                                                   xcol="title_processed",
                                                                   ycol="score_label")
# Record held-out metrics so this row is comparable with the LR / SVM rows.
df_res = update_results(df_res, model, X_test, y_test, 'DT : Title Only [CV]')
df_res

In [None]:
# Same search with deeper trees.
params = {"max_depth": [10, 15, 20, 25] }
# Name the target explicitly: the frame has a "score_label" column, not "label".
model, X_train, X_test, y_train, y_test, grid_res = grid_search_CV(params,
                                                                   xcol="title_processed",
                                                                   ycol="score_label")


# Decision Trees

In [None]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Body only
# The binarized target in this notebook is "score_label"; there is no "label" column.
X_train, X_test, y_train, y_test, vectorizer= prepare_TFIDF(df,
                                                            "body_processed",
                                                            "score_label")

model = DecisionTreeClassifier(random_state = 8848)
model.fit(X_train, y_train)
# Record held-out metrics so this row is comparable with the LR / SVM rows.
df_res = update_results(df_res, model, X_test, y_test,'DT : Body Only')
df_res

In [None]:
# Title only
# The binarized target in this notebook is "score_label"; there is no "label" column.
X_train, X_test, y_train, y_test, vectorizer= prepare_TFIDF(df,
                                                            "title_processed",
                                                            "score_label")

model = DecisionTreeClassifier(random_state = 8848)
model.fit(X_train, y_train)
# Record held-out metrics so this row is comparable with the LR / SVM rows.
df_res = update_results(df_res, model, X_test, y_test,'DT : Title Only')
df_res

In [None]:
# Body + Title only
# The binarized target in this notebook is "score_label"; there is no "label" column.
X_train, X_test, y_train, y_test, vectorizer= prepare_TFIDF(df,
                                                            "body_and_title",
                                                            "score_label")

model = DecisionTreeClassifier(random_state = 8848)
model.fit(X_train, y_train)
# Record held-out metrics so this row is comparable with the LR / SVM rows.
df_res = update_results(df_res, model, X_test, y_test,'DT : Title + Body')
df_res

# Decision Tree: Grid Search CV

In [None]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import StratifiedKFold


In [None]:
def grid_search_CV(model, params, X_train, y_train):
    """Run a 5-fold stratified grid search of `model` over `params`.

    NOTE: this definition replaces the earlier `grid_search_CV(params, df, ...)`
    helper of the same name once this cell has run.

    Parameters
    ----------
    model : estimator to tune.
    params : parameter grid passed to GridSearchCV.
    X_train, y_train : training features and labels.

    Returns
    -------
    The object returned by fitting the grid search.  The best CV score and
    parameters are also printed.
    """
    model_grid = GridSearchCV(estimator = model,
                              param_grid = params,
                              cv = StratifiedKFold(n_splits=5, random_state=8848, shuffle=True),
                              verbose=2)
    grid_res = model_grid.fit(X_train, y_train)
    # Label fixed: it used to read "Besy Score:".
    print ( "Best Score:", grid_res.best_score_, grid_res.best_params_)
    return grid_res

In [None]:
model = DecisionTreeClassifier(random_state = 8848)
params = {"max_depth": [5, 6, 7, 8] }
# The binarized target in this notebook is "score_label"; there is no "label" column.
X_train, X_test, y_train, y_test, vectorizer= prepare_TFIDF(df,
                                                            "title_processed",
                                                            "score_label")

grid_res = grid_search_CV(model, params, X_train, y_train)

In [None]:
# Refit with the depth chosen from the grid search above.
model = DecisionTreeClassifier(random_state = 8848, max_depth=6)
model.fit(X_train, y_train)
# Record held-out metrics so this row is comparable with the LR / SVM rows.
df_res = update_results(df_res, model, X_test, y_test,'DT : Title Only [CV]')
df_res

## Decision Tree: Grid Search CV Title + Body

In [None]:
model = DecisionTreeClassifier(random_state = 8848)
params = {"max_depth": [7, 8, 9, 10] }
# The binarized target in this notebook is "score_label"; there is no "label" column.
X_train, X_test, y_train, y_test, vectorizer= prepare_TFIDF(df,
                                                            "body_and_title",
                                                            "score_label")

grid_res = grid_search_CV(model, params, X_train, y_train)

In [None]:
# Refit with the depth chosen from the grid search above.
model = DecisionTreeClassifier(random_state = 8848, max_depth=9)
model.fit(X_train, y_train)
# Record held-out metrics so this row is comparable with the LR / SVM rows.
df_res = update_results(df_res, model, X_test, y_test,'DT : Title + Body [CV]')
df_res

In [None]:
model = DecisionTreeClassifier(random_state = 8848)
params = {"max_depth": [7, 8, 9, 10] }
# The binarized target in this notebook is "score_label"; there is no "label" column.
X_train, X_test, y_train, y_test, vectorizer= prepare_TFIDF(df,
                                                            "body_processed",
                                                            "score_label")
grid_res = grid_search_CV(model, params, X_train, y_train)

In [None]:
# Refit with the depth chosen from the grid search above.
model = DecisionTreeClassifier(random_state = 8848, max_depth=9)
model.fit(X_train, y_train)
# Record held-out metrics so this row is comparable with the LR / SVM rows.
df_res = update_results(df_res, model, X_test, y_test,'DT : Body Only [CV]')
df_res

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Body only
# The binarized target in this notebook is "score_label"; there is no "label" column.
X_train, X_test, y_train, y_test, vectorizer= prepare_TFIDF(df,
                                                            "body_processed",
                                                            "score_label")

model = RandomForestClassifier(random_state = 8848)
model.fit(X_train, y_train)
# Record held-out metrics so this row is comparable with the LR / SVM rows.
df_res = update_results(df_res, model, X_test, y_test,'RF : Body Only')
df_res

## Random Forest : Grid Search CV

In [None]:
model = RandomForestClassifier(random_state = 8848)
params = {"max_depth": [7, 8], "n_estimators":[100, 200] }
# The binarized target in this notebook is "score_label"; there is no "label" column.
X_train, X_test, y_train, y_test, vectorizer= prepare_TFIDF(df,
                                                            "body_processed",
                                                            "score_label")

grid_res = grid_search_CV(model, params, X_train, y_train)

In [None]:
model = RandomForestClassifier(random_state = 8848)
params = {"max_depth": [8, 9, 10], "n_estimators":[200, 300] }
# The binarized target in this notebook is "score_label"; there is no "label" column.
X_train, X_test, y_train, y_test, vectorizer= prepare_TFIDF(df,
                                                            "body_processed",
                                                            "score_label")

grid_res = grid_search_CV(model, params, X_train, y_train)

In [None]:
# Body only
# Refit with the depth / number of trees chosen from the grid search above.
model = RandomForestClassifier(random_state = 8848, max_depth= 10, n_estimators=300)
model.fit(X_train, y_train)
# Record held-out metrics so this row is comparable with the LR / SVM rows.
df_res = update_results(df_res, model, X_test, y_test,'RF : Body Only [CV]')
df_res

**A few words:**
- When no `scoring` argument is given, `GridSearchCV` uses the estimator's default `score` method, which is *accuracy* for these classifiers.
- We may have to use a different scoring metric as well.