# Train model to deploy
We chose `RandomForestClassifier` as the final model because of its strong F1-score and because it is less expensive than XGBoost.

In [13]:
import pandas as pd
import tqdm 
import numpy as np 

import nltk 
import re
import gensim

import altair as alt
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from praw.models import MoreComments

from sklearn.pipeline import Pipeline, FeatureUnion
import skipthoughts
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, f1_score


In [2]:
# Read the CSV into a DataFrame and preview its first five rows.
df_india = pd.read_csv(filepath_or_buffer='clean_reddit_india.csv')
df_india.head(n=5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,title,score,id,url,num_comments,body,created,Comment,Flair,new_feature
0,0,0,"['untouchability', 'even', 'quarantine', 'neve...",58,fzvwz8,['httpswwwtelegraphindiacomindiacoronavirusout...,6,['nan'],1586723000.0,"['let', 'feel', 'hungry', 'couple', 'day', 'ma...",Scheduled,"['untouchability', 'even', 'quarantine', 'neve..."
1,1,1,"['delhi', 'govt', 'source', 'name', 'cm', 'arv...",304,f7ogd8,['httpstwittercomanistatus1231093900518932480s...,30,['nan'],1582380000.0,"['beyond', 'petty', 'inclusion', 'delhi', 'gov...",Scheduled,"['delhi', 'govt', 'source', 'name', 'cm', 'arv..."
2,2,2,"['delhi', 'ap', 'singh', 'advocate', '2012', '...",16,flgvah,['httpstwittercomanistatus1240731289075871744s...,19,['nan'],1584678000.0,"['hunch', 'guy', 'trying', 'expose', 'loophole...",Scheduled,"['delhi', 'ap', 'singh', 'advocate', '2012', '..."
3,3,3,"['100', 'quota', 'tribal', 'teacher', 'school'...",18,g698qu,['httpswwwthehinducomnewsnationalno100quotafor...,2,['nan'],1587618000.0,"['sc', 'point', '100', 'quota', 'ok', 'thats',...",Scheduled,"['100', 'quota', 'tribal', 'teacher', 'school'..."
4,4,4,"['supreme', 'court', '’', 'verdict', 'scst', '...",105,f1o839,['httpsscrollinarticle952687whythesupremecourt...,47,['nan'],1581358000.0,"['muslim', 'reservation', 'two', 'distraction'...",Scheduled,"['supreme', 'court', '’', 'verdict', 'scst', '..."


In [3]:
print(df_india.columns)
# Drop the leftover 'Unnamed: 0' column and expose 'Unnamed: 0.1' as 'Index'.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the column is already gone no
# longer raises KeyError. rename() silently skips labels that are absent.
df_india = (
    df_india
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .rename(columns={'Unnamed: 0.1': 'Index'})
)


Index(['Unnamed: 0', 'Unnamed: 0.1', 'title', 'score', 'id', 'url',
       'num_comments', 'body', 'created', 'Comment', 'Flair', 'new_feature'],
      dtype='object')


In [4]:
from sklearn import preprocessing

# Encode each flair string as an integer class id in a new 'Flair_cat' column.
le = preprocessing.LabelEncoder()
df_india['Flair_cat'] = le.fit_transform(df_india['Flair'])

# Flair name -> encoded id, built from the classes the encoder learned.
le_name_mapping = {
    flair_name: flair_code
    for flair_name, flair_code in zip(le.classes_, le.transform(le.classes_))
}
print(le_name_mapping)

# Persist the mapping so the encoded predictions can be translated back later.
with open('pickles/le_dict.pkl', 'wb') as f:
    pickle.dump(le_name_mapping, f)


{'AskIndia': 0, 'Business/Finance': 1, 'CAA-NRC-NPR': 2, 'Coronavirus': 3, 'Food': 4, 'Non-Political': 5, 'Photography': 6, 'Politics': 7, 'Scheduled': 8, 'Science/Technology': 9, 'Sports': 10}


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams: sublinear term-frequency scaling,
# L2-normalised rows, English stop words removed, and terms that occur in
# fewer than 5 documents discarded.
tfidf = TfidfVectorizer(
    encoding='latin-1',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5,
    sublinear_tf=True,
    norm='l2',
)

In [6]:
# Fit the vectorizer on the 'new_feature' text and materialise the resulting
# document-term matrix as a dense array; the targets are the encoded flairs.
features_train = (
    tfidf
    .fit_transform(df_india['new_feature'])
    .toarray()
)
labels_train = df_india['Flair_cat']
print(features_train.shape)

(2107, 3830)


In [7]:
from sklearn.feature_selection import chi2
import numpy as np

# For every flair, rank the TF-IDF terms by their chi-squared statistic
# against a one-vs-rest indicator of that flair and print the N highest
# scoring unigrams and bigrams.

column_values = df_india['Flair_cat'].values
flair_list = pd.unique(column_values)

N = 5  # how many top-ranked unigrams / bigrams to print per flair

# TfidfVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when it exists and fall back to
# the old accessor on earlier releases. The vocabulary does not depend on the
# flair, so it is looked up once instead of on every loop iteration.
if hasattr(tfidf, 'get_feature_names_out'):
    all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
    all_feature_names = np.asarray(tfidf.get_feature_names())

for flair_cat in sorted(flair_list):
    print(flair_cat)
    # chi2 returns (statistics, p-values); only the statistics are used.
    features_chi2 = chi2(features_train, labels_train == flair_cat)
    # Ascending sort, so the strongest terms sit at the end of the array.
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    unigrams = [word for word in feature_names if len(word.split(' ')) == 1]
    bigrams = [word for word in feature_names if len(word.split(' ')) == 2]
    # Rows of this flair, used only to recover its human-readable name.
    flair = df_india.loc[df_india['Flair_cat'] == flair_cat]
    print(" '{}':".format(flair['Flair'].iloc[0]))
    print("Most correlated unigrams: \n{}".format('\n'.join(unigrams[-N:])))
    print("Most correlated bigrams: \n{}".format('\n'.join(bigrams[-N:])))

0
 'AskIndia':
Most correlated unigrams: 
flat
subreddit
rid
aunty
askindia
Most correlated bigrams: 
doesnt matter
spend time
dont use
ask india
askindia need
1
 'Business/Finance':
Most correlated unigrams: 
week
2016
ending
business
finance
Most correlated bigrams: 
indian startup
indian economy
week ending
economy week
week indian
2
 'CAA-NRC-NPR':
Most correlated unigrams: 
caanprnrc
protest
caa
nrc
npr
Most correlated bigrams: 
join protest
npr nrc
caa nrc
caa npr
nrc npr
3
 'Coronavirus':
Most correlated unigrams: 
case
lockdown
covid19
testing
coronavirus
Most correlated bigrams: 
coronavirus nan
coronavirus pandemic
india coronavirus
coronavirus lockdown
coronavirus case
4
 'Food':
Most correlated unigrams: 
migrant
eat
lockdown
delivery
food
Most correlated bigrams: 
food supply
provide food
free food
indian food
food delivery
5
 'Non-Political':
Most correlated unigrams: 
rindia
celebrating
political
non
nonpolitical
Most correlated bigrams: 
right thing
year old
im bot
nonp

In [10]:
# Final random forest configuration for the deployed model.
# `min_impurity_split` is intentionally not passed: it was deprecated in
# scikit-learn 0.19 and removed in 1.0, where supplying it raises TypeError.
# Its value here was None, which was already the default on releases that
# still accepted it, so omitting it does not change the fitted model.
estimator = RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=70, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0,
                       min_samples_leaf=3, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=1100,
                       n_jobs=None, oob_score=False, random_state=8, verbose=0,
                       warm_start=False)

In [11]:
# Fit the forest on the full TF-IDF matrix and the encoded flair labels.
estimator.fit(X=features_train, y=labels_train)

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=70, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=1100,
                       n_jobs=None, oob_score=False, random_state=8, verbose=0,
                       warm_start=False)

In [15]:
# Accuracy is measured on the same rows the forest was fitted on, so it is a
# training-set figure, not an estimate of performance on unseen posts.
train_predictions = estimator.predict(features_train)
print("The training accuracy is: ")
print(accuracy_score(y_true=labels_train, y_pred=train_predictions))

The training accuracy is: 
0.912672045562411


In [16]:
# Serialise the fitted random forest to disk.
with open('pickles_deploy/rfc_estimator.pkl', 'wb') as f:
    pickle.dump(obj=estimator, file=f)

In [18]:
# Serialise the fitted TF-IDF vectorizer to disk; new text must be
# transformed with this same object to match the columns the forest was
# trained on.
with open('pickles_deploy/tfidf.pkl', 'wb') as f:
    pickle.dump(obj=tfidf, file=f)