# Modeling by GridSearching
These are the three following models used to classify subreddits:
- Logistic Regression 
- Naive Bayes Classification(2)
- Random Forest 
- GradientBooster

# Objective
- Gridsearch to find optimal parameters to ultimately execute VoteClassifier using the top 3 models

## Starting with Logistic Regression 

In [1]:
import nltk

In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [3]:
df = pd.read_csv('./datasets/df_vader.csv')
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,subreddit,stringOG,vader,neg,neu,pos,compound
0,0,0,1,we know that septemb 10 was world suicid preve...,"{'neg': 0.054, 'neu': 0.75, 'pos': 0.196, 'com...",0.054,0.75,0.196,0.7506
1,1,1,1,welcom to r depress check in post a place to t...,"{'neg': 0.062, 'neu': 0.779, 'pos': 0.159, 'co...",0.062,0.779,0.159,0.9965
2,2,2,1,i m go to the movi i m so nervous i m will lea...,"{'neg': 0.146, 'neu': 0.854, 'pos': 0.0, 'comp...",0.146,0.854,0.0,-0.4101
3,3,3,1,now i can save so i can get myself out of this...,"{'neg': 0.0, 'neu': 0.669, 'pos': 0.331, 'comp...",0.0,0.669,0.331,0.836
4,4,4,1,i alway do this i ll stay up until the wee hou...,"{'neg': 0.103, 'neu': 0.809, 'pos': 0.088, 'co...",0.103,0.809,0.088,-0.0258


In [4]:
df.shape

(1891, 9)

In [5]:
df.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis = 1, inplace= True)

In [6]:
df.head()

Unnamed: 0,subreddit,stringOG,vader,neg,neu,pos,compound
0,1,we know that septemb 10 was world suicid preve...,"{'neg': 0.054, 'neu': 0.75, 'pos': 0.196, 'com...",0.054,0.75,0.196,0.7506
1,1,welcom to r depress check in post a place to t...,"{'neg': 0.062, 'neu': 0.779, 'pos': 0.159, 'co...",0.062,0.779,0.159,0.9965
2,1,i m go to the movi i m so nervous i m will lea...,"{'neg': 0.146, 'neu': 0.854, 'pos': 0.0, 'comp...",0.146,0.854,0.0,-0.4101
3,1,now i can save so i can get myself out of this...,"{'neg': 0.0, 'neu': 0.669, 'pos': 0.331, 'comp...",0.0,0.669,0.331,0.836
4,1,i alway do this i ll stay up until the wee hou...,"{'neg': 0.103, 'neu': 0.809, 'pos': 0.088, 'co...",0.103,0.809,0.088,-0.0258


## Train Test Split

In [7]:
features = ['stringOG', 'vader']
X = df[features]
y = df['subreddit']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify =y)

In [8]:
X_train_p1 = X_train['stringOG']
X_test_p1 = X_test['stringOG']

## Creating Class to append vader columns before modeling after tfidf

In [9]:
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion

class Appender(TransformerMixin):
    def __init__(self, *_):
        pass
   
        
    def transform(self, X, *_):
        X = pd.DataFrame(X.toarray())
        #if score is train:
        X['neg'] = X_train.reset_index()['vader'].apply(lambda x: eval(x).get('neg'))  #eval takes string and evaluates as python code
        X['neu'] = X_train.reset_index()['vader'].apply(lambda x: eval(x).get('neu'))
        X['pos'] = X_train.reset_index()['vader'].apply(lambda x: eval(x).get('pos'))
        X['compound'] = X_train.reset_index()['vader'].apply(lambda x: eval(x).get('compound'))
        #else: use X_test vader 
        return X
    
    def fit(self, *_):
        return self
    

## Let's Try to Pipeline our GridSearch to Get a Better Score

## Starting with Logistic

In [10]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('append', Appender()),
    ('lr', LogisticRegression())
])

In [13]:


pipe_params = {
    'lr__penalty': ['l1', 'l2'],
    'lr__C': [0.2, 0.6, 1.0]
    #can add C parameter for logisitic regression
}


gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)


gs.fit(X_train_p1, y_train)


print(gs.best_score_)
gs.best_params_



0.8522906793048973


{'lr__C': 1.0, 'lr__penalty': 'l2'}

In [14]:
gs.score(X_train_p1, y_train)

0.943127962085308

In [15]:
gs.score(X_test_p1, y_test)

#how to compensate for test vader...isnt it using the vader score for train? 
#what happens to test vader?

0.848

In [17]:
tfidf = TfidfVectorizer()

tfidf.fit(X_train_p1, y_train)

train = tfidf.transform(X_train_p1)
test = tfidf.transform(X_test_p1)

train_df = pd.DataFrame(train.toarray(), columns= tfidf.get_feature_names())
test_df = pd.DataFrame(test.toarray(), columns= tfidf.get_feature_names())

In [18]:
X_train['neg'] =  X_train['vader'].apply(lambda x: eval(x).get('neg'))
X_train['neu'] =  X_train['vader'].apply(lambda x: eval(x).get('neu'))
X_train['pos'] =  X_train['vader'].apply(lambda x: eval(x).get('pos'))
X_train['compound'] =  X_train['vader'].apply(lambda x: eval(x).get('compound'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [19]:
features = ['neg', 'neu', 'pos', 'compound']
X_train_vader = X_train[features]

In [20]:
X_test['neg'] =  X_test['vader'].apply(lambda x: eval(x).get('neg'))
X_test['neu'] =  X_test['vader'].apply(lambda x: eval(x).get('neu'))
X_test['pos'] =  X_test['vader'].apply(lambda x: eval(x).get('pos'))
X_test['compound'] =  X_test['vader'].apply(lambda x: eval(x).get('compound'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [21]:
features = ['neg', 'neu', 'pos', 'compound']
X_test_vader = X_test[features]

In [22]:
X_train_df = pd.concat(objs=[train_df, X_train_vader.reset_index()], axis = 1).drop('index', axis = 1)

In [23]:
X_train_df.head()

Unnamed: 0,00,000,000000001,00am,00pm,03,05,06,07,10,...,zero,zinn,zoloft,zombi,zone,zx,neg,neu,pos,compound
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.281,0.634,0.086,-0.8873
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.137,0.827,0.037,-0.8224
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.053,0.838,0.109,0.7482
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.148,0.815,0.037,-0.8271
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.085,0.84,0.076,-0.8956


In [24]:
X_test_df = pd.concat(objs=[test_df, X_test_vader.reset_index()], axis = 1).drop('index', axis = 1)

#### Logistic Regression 

In [27]:
logreg = LogisticRegression(C = 1.0, penalty = 'l2')
logreg.fit(X_train_df, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [28]:
logreg.score(X_train_df, y_train)

0.943127962085308

In [29]:
logreg.score(X_test_df, y_test)

0.8528

#### Confusion Matrix for Logistic Regression Model

In [30]:
pred = logreg.predict(X_test_df)
cm = confusion_matrix(y_test, pred)

#### Confusion Dataframe

In [31]:
cm_df = pd.DataFrame(cm, 
             columns = ['predicted depression', 'predicted anxiety'], 
             index = ['actual depression', 'actual anxiety'])

cm_df

Unnamed: 0,predicted depression,predicted anxiety
actual depression,273,47
actual anxiety,45,260


#### F1 Score 

In [32]:
f1_score(y_test, pred)

0.8496732026143791

## Random Forest 

In [35]:
from sklearn.ensemble import RandomForestClassifier

In [39]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('append', Appender()),
    ('rf', RandomForestClassifier())
])

pipe_params = {
    'rf__max_features': [None, 'log2', 'auto'],
    'rf__max_depth': [3, 4, 5],
    'rf__n_estimators': [100, 200, 300]
    
}


gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)


gs.fit(X_train_p1, y_train)


print(gs.best_score_)
gs.best_params_

0.8333333333333334


{'rf__max_depth': 5, 'rf__max_features': None, 'rf__n_estimators': 100}

In [40]:
gs.score(X_train_p1, y_train)

0.8767772511848341

In [41]:
gs.score(X_test_p1, y_test)

0.8432

In [42]:
tfidf = TfidfVectorizer()

tfidf.fit(X_train_p1, y_train)

train = tfidf.transform(X_train_p1)
test = tfidf.transform(X_test_p1)

train_df = pd.DataFrame(train.toarray(), columns= tfidf.get_feature_names())
test_df = pd.DataFrame(test.toarray(), columns= tfidf.get_feature_names())

In [43]:
X_train_df = pd.concat(objs=[train_df, X_train_vader.reset_index()], axis = 1).drop('index', axis = 1)

In [44]:
X_test_df = pd.concat(objs=[test_df, X_test_vader.reset_index()], axis = 1).drop('index', axis = 1)

In [45]:
rf = RandomForestClassifier(max_depth= 5, max_features= None, n_estimators= 100)
rf.fit(X_train_df, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [46]:
rf.score(X_train_df, y_train)

0.8751974723538705

In [47]:
rf.score(X_test_df, y_test)

0.8416

#### Confusion Matrix for Random Forest 

In [48]:
pred = rf.predict(X_test_df)
cm = confusion_matrix(y_test, pred)

#### Confusion Dataframe

In [49]:
cm_df = pd.DataFrame(cm, 
             columns = ['predicted depression', 'predicted anxiety'], 
             index = ['actual depression', 'actual anxiety'])

cm_df

Unnamed: 0,predicted depression,predicted anxiety
actual depression,242,78
actual anxiety,21,284


#### F1 Score

In [50]:
f1_score(y_test, pred)

0.8515742128935532

## Gradient Booster

In [51]:
from sklearn.ensemble import GradientBoostingClassifier

In [53]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('append', Appender()),
    ('gb', GradientBoostingClassifier())
])

pipe_params = {
    'gb__learning_rate': [0.05, 0.1, 0.15],
    'gb__max_depth': [3, 4, 5],
    'gb__n_estimators': [100, 200, 300]
    
}


gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)


gs.fit(X_train_p1, y_train)


print(gs.best_score_)
gs.best_params_

0.8522906793048973


{'gb__learning_rate': 0.1, 'gb__max_depth': 3, 'gb__n_estimators': 300}

In [54]:
gs.score(X_train_p1, y_train)

1.0

In [55]:
gs.score(X_test_p1, y_test)

0.8528

In [56]:
tfidf = TfidfVectorizer()

tfidf.fit(X_train_p1, y_train)

train = tfidf.transform(X_train_p1)
test = tfidf.transform(X_test_p1)

train_df = pd.DataFrame(train.toarray(), columns= tfidf.get_feature_names())
test_df = pd.DataFrame(test.toarray(), columns= tfidf.get_feature_names())

In [57]:
X_train_df = pd.concat(objs=[train_df, X_train_vader.reset_index()], axis = 1).drop('index', axis = 1)

In [58]:
X_test_df = pd.concat(objs=[test_df, X_test_vader.reset_index()], axis = 1).drop('index', axis = 1)

In [60]:
gb = GradientBoostingClassifier(learning_rate= 0.1, max_depth= 3, n_estimators= 300)
gb.fit(X_train_df, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=300,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [61]:
gb.score(X_train_df, y_train)

1.0

In [62]:
gb.score(X_test_df, y_test)

0.8512

#### Confusion Matrix for Gradient Booster 

In [63]:
pred = gb.predict(X_test_df)
cm = confusion_matrix(y_test, pred)

#### Confusion Dataframe

In [64]:
cm_df = pd.DataFrame(cm, 
             columns = ['predicted depression', 'predicted anxiety'], 
             index = ['actual depression', 'actual anxiety'])

cm_df

Unnamed: 0,predicted depression,predicted anxiety
actual depression,262,58
actual anxiety,35,270


#### F1 Score

In [65]:
f1_score(y_test, pred)

0.8530805687203791