# Modeling
These are the three following models used to classify subreddits:
- Logistic Regression 
- Naive Bayes Classification(2)
- Random Forest 
- GradientBooster


## Starting with Logistic Regression 

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
train = pd.read_csv('./datasets/train_snow.csv')
train.head()

Unnamed: 0.1,Unnamed: 0,00,000,000000001,00am,00pm,03,05,06,07,...,zero,zinn,zoloft,zombi,zone,zx,neg,neu,pos.1,compound.1
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.182,0.659,0.159,-0.5346
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.172,0.699,0.129,-0.8987
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.074,0.895,0.031,-0.7472
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.144,0.727,0.129,-0.3008
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.09,0.749,0.161,0.9925


In [3]:
test = pd.read_csv('./datasets/test_snow.csv')
test.head()

Unnamed: 0.1,Unnamed: 0,00,000,000000001,00am,00pm,03,05,06,07,...,zero,zinn,zoloft,zombi,zone,zx,neg,neu,pos.1,compound.1
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.133,0.718,0.149,0.3031
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.163,0.748,0.089,-0.9936
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.133,0.681,0.186,0.7812
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.179585,0.0,0.0,0.0,0.169,0.801,0.03,-0.9182
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.828,0.032,-0.8176


In [4]:
train.shape

(1418, 6757)

In [5]:
test.shape

(473, 6757)

In [6]:
#dropping unnamed columns
train.drop('Unnamed: 0', axis = 1, inplace= True)
test.drop('Unnamed: 0', axis = 1, inplace= True)

In [7]:
train.head()

Unnamed: 0,00,000,000000001,00am,00pm,03,05,06,07,10,...,zero,zinn,zoloft,zombi,zone,zx,neg,neu,pos.1,compound.1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.182,0.659,0.159,-0.5346
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.172,0.699,0.129,-0.8987
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.074,0.895,0.031,-0.7472
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.144,0.727,0.129,-0.3008
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.09,0.749,0.161,0.9925


In [8]:
test.head()

Unnamed: 0,00,000,000000001,00am,00pm,03,05,06,07,10,...,zero,zinn,zoloft,zombi,zone,zx,neg,neu,pos.1,compound.1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.133,0.718,0.149,0.3031
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.163,0.748,0.089,-0.9936
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.133,0.681,0.186,0.7812
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.179585,0.0,0.0,0.0,0.169,0.801,0.03,-0.9182
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.828,0.032,-0.8176


## Model Prep

In [9]:
#features are everything except for subreddit
features = [col for col in train.columns if col!= 'subreddit']
X_train = train[features]

In [10]:
#features are everything except for subreddit
features = [col for col in test.columns if col!= 'subreddit']
X_test = test[features]

In [11]:
#y variable is subreddit
y_train = train['subreddit']

In [12]:
#y variable is subreddit
y_test = test['subreddit']

## Instantiate Model

In [13]:
logreg = LogisticRegression()

In [14]:
logreg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [15]:
logreg.score(X_train, y_train)

0.9414668547249647

In [16]:
logreg.score(X_test, y_test)

0.8837209302325582

#### Confusion Matrix for Logistic Regression Model

In [17]:
pred = logreg.predict(X_test)
cm = confusion_matrix(y_test, pred)

#### Confusion Dataframe

In [18]:
cm_df = pd.DataFrame(cm, 
             columns = ['predicted depression', 'predicted anxiety'], 
             index = ['actual depression', 'actual anxiety'])

cm_df

Unnamed: 0,predicted depression,predicted anxiety
actual depression,218,24
actual anxiety,31,200


#### F1 Score 

In [19]:
f1_score(y_test, pred)

0.8791208791208792

## Naive Bayes Time
- Under Naive Bayes, we will use 2 models: Gaussian and Bernoulli

In [20]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB

### Gaussian Naive Bayes & Bernoulli

In [21]:
g = GaussianNB()
b = BernoulliNB()

In [22]:
cross_val_score(g, X_train, y_train, cv = 5).mean()

0.6312351687460328

In [23]:
cross_val_score(b, X_train, y_train, cv = 5).mean()

0.75676700710119

## Gaussian

In [24]:
g.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [25]:
g.score(X_train, y_train)

0.9464033850493653

In [26]:
g.score(X_test, y_test)

0.6088794926004228

#### Confusion Matrix for Gaussian Model

In [27]:
pred = g.predict(X_test)
cm = confusion_matrix(y_test, pred)

#### Confusion Dataframe

In [28]:
cm_df = pd.DataFrame(cm, 
             columns = ['predicted depression', 'predicted anxiety'], 
             index = ['actual depression', 'actual anxiety'])

cm_df

Unnamed: 0,predicted depression,predicted anxiety
actual depression,137,105
actual anxiety,80,151


#### F1 Score

In [29]:
f1_score(y_test, pred)

0.6201232032854209

## Bernoulli

In [30]:
b.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [31]:
b.score(X_train, y_train)

0.88787023977433

In [32]:
b.score(X_test, y_test)

0.773784355179704

#### Confusion Matrix for Bernoulli Model

In [33]:
pred = b.predict(X_test)
cm = confusion_matrix(y_test, pred)

#### Confusion Dataframe

In [34]:
cm_df = pd.DataFrame(cm, 
             columns = ['predicted depression', 'predicted anxiety'], 
             index = ['actual depression', 'actual anxiety'])

cm_df

Unnamed: 0,predicted depression,predicted anxiety
actual depression,166,76
actual anxiety,31,200


#### F1 Score

In [35]:
f1_score(y_test, pred)

0.7889546351084812

## Random Forest 

In [36]:
from sklearn.ensemble import RandomForestClassifier

In [37]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [38]:
rf.score(X_train, y_train)

0.9964739069111425

In [39]:
rf.score(X_test, y_test)

0.7167019027484144

#### Confusion Matrix for Random Forest 

In [40]:
pred = rf.predict(X_test)
cm = confusion_matrix(y_test, pred)

#### Confusion Dataframe

In [41]:
cm_df = pd.DataFrame(cm, 
             columns = ['predicted depression', 'predicted anxiety'], 
             index = ['actual depression', 'actual anxiety'])

cm_df

Unnamed: 0,predicted depression,predicted anxiety
actual depression,201,41
actual anxiety,93,138


#### F1 Score

In [42]:
f1_score(y_test, pred)

0.6731707317073171

## Gradient Booster

In [43]:
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [44]:
gb.score(X_train, y_train)

0.96262341325811

In [45]:
gb.score(X_test, y_test)

0.8498942917547568

#### Confusion Matrix for Gradient Booster 

In [46]:
pred = gb.predict(X_test)
cm = confusion_matrix(y_test, pred)

#### Confusion Dataframe

In [47]:
cm_df = pd.DataFrame(cm, 
             columns = ['predicted depression', 'predicted anxiety'], 
             index = ['actual depression', 'actual anxiety'])

cm_df

Unnamed: 0,predicted depression,predicted anxiety
actual depression,192,50
actual anxiety,21,210


#### F1 Score

In [48]:
f1_score(y_test, pred)

0.8553971486761711