#### Explore all the classifiers we have learned so far using `cross_val_score` with their default parameters, then select the top 3 to tune with grid search

In [1]:
import pandas as pd
import nltk
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier,\
VotingClassifier, RandomForestClassifier,ExtraTreesClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC,SVC

In [2]:
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run as-is on the original author's machine. Consider a path relative to
# the project root.
DATA_PATH='/Users/lettywu/dsi/Projects/project_3/project_3_letty/data/nutrition_vs_dietetics.csv'
nut_die=pd.read_csv(DATA_PATH)

In [3]:
# Peek at the first rows to check that the columns loaded as expected.
nut_die.head()

Unnamed: 0,nd_subreddit,nd_title_text,nd_length,nd_word_count,nd_sentiment
0,1,Please help me figure out what’s wrong with my...,84,12,0.2942
1,1,What Tests For Iron Deficiency? Looking at get...,335,54,0.6544
2,1,How to get protein as a picky eater? [removed],46,9,0.0
3,1,Suffering with canker sores [removed],37,5,-0.4767
4,1,Good diets/work outs? I am trying to lose weig...,204,39,0.296


### Baseline

In [4]:
# Baseline: share of each class. The two classes are almost evenly split, so
# always predicting the majority class would score about 0.50 accuracy.
nut_die['nd_subreddit'].value_counts(normalize=True)

0    0.502927
1    0.497073
Name: nd_subreddit, dtype: float64

### Set X and y variables

In [5]:
# Features are the raw text column; the target is the 0/1 subreddit label.
X, y = nut_die['nd_title_text'], nut_die['nd_subreddit']

## All the classification models I could use
- Logistic Regression
- Knn
- Multinomial Naive Bayes
- Bagging
- DecisionTree,RandomForest,ExtraTree
- Boosting (AdaBoost, GradientBoost, XGBoost)
- VotingClassifier
- SVMs(LinearSVC,SVC)

In [6]:
# Stop-word list = NLTK's English list plus custom additions (filler words,
# placeholders such as 'removed'/'deleted', and contraction fragments).

newstopwords=[
    'get','im','removed','ive','dont','rd','would','nutrition','deleted',
    "'d", "'ll", "'re", "'s", "'ve",
    'could', 'might', 'must', "n't", 'need', 'sha', 'wo',
]
stopwords = nltk.corpus.stopwords.words('english') + newstopwords

In [7]:
# Create a function that prints the mean cross_val_score of each classifier

def classifier(estimator):
    """Print the mean cross-validated score of ``estimator`` on two text encodings.

    The estimator is placed behind a CountVectorizer and then behind a
    TfidfVectorizer (both using the notebook-level ``stopwords`` list), and each
    pipeline is scored with ``cross_val_score`` on the notebook-level ``X`` and
    ``y``. Nothing is returned; one summary is printed with both means rounded
    to 4 decimals.

    Parameters
    ----------
    estimator : object
        The final pipeline step (a classifier, or a Pipeline ending in one).
    """
    mean_scores = {}
    for step_name, vectorizer_cls in (('cvec', CountVectorizer),
                                      ('tvec', TfidfVectorizer)):
        pipeline = Pipeline([(step_name, vectorizer_cls(stop_words=stopwords)),
                             ('estimator', estimator)])
        mean_scores[step_name] = cross_val_score(pipeline, X, y).mean()

    cvec_rounded = round(mean_scores['cvec'], 4)
    tvec_rounded = round(mean_scores['tvec'], 4)
    print(f'{estimator}:score_cvec {cvec_rounded}\n score_tvec {tvec_rounded}')

**I tried using StandardScaler with all of my classifiers; the scores came out worse than without StandardScaler.**

### Score for each estimators

In [8]:
# Scratch check: round(value, 4) keeps four decimals, as used when printing scores.
print (round(0.8333333,4)) # try out round()

0.8333


In [9]:
# Logistic regression with max_iter raised to 10000 (presumably so the solver
# has room to converge on the text features; confirm).
classifier(LogisticRegression(max_iter=10000))

LogisticRegression(max_iter=10000):score_cvec 0.8816
 score_tvec 0.8921


In [10]:
# KNN is wrapped in its own Pipeline so the vectorized features are scaled
# before the neighbor search. with_mean=False scales without centering
# (presumably because the vectorizer output is sparse; confirm).
classifier(Pipeline([('ss', StandardScaler(with_mean=False)),
                   ('knn',KNeighborsClassifier())]))

Pipeline(steps=[('ss', StandardScaler(with_mean=False)),
                ('knn', KNeighborsClassifier())]):score_cvec 0.7012
 score_tvec 0.6299


In [11]:
# Multinomial Naive Bayes with default hyperparameters.
classifier(MultinomialNB())

MultinomialNB():score_cvec 0.8936
 score_tvec 0.8908


In [12]:
# Bagging with default hyperparameters. No random_state is set, so the printed
# scores may not be exactly reproducible between runs.
classifier(BaggingClassifier())

BaggingClassifier():score_cvec 0.8195
 score_tvec 0.8537


In [13]:
# Single decision tree with default hyperparameters (no random_state set).
classifier(DecisionTreeClassifier())

DecisionTreeClassifier():score_cvec 0.7964
 score_tvec 0.818


In [14]:
# Random forest with default hyperparameters. No random_state is set: the
# comparison table later in the notebook reports slightly different scores for
# this same default model.
classifier(RandomForestClassifier())

RandomForestClassifier():score_cvec 0.871
 score_tvec 0.8766


In [15]:
# Extra-trees with default hyperparameters. No random_state is set: the
# comparison table later in the notebook reports slightly different scores for
# this same default model.
classifier(ExtraTreesClassifier())

ExtraTreesClassifier():score_cvec 0.8676
 score_tvec 0.89


In [16]:
# AdaBoost with default hyperparameters.
classifier(AdaBoostClassifier())

AdaBoostClassifier():score_cvec 0.8478
 score_tvec 0.8397


In [17]:
# Gradient boosting with default hyperparameters. No random_state is set: the
# comparison table later in the notebook reports slightly different scores for
# this same default model.
classifier(GradientBoostingClassifier())

GradientBoostingClassifier():score_cvec 0.8501
 score_tvec 0.8468


In [18]:
# XGBoost with default hyperparameters. use_label_encoder=False presumably
# silences xgboost's label-encoder deprecation warning; confirm against the
# installed xgboost version.
classifier(XGBClassifier(use_label_encoder=False))

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              use_label_encoder=False, validate_parameters=None,
              verbosity=None):score_cvec 0.8753
 score_tvec 0.872


In [19]:
# Ensemble of every model family tried in this notebook, combined with
# VotingClassifier's default voting mode. KNN keeps its own scaling step.
knn_pipe=Pipeline(steps=[('ss', StandardScaler(with_mean=False)),
                         ('knn', KNeighborsClassifier())])

voting_members=[
    ('LSVC', LinearSVC(max_iter=10000)),
    ('SVC', SVC()),
    ('Ada', AdaBoostClassifier()),
    ('Gra', GradientBoostingClassifier()),
    ('DecT', DecisionTreeClassifier()),
    ('knn_pipe', knn_pipe),
    ('RaF', RandomForestClassifier()),
    ('XGB', XGBClassifier(use_label_encoder=False)),
    ('ExT', ExtraTreesClassifier()),
    ('Bag', BaggingClassifier()),
    ('Log', LogisticRegression()),
    ('MNB', MultinomialNB()),
]
vote=VotingClassifier(estimators=voting_members)
classifier(vote)

VotingClassifier(estimators=[('LSVC', LinearSVC(max_iter=10000)),
                             ('SVC', SVC()), ('Ada', AdaBoostClassifier()),
                             ('Gra', GradientBoostingClassifier()),
                             ('DecT', DecisionTreeClassifier()),
                             ('knn_pipe',
                              Pipeline(steps=[('ss',
                                               StandardScaler(with_mean=False)),
                                              ('knn',
                                               KNeighborsClassifier())])),
                             ('RaF', RandomForestClassifier()),
                             ('XGB',
                              XGBClassifier(base_score=None, b...
                                            monotone_constraints=None,
                                            n_estimators=100, n_jobs=None,
                                            num_parallel_tree=None,
                                       

In [20]:
# Linear SVM; max_iter is raised to 10000 (presumably to avoid convergence
# warnings; confirm).
classifier(LinearSVC(max_iter=10000))

LinearSVC(max_iter=10000):score_cvec 0.8598
 score_tvec 0.8888


In [21]:
# Support vector classifier with default hyperparameters.
classifier(SVC())

SVC():score_cvec 0.8509
 score_tvec 0.8964


---

## Create a dataframe to compare all the scores

In [22]:
# Every model to include in the comparison table, one per line. The row order
# of the results table follows the order of this list.
classifier_list=[
    LinearSVC(max_iter=10000),
    SVC(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    DecisionTreeClassifier(),
    Pipeline([('ss', StandardScaler(with_mean=False)),
              ('knn', KNeighborsClassifier())]),
    RandomForestClassifier(),
    XGBClassifier(use_label_encoder=False),
    ExtraTreesClassifier(),
    BaggingClassifier(),
    # NOTE(review): the standalone logistic-regression run earlier in the
    # notebook used max_iter=10000; this entry uses the default.
    LogisticRegression(),
    MultinomialNB(),
    vote,
]

In [23]:
# Spot-check the first entry of the list.
classifier_list[0]

LinearSVC(max_iter=10000)

In [24]:
# Score every entry of classifier_list behind both vectorizers and keep the
# rounded mean CV scores in lists, so they can be turned into a DataFrame.
estimator_list=list(classifier_list)
score_cvec_list=[]
score_tvec_list=[]

for clf in classifier_list:
    # Same two pipelines per model: raw counts first, then TF-IDF weights.
    for step_name, vectorizer_cls, score_bucket in (
            ('cvec', CountVectorizer, score_cvec_list),
            ('tvec', TfidfVectorizer, score_tvec_list)):
        pipe=Pipeline([(step_name, vectorizer_cls(stop_words=stopwords)),
                       ('estimator', clf)])
        mean_score=cross_val_score(pipe, X, y).mean()
        score_bucket.append(round(mean_score, 4))



In [25]:
# Build the comparison table: one row per estimator, with its rounded mean CV
# score for CountVectorizer and for TfidfVectorizer.

score_columns={
    'estimator':estimator_list,
    'score_cvec':score_cvec_list,
    'score_tvec':score_tvec_list,
}
scores_df=pd.DataFrame(score_columns)

In [26]:
# Add a column with the average of the CountVectorizer and TfidfVectorizer scores.
# The two score columns are selected explicitly: 'estimator' holds non-numeric
# objects, which DataFrame.mean(axis=1) rejects on recent pandas versions, and
# the explicit selection keeps this cell idempotent (re-running it does not
# fold an existing 'score_avg' column into the mean).

scores_df['score_avg']=scores_df[['score_cvec','score_tvec']].mean(axis=1).round(4)

In [39]:
# Short, readable labels for the results table. They are derived from
# classifier_list itself, so the labels cannot drift out of order or length
# when the list of models changes. A Pipeline is labelled by its final step,
# so the scaled KNN pipeline becomes 'KNeighborsClassifier'.

classifier_list_sim=[
    type(est.steps[-1][1] if isinstance(est, Pipeline) else est).__name__
    for est in classifier_list
]

In [40]:
# Replace the estimator objects with their short names so the table is easier
# to read. Relies on classifier_list_sim being in the same order as the rows.
scores_df['estimator']=classifier_list_sim

In [42]:
# Rank the models by their average score, best first.
scores_df.sort_values(by=['score_avg'],ascending=False)

Unnamed: 0,estimator,score_cvec,score_tvec,score_avg
12,VotingClassifier,0.8903,0.8959,0.8931
11,MultinomialNB,0.8936,0.8908,0.8922
10,LogisticRegression,0.8816,0.8921,0.8868
8,ExtraTreesClassifier,0.8715,0.885,0.8782
6,RandomForestClassifier,0.8745,0.8814,0.878
0,LinearSVC,0.8598,0.8888,0.8743
1,SVC,0.8509,0.8964,0.8736
7,XGBClassifier,0.8753,0.872,0.8736
3,GradientBoostingClassifier,0.8493,0.8486,0.849
2,AdaBoostClassifier,0.8478,0.8397,0.8438


`It looks like the VotingClassifier and the Multinomial Naive Bayes classifier perform really well with just the default parameters.`

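`I am using the default metric for classification, which is accuracy. Accuracy is appropriate when the true positives and true negatives are most important, while the F1-score is preferred when the false negatives and false positives are crucial. The two classes are also almost evenly balanced (see the baseline above), so accuracy is not misleading here.`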

`Also, I want to maximize TP and TN for my problem statement: I want to predict posts from the Nutrition subreddit as Nutrition, and posts from the Dietetics subreddit as Dietetics.`

https://medium.com/analytics-vidhya/accuracy-vs-f1-score-6258237beca2