### Import libraries

In [1]:
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.preprocessing import FunctionTransformer

### Load data

In [5]:
df = pd.read_csv('../data/cleaned_train_data.csv')

In [8]:
X = df['text']
y = df['label']

### Split data

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42, test_size=0.3)

## Modeling with CountVectorizer

### Logistic Regression

In [117]:
pipe_cvec_lr = Pipeline([("cvec", CountVectorizer()),
                          ('ss', StandardScaler(with_mean=False)),
                         ("lr", LogisticRegression(max_iter = 2000))])
pipe_cvec_lr_params = {'cvec__min_df': [1,2,3],
                       "cvec__ngram_range": [(1,1),(1,2)]
                       }
gs_cvec_lr = GridSearchCV(pipe_cvec_lr,
                          param_grid = pipe_cvec_lr_params,
                          cv = 5)
gs_cvec_lr.fit(X_train, y_train)
gs_cvec_lr_model = gs_cvec_lr.best_estimator_
gs_cvec_lr.best_params_

{'cvec__min_df': 1, 'cvec__ngram_range': (1, 2)}

In [118]:
gs_cvec_lr.best_score_

0.9089795604226453

In [119]:
gs_cvec_lr_model.score(X_train, y_train)

0.9995309568480301

In [120]:
gs_cvec_lr_model.score(X_test, y_test)

0.9311475409836065

In [121]:
pipe_tvec_lr = Pipeline([("tvec",  TfidfVectorizer()),
                          ('ss', StandardScaler(with_mean=False)),
                         ("lr", LogisticRegression(max_iter = 2000))])
pipe_tvec_lr_params = {'tvec__min_df': [1,2,3],
                       "tvec__ngram_range": [(1,1),(1,2)]
                       }
gs_tvec_lr = GridSearchCV(pipe_tvec_lr,
                          param_grid = pipe_tvec_lr_params,
                          cv = 5)
gs_tvec_lr.fit(X_train, y_train)
gs_tvec_lr_model = gs_tvec_lr.best_estimator_
gs_tvec_lr.best_params_

{'tvec__min_df': 1, 'tvec__ngram_range': (1, 2)}

In [122]:
gs_tvec_lr.best_score_

0.8944508581543907

In [123]:
gs_tvec_lr_model.score(X_train, y_train)

0.9995309568480301

In [124]:
gs_tvec_lr_model.score(X_test, y_test)

0.912568306010929

### Naive Bayes - Multinomial 

In [125]:
pipe_cvec_mnb = Pipeline([("cvec", CountVectorizer()),
                          ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                          ("mnb", MultinomialNB())])
pipe_cvec_mnb_params = {'cvec__min_df': [1,2,3],
                        "cvec__ngram_range": [(1,1),(1,2)],
                        }
gs_cvec_mnb = GridSearchCV(pipe_cvec_mnb,
                           param_grid = pipe_cvec_mnb_params,
                           cv = 5)
gs_cvec_mnb.fit(X_train, y_train)
gs_cvec_mnb_model = gs_cvec_mnb.best_estimator_
gs_cvec_mnb.best_params_

{'cvec__min_df': 1, 'cvec__ngram_range': (1, 2)}

In [126]:
gs_cvec_mnb.best_score_

0.8935140900045079

In [127]:
gs_cvec_mnb_model.score(X_train,y_train)

0.9770168855534709

In [128]:
gs_cvec_mnb_model.score(X_test,y_test)

0.9114754098360656

### Naive Bayes - Gaussian 

In [129]:
pipe_cvec_gnb = Pipeline([("cvec", CountVectorizer()),
                          ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                          ("gnb", GaussianNB())])
pipe_cvec_gnb_params = {'cvec__min_df': [1,2,3],
                        "cvec__ngram_range": [(1,1),(1,2)],
                        }
gs_cvec_gnb = GridSearchCV(pipe_cvec_gnb,
                           param_grid = pipe_cvec_gnb_params,
                           cv = 5)
gs_cvec_gnb.fit(X_train, y_train)
gs_cvec_gnb_model = gs_cvec_gnb.best_estimator_
gs_cvec_gnb.best_params_

{'cvec__min_df': 2, 'cvec__ngram_range': (1, 2)}

In [130]:
gs_cvec_gnb.best_score_

0.8362953678354279

In [131]:
gs_cvec_gnb_model.score(X_train,y_train)

0.9446529080675422

In [132]:
gs_cvec_gnb_model.score(X_test,y_test)

0.8568306010928962

### Naive Bayes - Bernoulli 

In [133]:
pipe_cvec_bnb = Pipeline([("cvec", CountVectorizer()),
                          ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                          ("bnb", BernoulliNB())])
pipe_cvec_bnb_params = {'cvec__min_df': [1,2,3],
                        "cvec__ngram_range": [(1,1),(1,2)],
                        }
gs_cvec_bnb = GridSearchCV(pipe_cvec_bnb,
                           param_grid = pipe_cvec_bnb_params,
                           cv = 5)
gs_cvec_bnb.fit(X_train, y_train)
gs_cvec_bnb_model = gs_cvec_bnb.best_estimator_
gs_cvec_bnb.best_params_

{'cvec__min_df': 2, 'cvec__ngram_range': (1, 1)}

In [134]:
gs_cvec_bnb.best_score_

0.9169596815867885

In [135]:
gs_cvec_bnb_model.score(X_train,y_train)

0.9460600375234521

In [136]:
gs_cvec_bnb_model.score(X_test,y_test)

0.9245901639344263

### Decision Tree Classifier

In [137]:
pipe_cvec_dt_bag = Pipeline([("cvec", CountVectorizer()),
                             ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                             ("bag", BaggingClassifier(base_estimator = DecisionTreeClassifier()))])
pipe_cvec_dt_bag_params = {"bag__base_estimator__max_depth": [40],
                           "bag__base_estimator__min_samples_leaf": [5],
                           "bag__base_estimator__min_samples_split": [40],
                           'cvec__min_df': [1,2,3],
                           'cvec__ngram_range': [(1, 1),(1,2)],
                           "bag__n_estimators": [200],
                           "bag__max_samples": [.5],
                           "bag__max_features": [.4]}
gs_cvec_dt_bag = GridSearchCV(pipe_cvec_dt_bag,
                              param_grid = pipe_cvec_dt_bag_params,
                              cv = 5)
gs_cvec_dt_bag.fit(X_train, y_train)
gs_cvec_dt_bag_model = gs_cvec_dt_bag.best_estimator_
gs_cvec_dt_bag.best_params_

{'bag__base_estimator__max_depth': 40,
 'bag__base_estimator__min_samples_leaf': 5,
 'bag__base_estimator__min_samples_split': 40,
 'bag__max_features': 0.4,
 'bag__max_samples': 0.5,
 'bag__n_estimators': 200,
 'cvec__min_df': 1,
 'cvec__ngram_range': (1, 2)}

In [138]:
gs_cvec_dt_bag.best_score_

0.9235313520467064

In [139]:
gs_cvec_dt_bag_model.score(X_train, y_train)

0.9343339587242027

In [140]:
gs_cvec_dt_bag_model.score(X_test, y_test)

0.9431693989071038

### Random Forest 

In [153]:
pipe_cvec_rf = Pipeline([("cvec", CountVectorizer()),
                         ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                         ("rf", RandomForestClassifier())])
pipe_cvec_rf_params = {'cvec__min_df': [1,2,3],
                       'cvec__ngram_range': [(1, 1),(1,2)],
                       "rf__n_estimators": [70],
                       "rf__max_depth": [None]}
gs_cvec_rf = GridSearchCV(pipe_cvec_rf,
                          param_grid = pipe_cvec_rf_params,
                          cv = 5)
gs_cvec_rf.fit(X_train, y_train)
gs_cvec_rf_model = gs_cvec_rf.best_estimator_
gs_cvec_rf.best_params_

{'cvec__min_df': 1,
 'cvec__ngram_range': (1, 1),
 'rf__max_depth': None,
 'rf__n_estimators': 70}

In [154]:
gs_cvec_rf.best_score_

0.9268111400644303

In [155]:
gs_cvec_rf_model.score(X_train,y_train)

0.9995309568480301

In [156]:
gs_cvec_rf_model.score(X_test,y_test)

0.9453551912568307

### Extra Trees Classifier

In [161]:
pipe_cvec_et = Pipeline([("cvec", CountVectorizer()),
                         ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                         ("et", ExtraTreesClassifier())])
pipe_cvec_et_params = {'cvec__min_df': [1,2,3],
                       'cvec__ngram_range': [(1, 1),(1,2)],
                       "et__n_estimators": [51],
                       "et__max_depth": [None]}
gs_cvec_et = GridSearchCV(pipe_cvec_et,
                          param_grid = pipe_cvec_et_params,
                          cv = 5)
gs_cvec_et.fit(X_train, y_train)
gs_cvec_et_model = gs_cvec_et.best_estimator_
gs_cvec_et.best_params_

{'cvec__min_df': 1,
 'cvec__ngram_range': (1, 2),
 'et__max_depth': None,
 'et__n_estimators': 51}

In [162]:
gs_cvec_et.best_score_

0.9221196028630801

In [163]:
gs_cvec_et_model.score(X_train,y_train)

0.9995309568480301

In [164]:
gs_cvec_et_model.score(X_test,y_test)

0.9486338797814208

## TFIDF Vectorizer

### Naive Bayes - Multinomial 

In [166]:
pipe_tvec_mnb = Pipeline([("tvec",TfidfVectorizer()),
                          ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                          ("mnb", MultinomialNB())])
pipe_tvec_mnb_params = {'tvec__min_df': [1,2,3],
                        "tvec__ngram_range": [(1,1),(1,2)],
                        }
gs_tvec_mnb = GridSearchCV(pipe_tvec_mnb,
                           param_grid = pipe_tvec_mnb_params,
                           cv = 5)
gs_tvec_mnb.fit(X_train, y_train)
gs_tvec_mnb_model = gs_tvec_mnb.best_estimator_
gs_tvec_mnb.best_params_

{'tvec__min_df': 1, 'tvec__ngram_range': (1, 2)}

In [167]:
gs_tvec_mnb.best_score_

0.8874163010851996

In [168]:
gs_tvec_mnb_model.score(X_train,y_train)

0.9821763602251408

In [169]:
gs_tvec_mnb_model.score(X_test,y_test)

0.8939890710382513

### Naive Bayes - Gaussian

In [171]:
pipe_tvec_gnb = Pipeline([("tvec", TfidfVectorizer()),
                          ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                          ("gnb", GaussianNB())])
pipe_tvec_gnb_params = {'tvec__min_df': [1,2,3],
                        "tvec__ngram_range": [(1,1),(1,2)],
                        }
gs_tvec_gnb = GridSearchCV(pipe_tvec_gnb,
                           param_grid = pipe_tvec_gnb_params,
                           cv = 5)
gs_tvec_gnb.fit(X_train, y_train)
gs_tvec_gnb_model = gs_tvec_gnb.best_estimator_
gs_tvec_gnb.best_params_

{'tvec__min_df': 2, 'tvec__ngram_range': (1, 2)}

In [172]:
gs_tvec_gnb.best_score_

0.8433343228771537

In [173]:
gs_tvec_gnb_model.score(X_train,y_train)

0.9582551594746717

In [174]:
gs_tvec_gnb_model.score(X_test,y_test)

0.8732240437158469

### Naive Bayes - Bernoulli

In [None]:
pipe_tvec_bnb = Pipeline([("tvec", TfidfVectorizer()),
                          ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                          ("bnb", BernoulliNB())])
pipe_tvec_bnb_params = {'tvec__min_df': [1,2,3],
                        "tvec__ngram_range": [(1,1),(1,2)],
                        }
gs_tvec_bnb = GridSearchCV(pipe_tvec_bnb,
                           param_grid = pipe_tvec_bnb_params,
                           cv = 5)
gs_tvec_bnb.fit(X_train, y_train)
gs_tvec_bnb_model = gs_tvec_bnb.best_estimator_
gs_tvec_bnb.best_params_

In [177]:
gs_tvec_bnb.best_score_

0.9169596815867885

In [178]:
gs_tvec_bnb_model.score(X_train,y_train)

0.9460600375234521

In [179]:
gs_tvec_bnb_model.score(X_test,y_test)

0.9245901639344263

### Decision Tree Classifier

In [180]:
pipe_tvec_dt_bag = Pipeline([("tvec", TfidfVectorizer()),
                             ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                             ("bag", BaggingClassifier(base_estimator = DecisionTreeClassifier()))])
pipe_tvec_dt_bag_params = {"bag__base_estimator__max_depth": [40],
                           "bag__base_estimator__min_samples_leaf": [5],
                           "bag__base_estimator__min_samples_split": [40],
                           'tvec__min_df': [1,2,3],
                           'tvec__ngram_range': [(1, 1),(1,2)],
                           "bag__n_estimators": [200],
                           "bag__max_samples": [.5],
                           "bag__max_features": [.4]}
gs_tvec_dt_bag = GridSearchCV(pipe_tvec_dt_bag,
                              param_grid = pipe_tvec_dt_bag_params,
                              cv = 5)
gs_tvec_dt_bag.fit(X_train, y_train)
gs_tvec_dt_bag_model = gs_tvec_dt_bag.best_estimator_
gs_tvec_dt_bag.best_params_

{'bag__base_estimator__max_depth': 40,
 'bag__base_estimator__min_samples_leaf': 5,
 'bag__base_estimator__min_samples_split': 40,
 'bag__max_features': 0.4,
 'bag__max_samples': 0.5,
 'bag__n_estimators': 200,
 'tvec__min_df': 1,
 'tvec__ngram_range': (1, 1)}

In [181]:
gs_tvec_dt_bag.best_score_

0.9244670207034558

In [182]:
gs_tvec_dt_bag_model.score(X_train, y_train)

0.9376172607879925

In [183]:
gs_tvec_dt_bag_model.score(X_test, y_test)

0.9398907103825137

### Random Forest

In [184]:
pipe_tvec_rf = Pipeline([("tvec", TfidfVectorizer()),
                         ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                         ("rf", RandomForestClassifier())])
pipe_tvec_rf_params = {'tvec__min_df': [1,2,3],
                       'tvec__ngram_range': [(1, 1),(1,2)],
                       "rf__n_estimators": [70],
                       "rf__max_depth": [None]}
gs_tvec_rf = GridSearchCV(pipe_tvec_rf,
                          param_grid = pipe_tvec_rf_params,
                          cv = 5)
gs_tvec_rf.fit(X_train, y_train)
gs_tvec_rf_model = gs_tvec_rf.best_estimator_
gs_tvec_rf.best_params_

{'rf__max_depth': None,
 'rf__n_estimators': 70,
 'tvec__min_df': 3,
 'tvec__ngram_range': (1, 2)}

In [196]:
gs_tvec_rf.best_score_

0.9296258424866137

In [186]:
gs_tvec_rf_model.score(X_train, y_train)

0.9985928705440901

In [187]:
gs_tvec_rf_model.score(X_test, y_test)

0.9431693989071038

### Extra Trees Classifier

In [192]:
pipe_tvec_et = Pipeline([("tvec", TfidfVectorizer()),
                         ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                         ("et", ExtraTreesClassifier())])
pipe_tvec_et_params = {'tvec__min_df': [1,2,3],
                       'tvec__ngram_range': [(1, 1),(1,2)],
                       "et__n_estimators": [51],
                       "et__max_depth": [None]}
gs_tvec_et = GridSearchCV(pipe_tvec_et,
                          param_grid = pipe_tvec_et_params,
                          cv = 5)
gs_tvec_et.fit(X_train, y_train)
gs_tvec_et_model = gs_tvec_et.best_estimator_
gs_tvec_et.best_params_

{'et__max_depth': None,
 'et__n_estimators': 51,
 'tvec__min_df': 2,
 'tvec__ngram_range': (1, 1)}

In [193]:
gs_tvec_et.best_score_

0.9193114973996988

In [194]:
gs_tvec_et_model.score(X_train, y_train)

0.9985928705440901

In [195]:
gs_tvec_et_model.score(X_test, y_test)

0.9377049180327869

### Score table

In [197]:
scores=pd.DataFrame()

In [199]:
scores['Model Type']=['CountVectorization-LR','CountVectorization-MNB','CountVectorization-BNB','CountVectorization-GNB','CountVectorization-DecisionTree,Bag','CountVectorization-RandomForest','CountVectorization-ExtraTrees','TFIDFVectorization-LR','TFIDFVectorization-MNB','TFIDFVectorization-BNB','TFIDFVectorization-GNB','TFIDFVectorization-DecisionTree,Bag','TFIDFVectorization-RandomForest','TFIDFVectorization-ExtraTrees']
scores['Train Accuracy']=[gs_cvec_lr_model.score(X_train, y_train),gs_cvec_mnb_model.score(X_train,y_train),gs_cvec_gnb_model.score(X_train,y_train),gs_cvec_bnb_model.score(X_train,y_train),gs_cvec_dt_bag_model.score(X_train, y_train),gs_cvec_rf_model.score(X_train,y_train),gs_cvec_et_model.score(X_train,y_train),gs_tvec_lr_model.score(X_train, y_train),gs_tvec_mnb_model.score(X_train,y_train),gs_tvec_gnb_model.score(X_train,y_train),gs_tvec_bnb_model.score(X_train,y_train),gs_tvec_dt_bag_model.score(X_train, y_train),gs_tvec_rf_model.score(X_train,y_train),gs_tvec_et_model.score(X_train,y_train)]
scores['Test Accuracy']=[gs_cvec_lr_model.score(X_test, y_test),gs_cvec_mnb_model.score(X_test, y_test),gs_cvec_gnb_model.score(X_test, y_test),gs_cvec_bnb_model.score(X_test, y_test),gs_cvec_dt_bag_model.score(X_test, y_test),gs_cvec_rf_model.score(X_test, y_test),gs_cvec_et_model.score(X_test, y_test),gs_tvec_lr_model.score(X_test, y_test),gs_tvec_mnb_model.score(X_test, y_test),gs_tvec_gnb_model.score(X_test, y_test),gs_tvec_bnb_model.score(X_test, y_test),gs_tvec_dt_bag_model.score(X_test, y_test),gs_tvec_rf_model.score(X_test,y_test),gs_tvec_et_model.score(X_test,y_test)]
scores['Accuracy Difference']=scores['Train Accuracy']-scores['Test Accuracy']

In [201]:
scores.sort_values(by='Accuracy Difference',ascending=True)

Unnamed: 0,Model Type,Train Accuracy,Test Accuracy,Accuracy Difference
4,"CountVectorization-DecisionTree,Bag",0.934334,0.943169,-0.008835
11,"TFIDFVectorization-DecisionTree,Bag",0.937617,0.939891,-0.002273
3,CountVectorization-GNB,0.94606,0.92459,0.02147
10,TFIDFVectorization-GNB,0.94606,0.92459,0.02147
6,CountVectorization-ExtraTrees,0.999531,0.948634,0.050897
5,CountVectorization-RandomForest,0.999531,0.945355,0.054176
12,TFIDFVectorization-RandomForest,0.998593,0.943169,0.055423
13,TFIDFVectorization-ExtraTrees,0.998593,0.937705,0.060888
1,CountVectorization-MNB,0.977017,0.911475,0.065541
0,CountVectorization-LR,0.999531,0.931148,0.068383


### Save the best model
From the score table above and the scores from Modeling_01.ipynb, Bagged Decision Tree using CountVectorizer is the best model. Here we will create the model using best parameters and save it for an app mapping wildfires.

In [11]:
cvec = CountVectorizer(min_df=1, ngram_range=(1, 2))
X_train_cvec = cvec.fit_transform(X_train)
X_test_cvec = cvec.transform(X_test)

bc = BaggingClassifier(
    n_estimators=200,
    max_features=.4,
    max_samples=.5,
    base_estimator = DecisionTreeClassifier(
        max_depth=40,
        min_samples_leaf=5,
        min_samples_split=40
    )
    
)

In [12]:
bc.fit(X_train_cvec, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None,
                                                        criterion='gini',
                                                        max_depth=40,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=5,
                                                        min_samples_split=40,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort=False,
                                                        random_state=None,
                                                        splitter='best'),
     

In [14]:
bc.score(X_train_cvec, y_train)

0.9310506566604128

In [16]:
bc.score(X_test_cvec, y_test)

0.9387978142076503

In [17]:
with open('../model/best_model.pickle', 'wb') as f:
    pickle.dump(bc, f)

In [18]:
with open('../model/vectorizer.pickle', 'wb') as f:
    pickle.dump(cvec, f)