### Loading data and libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model, preprocessing, model_selection,feature_extraction

In [2]:
from sklearn.model_selection import RandomizedSearchCV

In [3]:
import matplotlib.pyplot as plt

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
from nlp_utils.model import train_model

In [6]:
data_folder = 'data/'

In [7]:
train_data = pd.read_csv(data_folder+'train.csv'); print(train_data.shape)
test_data = pd.read_csv(data_folder+'test.csv'); print(test_data.shape)

(7613, 5)
(3263, 4)


In [8]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [9]:
train_data[train_data['target'] == 1].sample()['text'].values[0]

'Gov. Brown links CA wildfire to drought http://t.co/jEvrCWUdpm'

In [10]:
# train_data.location.value_counts()

In [11]:
sample_submission = pd.read_csv(data_folder+'sample_submission.csv')
# sample_submission.head()

## Building Features

### TF-IDF

In [160]:
TF_IDF_FEATURES = 5000

In [161]:
tfidf_vectorizer = feature_extraction.text.TfidfVectorizer(analyzer = 'word', 
                                                           token_pattern = '(?u)\\b\\w+\\b', 
                                                           max_features=TF_IDF_FEATURES,
                                                          stop_words = 'english')

In [162]:
# tfidf_vectorizer?

In [163]:
tfidf_vectorizer.fit(train_data['text'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [164]:
tfidf_train_vectors = tfidf_vectorizer.transform(train_data['text'])
tfidf_test_vectors = tfidf_vectorizer.transform(test_data['text'])

In [165]:
# tfidf_vectorizer.vocabulary_

In [166]:
count_train_x, count_valid_x, count_train_y, count_valid_y = train_test_split(tfidf_train_vectors, train_data['target'], 
                                                                              test_size = 0.15, random_state = 44)

### RidgeClassifier

In [167]:

# clf = linear_model.RidgeClassifierCV(alphas=(0.1, 1.0, 10.0), fit_intercept=True, normalize=False, 
#                                      scoring='f1', cv=3, class_weight='balanced', store_cv_values=False)

In [168]:
# linear_model.RidgeClassifier?

In [169]:
clf = linear_model.RidgeClassifier(class_weight='balanced', random_state=42, alpha=5)

In [170]:
# clf.fit(count_train_x, count_train_y)

In [171]:
## cross validating
scores = model_selection.cross_val_score(clf, count_train_x, count_train_y, scoring='f1', cv=3)

In [172]:
scores

array([0.73619632, 0.73985134, 0.75558036])

In [173]:
train_model(clf, count_train_x, count_train_y, count_valid_x, count_valid_y, tfidf_test_vectors
#             , 
#             submissions_data=sample_submission, submissions_file_prefix="tfidf_ridge_submissions" 
           )

Classification report : 

              precision    recall  f1-score   support

           1       0.80      0.75      0.77       496
           0       0.82      0.86      0.84       646

   micro avg       0.81      0.81      0.81      1142
   macro avg       0.81      0.80      0.81      1142
weighted avg       0.81      0.81      0.81      1142



### Logistic Regression

In [181]:
np.arange(2.5,3.5,0.1)

array([2.5, 2.6, 2.7, 2.8, 2.9, 3. , 3.1, 3.2, 3.3, 3.4])

In [197]:
clf_2 = linear_model.LogisticRegressionCV(random_state=42, scoring = 'f1', class_weight='balanced', cv = 3, max_iter=1000, Cs=np.arange(1.5,2.3,0.1))

In [198]:
clf_2.fit(count_train_x, count_train_y)

LogisticRegressionCV(Cs=array([1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2]),
           class_weight='balanced', cv=3, dual=False, fit_intercept=True,
           intercept_scaling=1.0, max_iter=1000, multi_class='warn',
           n_jobs=None, penalty='l2', random_state=42, refit=True,
           scoring='f1', solver='lbfgs', tol=0.0001, verbose=0)

In [199]:
clf_2.scores_

{1: array([[0.73521282, 0.73480663, 0.7359116 , 0.73620309, 0.73521282,
         0.73311185, 0.73322241, 0.73200443],
        [0.74321267, 0.74363328, 0.74208145, 0.74392312, 0.74350282,
         0.74266366, 0.743083  , 0.74279254],
        [0.75666667, 0.75693674, 0.75663717, 0.7558011 , 0.75621891,
         0.75538377, 0.75290216, 0.75290216]])}

In [200]:
clf_2.Cs_

array([1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2])

In [201]:
clf_2.C_

array([1.8])

In [202]:
logistic_model = linear_model.LogisticRegression(C = 1.8, random_state=42, max_iter=500, class_weight = 'balanced')

In [203]:
train_model(logistic_model, count_train_x, count_train_y, count_valid_x, count_valid_y, tfidf_test_vectors, 
            submissions_data=sample_submission, submissions_file_prefix="tfidf_logistic_submissions"  )

Classification report : 

              precision    recall  f1-score   support

           1       0.80      0.75      0.78       496
           0       0.82      0.86      0.84       646

   micro avg       0.81      0.81      0.81      1142
   macro avg       0.81      0.81      0.81      1142
weighted avg       0.81      0.81      0.81      1142

Exporting data to: 

	 data/tfidf_logistic_submissions_20210212162936.csv




### RF

In [204]:
from sklearn.ensemble import RandomForestClassifier

In [216]:
# RandomForestClassifier?

In [234]:
rf_clf = RandomForestClassifier(1000, class_weight='balanced', oob_score=True, min_samples_split=4, max_depth=250)

In [235]:
train_model(rf_clf, count_train_x, count_train_y, count_valid_x, count_valid_y, tfidf_test_vectors, 
            submissions_data=sample_submission, submissions_file_prefix="tfidf_rf_submissions"  )

Classification report : 

              precision    recall  f1-score   support

           1       0.81      0.67      0.73       496
           0       0.77      0.88      0.82       646

   micro avg       0.79      0.79      0.79      1142
   macro avg       0.79      0.77      0.78      1142
weighted avg       0.79      0.79      0.78      1142

Exporting data to: 

	 data/tfidf_rf_submissions_20210212164021.csv


### xgb

In [236]:
import xgboost as xgb

In [237]:
from xgboost.sklearn import XGBClassifier

In [266]:
param_test1 = {
#     'max_depth':range(5,15,2),
#     'min_child_weight':range(1,6,2),
#     'learning_rate':[0.001,0.01,0.1],
    'n_estimators':[1000,1500,2000],
#     'gamma':[i/10.0 for i in range(0,5)]
}

In [267]:
param_test1

{'n_estimators': [1000, 1500, 2000]}

In [268]:
train_data['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [269]:
# RandomizedSearchCV?

In [270]:
xgb_clf = XGBClassifier(njobs = -1, max_depth=5, scale_pos_weight=4342/3271, learning_rate=0.1, gamma = 0.0 )

In [271]:
xgb_rsearch = RandomizedSearchCV(xgb_clf, param_distributions=param_test1, n_iter=4, scoring="f1")

In [272]:
xgb_rsearch.fit(X=count_train_x,y=count_train_y)



RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0.0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, njobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1.3274228064811984, seed=None, silent=True,
       subsample=1),
          fit_params=None, iid='warn', n_iter=4, n_jobs=None,
          param_distributions={'n_estimators': [1000, 1500, 2000]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='f1', verbose=0)

In [273]:
xgb_rsearch.best_params_

{'n_estimators': 1000}

In [275]:
xgb_rsearch.cv_results_



{'mean_fit_time': array([26.7244393 , 47.23971923, 56.25947038]),
 'std_fit_time': array([0.74672607, 1.42542947, 0.76991571]),
 'mean_score_time': array([0.06328766, 0.09175078, 0.10331941]),
 'std_score_time': array([0.00818385, 0.00168204, 0.00084929]),
 'param_n_estimators': masked_array(data=[1000, 1500, 2000],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_estimators': 1000},
  {'n_estimators': 1500},
  {'n_estimators': 2000}],
 'split0_test_score': array([0.71938495, 0.71924986, 0.71428571]),
 'split1_test_score': array([0.72706553, 0.72675737, 0.72376631]),
 'split2_test_score': array([0.71991126, 0.71650055, 0.71246537]),
 'mean_test_score': array([0.72212058, 0.72083593, 0.71683913]),
 'std_test_score': array([0.0035032 , 0.00433492, 0.00495431]),
 'rank_test_score': array([1, 2, 3], dtype=int32),
 'split0_train_score': array([0.92030991, 0.9414688 , 0.95699808]),
 'split1_train_score': array([0.92350799, 0.947163

In [276]:
xgb_rsearch.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0.0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, njobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1.3274228064811984, seed=None, silent=True,
       subsample=1)

In [277]:
xgb_model = XGBClassifier(n_estimators=1000, max_depth = 5, learning_rate = 0.1, gamma=0 )

In [278]:
train_model(xgb_model, count_train_x, count_train_y, count_valid_x, count_valid_y, tfidf_test_vectors, 
            submissions_data=sample_submission, submissions_file_prefix="tfidf_xgb_submissions"  )

Classification report : 

              precision    recall  f1-score   support

           1       0.82      0.67      0.74       496
           0       0.78      0.88      0.83       646

   micro avg       0.79      0.79      0.79      1142
   macro avg       0.80      0.78      0.78      1142
weighted avg       0.80      0.79      0.79      1142

Exporting data to: 

	 data/tfidf_xgb_submissions_20210212173828.csv
