### Loading data and libraries

In [105]:
import pandas as pd
import numpy as np
from sklearn import linear_model, preprocessing, model_selection,feature_extraction

In [106]:
from sklearn.model_selection import RandomizedSearchCV

In [107]:
import matplotlib.pyplot as plt

In [108]:
from sklearn.model_selection import train_test_split

In [109]:
from nlp_utils.model import train_model

In [110]:
data_folder = 'data/'

In [111]:
train_data = pd.read_csv(data_folder+'train.csv'); print(train_data.shape)
test_data = pd.read_csv(data_folder+'test.csv'); print(test_data.shape)

(7613, 5)
(3263, 4)


In [112]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [113]:
train_data[train_data['target'] == 1].sample()['text'].values[0]

':StarMade: :Stardate 3: :Planetary Annihilation:: http://t.co/I2hHvIUmTm via @YouTube'

In [114]:
# train_data.location.value_counts()

In [115]:
sample_submission = pd.read_csv(data_folder+'sample_submission.csv')
# sample_submission.head()

## Building Features

### count vectors

In [13]:
count_vectorizer = feature_extraction.text.CountVectorizer()

In [14]:
# count_vectorizer.get_feature_names()

In [15]:
train_vectors = count_vectorizer.fit_transform(train_data['text'])
test_vectors = count_vectorizer.transform(test_data['text'])

In [16]:
train_vectors.shape

(7613, 21637)

In [17]:
count_train_x, count_valid_x, count_train_y, count_valid_y = train_test_split(train_vectors, train_data['target'], 
                                                                              test_size = 0.15, random_state = 44)

### RidgeClassifier

In [21]:
clf = linear_model.RidgeClassifier()

In [19]:
## cross validating
scores = model_selection.cross_val_score(clf, count_train_x, count_train_y, scoring='f1', cv=3)

In [20]:
scores

array([0.71242201, 0.71147161, 0.71655329])

In [30]:
train_model(clf, count_train_x, count_train_y, count_valid_x, count_valid_y, test_vectors, submissions_data=sample_submission, submissions_file_prefix="ridge_submissions"  )

Classification report : 

              precision    recall  f1-score   support

           1       0.80      0.71      0.75       496
           0       0.80      0.87      0.83       646

   micro avg       0.80      0.80      0.80      1142
   macro avg       0.80      0.79      0.79      1142
weighted avg       0.80      0.80      0.80      1142

Exporting data to: 

	 nlp-getting-started/ridge_submissions_20210208125514.csv


### Logistic Regression

In [2]:
clf_2 = linear_model.LogisticRegressionCV(Cs=np.arange(0.2,0.4, 0.01), random_state=42, scoring = 'f1', class_weight='balanced', cv = 3, max_iter=300)

NameError: name 'linear_model' is not defined

In [1]:
clf_2.fit(count_train_x, count_train_y)

NameError: name 'clf_2' is not defined

In [119]:
clf_2.scores_

{1: array([[0.74653356, 0.74611973, 0.74709463, 0.74611973, 0.74529347,
         0.74488102, 0.74516307, 0.74516307, 0.74254144, 0.7426674 ,
         0.7434903 , 0.74390244, 0.74459834, 0.74459834, 0.74279379,
         0.74279379, 0.74238227, 0.7401662 , 0.73987798, 0.73847862],
        [0.739819  , 0.73839185, 0.74036281, 0.74036281, 0.74078276,
         0.74078276, 0.7414966 , 0.74191719, 0.74204545, 0.7413303 ,
         0.7413303 , 0.74103586, 0.74145786, 0.74188034, 0.74145786,
         0.74145786, 0.74116306, 0.74116306, 0.74158585, 0.74158585],
        [0.74680022, 0.7458194 , 0.74678591, 0.74650252, 0.74468085,
         0.74608501, 0.74566797, 0.74706868, 0.74651811, 0.74680022,
         0.74638487, 0.74568726, 0.74568726, 0.74680022, 0.74638487,
         0.74485825, 0.74485825, 0.74457429, 0.74485825, 0.74527253]])}

In [120]:
clf_2.Cs_

array([0.2 , 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 ,
       0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39])

In [121]:
logistic_model = linear_model.LogisticRegression(C = 0.3,random_state=42, max_iter=500, class_weight = 'balanced')

In [122]:
train_model(logistic_model, count_train_x, count_train_y, count_valid_x, count_valid_y, test_vectors, 
            submissions_data=sample_submission, submissions_file_prefix="logistic_submissions"  )

Classification report : 

              precision    recall  f1-score   support

           1       0.80      0.76      0.78       496
           0       0.82      0.85      0.84       646

   micro avg       0.81      0.81      0.81      1142
   macro avg       0.81      0.81      0.81      1142
weighted avg       0.81      0.81      0.81      1142

Exporting data to: 

	 data/logistic_submissions_20210210155510.csv




### RF

In [50]:
from sklearn.ensemble import RandomForestClassifier

In [57]:
# RandomForestClassifier?

In [64]:
rf_clf = RandomForestClassifier(300, class_weight='balanced', oob_score=True,min_samples_split = 3)

In [65]:
train_model(rf_clf, count_train_x, count_train_y, count_valid_x, count_valid_y, test_vectors, 
            submissions_data=sample_submission, submissions_file_prefix="rf_submissions"  )

Classification report : 

              precision    recall  f1-score   support

           1       0.86      0.64      0.74       496
           0       0.77      0.92      0.84       646

   micro avg       0.80      0.80      0.80      1142
   macro avg       0.82      0.78      0.79      1142
weighted avg       0.81      0.80      0.79      1142

Exporting data to: 

	 nlp-getting-started/rf_submissions_20210210103200.csv


### xgb

In [66]:
import xgboost as xgb

In [68]:
from xgboost.sklearn import XGBClassifier

In [74]:
param_test1 = {
    'max_depth':range(3,10,2),
#     'min_child_weight':range(1,6,2),
    'learning_rate':[0.001,0.01,0.1],
    'n_estimators':[100,250,500],
    'gamma':[i/10.0 for i in range(0,5)]
}

In [84]:
# xgb.train?

In [88]:
train_data['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [97]:
# RandomizedSearchCV?

In [98]:
xgb_clf = XGBClassifier(njobs = -1,scale_pos_weight=4342/3271)

In [94]:
xgb_rsearch = RandomizedSearchCV(xgb_clf, param_distributions=param_test1, n_iter=20, scoring="f1")

In [95]:
xgb_rsearch.fit(X=count_train_x,y=count_train_y)



RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, njobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1.3274228064811984, seed=None, silent=True,
       subsample=1),
          fit_params=None, iid='warn', n_iter=20, n_jobs=None,
          param_distributions={'max_depth': range(3, 10, 2), 'learning_rate': [0.001, 0.01, 0.1], 'n_estimators': [100, 250, 500], 'gamma': [0.0, 0.1, 0.2, 0.3, 0.4]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='f1', verbose=0)

In [101]:
xgb_params = xgb_rsearch.best_params_

In [102]:
xgb_params

{'n_estimators': 500, 'max_depth': 7, 'learning_rate': 0.1, 'gamma': 0.0}

In [103]:
xgb_model = XGBClassifier(n_estimators=500, max_depth=7, learning_rate=0.1, gamma=0.0, njobs = -1, scale_pos_weight=4342/3271)

In [116]:
train_model(xgb_model, count_train_x, count_train_y, count_valid_x, count_valid_y, test_vectors, 
            submissions_data=sample_submission, submissions_file_prefix="xgb_submissions"  )

Classification report : 

              precision    recall  f1-score   support

           1       0.82      0.77      0.79       496
           0       0.83      0.87      0.85       646

   micro avg       0.83      0.83      0.83      1142
   macro avg       0.83      0.82      0.82      1142
weighted avg       0.83      0.83      0.83      1142

Exporting data to: 

	 data/xgb_submissions_20210210153427.csv
