### Loading data and libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model, preprocessing, model_selection,feature_extraction

In [2]:
from sklearn.model_selection import RandomizedSearchCV

In [3]:
import matplotlib.pyplot as plt

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
from nlp_utils.model import train_model

In [6]:
data_folder = 'data/'

In [7]:
## using the cleaned files
train_data = pd.read_csv(data_folder+'train_clean.csv'); print(train_data.shape)
test_data = pd.read_csv(data_folder+'test_clean.csv'); print(test_data.shape)

(7613, 6)
(3263, 5)


In [8]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target,text_clean
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake may allah forgive u
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,resident asked shelter place notified officer ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfire evacuation order cali...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby alaska smoke wildfire pour...


In [9]:
train_data[train_data['target'] == 1].sample()['text'].values[0]

'.@unsuckdcmetro Is the train half-derailed or half-railed? #deepthoughts'

In [10]:
# train_data.location.value_counts()

In [11]:
sample_submission = pd.read_csv(data_folder+'sample_submission.csv')
# sample_submission.head()

## Building Features

### count vectors

In [12]:
features = None

In [13]:
count_vectorizer = feature_extraction.text.CountVectorizer(max_features=features)

In [14]:
# count_vectorizer.get_feature_names()

In [15]:
train_vectors = count_vectorizer.fit_transform(train_data['text_clean'])
test_vectors = count_vectorizer.transform(test_data['text_clean'])

In [16]:
train_vectors.shape

(7613, 15670)

In [17]:
count_train_x, count_valid_x, count_train_y, count_valid_y = train_test_split(train_vectors, train_data['target'], 
                                                                              test_size = 0.15, random_state = 44)

### RidgeClassifier

In [146]:
# cv_clf = linear_model.RidgeClassifierCV?

In [147]:
cv_clf = linear_model.RidgeClassifierCV

In [148]:
clf = linear_model.RidgeClassifier()

In [149]:
## cross validating
scores = model_selection.cross_val_score(clf, count_train_x, count_train_y, scoring='f1', cv=3)

In [150]:
scores

array([0.70233352, 0.7090379 , 0.72380952])

In [151]:
train_model(clf, count_train_x, count_train_y, count_valid_x, count_valid_y, test_vectors,
#             submissions_data=sample_submission, submissions_file_prefix="ridge_submissions"
           )

Classification report : 

              precision    recall  f1-score   support

           1       0.79      0.69      0.73       496
           0       0.78      0.86      0.82       646

   micro avg       0.78      0.78      0.78      1142
   macro avg       0.78      0.77      0.78      1142
weighted avg       0.78      0.78      0.78      1142



### Logistic Regression

In [152]:
# linear_model.LogisticRegressionCV?

In [153]:
clf_2 = linear_model.LogisticRegressionCV(Cs=np.arange(0.05,0.5, 0.05),
                                          random_state=42, scoring = 'f1', class_weight='balanced', cv = 3, max_iter=300)

In [154]:
clf_2.fit(train_vectors, train_data['target'])

LogisticRegressionCV(Cs=array([0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45]),
           class_weight='balanced', cv=3, dual=False, fit_intercept=True,
           intercept_scaling=1.0, max_iter=300, multi_class='warn',
           n_jobs=None, penalty='l2', random_state=42, refit=True,
           scoring='f1', solver='lbfgs', tol=0.0001, verbose=0)

In [155]:
clf_2.scores_

{1: array([[0.64793388, 0.65217391, 0.65332612, 0.6537007 , 0.65301724,
         0.6483871 , 0.64775161, 0.64343164, 0.64239829],
        [0.58854719, 0.5947068 , 0.5998978 , 0.60131379, 0.59899497,
         0.59680639, 0.59432554, 0.59534423, 0.59545005],
        [0.6724846 , 0.67414584, 0.67882472, 0.67806841, 0.67270896,
         0.67566217, 0.67630923, 0.67363184, 0.67293419]])}

In [156]:
clf_2.Cs_

array([0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45])

In [157]:
clf_2.C_

array([0.2])

In [158]:
# linear_model.LogisticRegression?

In [188]:
logistic_model = linear_model.LogisticRegression(C = 0.2,random_state=42, max_iter=500, class_weight = 'balanced'
                                                )

In [189]:
train_model(logistic_model, 
#             train_vectors, train_data['target'],train_vectors, train_data['target'],
            count_train_x, count_train_y, count_valid_x, count_valid_y,
#             test_vectors, submissions_data=sample_submission, submissions_file_prefix="clean_logistic_submissions"  
           )

Classification report : 

              precision    recall  f1-score   support

           1       0.81      0.77      0.79       496
           0       0.83      0.86      0.84       646

   micro avg       0.82      0.82      0.82      1142
   macro avg       0.82      0.81      0.81      1142
weighted avg       0.82      0.82      0.82      1142





### RF

In [161]:
from sklearn.ensemble import RandomForestClassifier

In [162]:
# RandomForestClassifier?

In [163]:
rf_clf = RandomForestClassifier(300, class_weight='balanced', oob_score=True,min_samples_split = 3)

In [164]:
train_model(rf_clf, count_train_x, count_train_y, count_valid_x, count_valid_y, 
#             test_vectors, 
#             submissions_data=sample_submission, submissions_file_prefix="rf_submissions"  
           )

Classification report : 

              precision    recall  f1-score   support

           1       0.82      0.69      0.75       496
           0       0.79      0.88      0.83       646

   micro avg       0.80      0.80      0.80      1142
   macro avg       0.80      0.78      0.79      1142
weighted avg       0.80      0.80      0.79      1142



### xgb

In [165]:
import xgboost as xgb

In [166]:
from xgboost.sklearn import XGBClassifier

In [167]:
param_test1 = {
    'max_depth':range(3,10,2),
#     'min_child_weight':range(1,6,2),
    'learning_rate':[0.001,0.01,0.1],
    'n_estimators':[100,250,500],
    'gamma':[i/10.0 for i in range(0,5)]
}

In [168]:
# xgb.train?

In [169]:
train_data['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [170]:
# RandomizedSearchCV?

In [171]:
xgb_clf = XGBClassifier(njobs = -1,scale_pos_weight=4342/3271)

In [172]:
xgb_rsearch = RandomizedSearchCV(xgb_clf, param_distributions=param_test1, n_iter=20, scoring="f1")

In [173]:
xgb_rsearch.fit(X=count_train_x,y=count_train_y)



RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, njobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1.3274228064811984, seed=None, silent=True,
       subsample=1),
          fit_params=None, iid='warn', n_iter=20, n_jobs=None,
          param_distributions={'max_depth': range(3, 10, 2), 'learning_rate': [0.001, 0.01, 0.1], 'n_estimators': [100, 250, 500], 'gamma': [0.0, 0.1, 0.2, 0.3, 0.4]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='f1', verbose=0)

In [174]:
xgb_params = xgb_rsearch.best_params_

In [175]:
xgb_params

{'n_estimators': 500, 'max_depth': 9, 'learning_rate': 0.1, 'gamma': 0.1}

In [184]:
xgb_model = XGBClassifier(n_estimators=500, max_depth=9, learning_rate=0.1, gamma=0.1, njobs = -1, scale_pos_weight=4342/3271)

In [185]:
train_model(xgb_model, 
#             train_vectors, train_data['target'],train_vectors, train_data['target'],
            count_train_x, count_train_y, count_valid_x, count_valid_y,
#             test_vectors, submissions_data=sample_submission, submissions_file_prefix="clean_xgb_submissions"  
           )

Classification report : 

              precision    recall  f1-score   support

           1       0.81      0.74      0.77       496
           0       0.81      0.87      0.84       646

   micro avg       0.81      0.81      0.81      1142
   macro avg       0.81      0.80      0.81      1142
weighted avg       0.81      0.81      0.81      1142



### Neural Network

In [18]:
import tensorflow.keras as keras

In [19]:
from tensorflow.keras import layers, optimizers, models

In [47]:
neg = count_train_y.value_counts()[0]
pos = count_train_y.value_counts()[1]
initial_bias = np.log([pos/neg])
print("initial_bias: {}".format(initial_bias))

initial_bias: [-0.28660041]


In [20]:
# optimizers.Adam?

In [50]:
# layers.Dense?

In [99]:
def create_nn(input_size, output_bias = None):
    if output_bias is not None:
        output_bias = keras.initializers.Constant(output_bias)
    
    input_layer = layers.Input((input_size,))
    hidden_layer = layers.Dense(512, activation = 'sigmoid')(input_layer)
    hidden_layer = layers.Dropout(0.35)(hidden_layer)
    hidden_layer = layers.Dense(256, activation = 'sigmoid')(hidden_layer)
    hidden_layer = layers.Dropout(0.35)(hidden_layer)
    hidden_layer = layers.Dense(128, activation = 'sigmoid')(hidden_layer)
    hidden_layer = layers.Dropout(0.35)(hidden_layer)
    hidden_layer = layers.Dense(64, activation = 'sigmoid')(hidden_layer)
    hidden_layer = layers.Dropout(0.35)(hidden_layer)
    hidden_layer = layers.Dense(32, activation = 'sigmoid')(hidden_layer)
    hidden_layer = layers.Dropout(0.35)(hidden_layer)
    hidden_layer = layers.Dense(16, activation = 'sigmoid')(hidden_layer)
#     hidden_layer = layers.Dropout(0.25)(hidden_layer)
    output_layer = layers.Dense(1, activation = 'sigmoid',bias_initializer=output_bias )(hidden_layer)
    
    classifier = models.Model(inputs = input_layer, outputs = output_layer)
    classifier.compile(optimizer=optimizers.Adam(0.0005), loss = 'binary_crossentropy')
    
    return classifier

In [100]:
classifier = create_nn(train_vectors.shape[1], output_bias=initial_bias)

In [101]:
classifier.summary()

Model: "model_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_18 (InputLayer)        [(None, 15670)]           0         
_________________________________________________________________
dense_86 (Dense)             (None, 512)               8023552   
_________________________________________________________________
dropout_56 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_87 (Dense)             (None, 256)               131328    
_________________________________________________________________
dropout_57 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_88 (Dense)             (None, 128)               32896     
_________________________________________________________________
dropout_58 (Dropout)         (None, 128)               0  

In [102]:
train_model(classifier,
#             train_vectors.toarray(), train_data['target'],train_vectors.toarray(), train_data['target'],
            count_train_x.toarray(), count_train_y, count_valid_x.toarray(), count_valid_y,
            test_vectors = test_vectors, neural_network = True, epochs = 50,
#                 submissions_data = sample_submission, submissions_file_prefix="clean_nn_submissions" 
           )

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Classification report : 

              precision    recall  f1-score   support

           1       0.78      0.76      0.77       496
           0       0.82      0.84      0.83       646

   micro avg       0.80      0.80      0.80      1142
   macro avg       0.80      0.80      0.80      1142
weighted avg       0.80      0.80      0.80      1142



In [59]:
# classifier.fit?

In [72]:
from tensorflow.keras.callbacks import EarlyStopping

callback = EarlyStopping(monitor='val_loss', patience=5)

In [73]:
classifier.fit(count_train_x.toarray(), count_train_y,epochs=50, validation_data=(count_valid_x.toarray(), count_valid_y), callbacks=callback)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50


<tensorflow.python.keras.callbacks.History at 0x7fcb05362510>

In [47]:
classifier.predict(count_valid_x.toarray())

array([[0.04936668],
       [0.956838  ],
       [0.03449565],
       ...,
       [0.04843271],
       [0.05063567],
       [0.98669565]], dtype=float32)

In [48]:
classifier.predict(count_valid_x.toarray()).argmax(axis = -1)

array([0, 0, 0, ..., 0, 0, 0])

In [52]:
np.where(classifier.predict(count_valid_x.toarray())>0.2,1,0).sum()#.argmax(axis=-1).sum()

526

In [88]:
# classifier.predict?