In [58]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,roc_auc_score,f1_score

In [2]:
train= pd.read_csv('./nlp-getting-started/train.csv')
test=pd.read_csv('./nlp-getting-started/test.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [14]:
print(train.shape)
train.isnull().sum()
train.columns
for col in ['keyword', 'location','target']:
    print("{} column nunique {}".format(col,train[col].nunique()))
    print("{} column val counts {}".format(col,(train[col].value_counts(normalize=True,dropna=False)*100).head(7).to_dict()))

(7613, 5)
keyword column nunique 221
keyword column val counts {nan: 0.8012610009194797, 'fatalities': 0.5910941810061737, 'deluge': 0.5516879022724287, 'armageddon': 0.5516879022724287, 'body%20bags': 0.5385524760278472, 'harm': 0.5385524760278472, 'sinking': 0.5385524760278472}
location column nunique 3341
location column val counts {nan: 33.27203467752528, 'USA': 1.3660843294364904, 'New York': 0.9326152633652962, 'United States': 0.6567713122290818, 'London': 0.5910941810061737, 'Canada': 0.38092736109286746, 'Nigeria': 0.3677919348482858}
target column nunique 2
target column val counts {0: 57.03402075397347, 1: 42.96597924602653}


In [17]:
xtrain, xvalid, ytrain, yvalid = train_test_split(train.text.values, train.target.values, 
                                                  stratify=train.target.values, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)
xtest=test.text.values

print(xtrain.shape, xvalid.shape, ytrain.shape, yvalid.shape,xtest.shape)

(6851,) (762,) (6851,) (762,) (3263,)


In [18]:
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')


tfv.fit(list(xtrain) + list(xvalid)+ list(xtest))
xtrain_tfv =  tfv.transform(xtrain) 
xvalid_tfv = tfv.transform(xvalid)
xtest_tfv=tfv.transform(xtest)

In [39]:
for clf in [LogisticRegression(C=1.0),RandomForestClassifier()]:
# clf = LogisticRegression(C=1.0)
    clf.fit(xtrain_tfv, ytrain)
    predictions = clf.predict_proba(xvalid_tfv)
    # print(classification_report(predictions,yvalid))
    score=roc_auc_score(yvalid,predictions[:,1])
    print("{} score: {}".format(clf.__class__.__name__,score))



LogisticRegression score: 0.8700411262258779
RandomForestClassifier score: 0.8183556539772928


In [67]:
clf=LogisticRegression(C=1.0)
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)
score=roc_auc_score(yvalid,predictions[:,1])
print("{} score: {}".format(clf.__class__.__name__,score))
# test["predict_lr_tfidf1"]=
thresholds=np.arange(0.1,1,0.05)
for threshold in thresholds:
    temp=predictions[:,1].copy()
    temp[temp>=threshold]=1
    temp[temp<threshold]=0
    score=f1_score(yvalid,temp)
    print("threshold {} score: {}".format(threshold,score))
    

LogisticRegression score: 0.8700411262258779
threshold 0.1 score: 0.6038781163434903
threshold 0.15000000000000002 score: 0.6143667296786389
threshold 0.20000000000000004 score: 0.6510204081632653
threshold 0.25000000000000006 score: 0.6814159292035399
threshold 0.30000000000000004 score: 0.7139423076923076
threshold 0.3500000000000001 score: 0.75
threshold 0.40000000000000013 score: 0.7576197387518142
threshold 0.45000000000000007 score: 0.7774294670846396
threshold 0.5000000000000001 score: 0.7529411764705881
threshold 0.5500000000000002 score: 0.7289048473967685
threshold 0.6000000000000002 score: 0.6994328922495274
threshold 0.6500000000000001 score: 0.6345381526104418
threshold 0.7000000000000002 score: 0.5219298245614036
threshold 0.7500000000000002 score: 0.43294117647058816
threshold 0.8000000000000002 score: 0.31632653061224486
threshold 0.8500000000000002 score: 0.20821917808219179
threshold 0.9000000000000002 score: 0.08211143695014662
threshold 0.9500000000000003 score: 0.0

  'precision', 'predicted', average, warn_for)


In [75]:
threshold=0.5
testpred=clf.predict_proba(xtest_tfv)
testpred=testpred[:,1].copy()
testpred[testpred>=threshold]=1
testpred[testpred<threshold]=0
test["pred1_tfidf_lr"]=testpred
test["pred1_tfidf_lr"]=test["pred1_tfidf_lr"].astype(int)
test[["id","pred1_tfidf_lr"]].rename(columns={"pred1_tfidf_lr":"target"}).to_csv("./submissions/submission_1.csv",index=None)
test.head()

Unnamed: 0,id,keyword,location,text,pred1_tfidf_lr
0,0,,,Just happened a terrible car crash,1
1,2,,,"Heard about #earthquake is different cities, s...",0
2,3,,,"there is a forest fire at spot pond, geese are...",1
3,9,,,Apocalypse lighting. #Spokane #wildfires,1
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,1


In [78]:
# Trying same with count vevtorizer
ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words = 'english')


ctv.fit(list(xtrain) + list(xvalid)+ list(xtest))
xtrain_ctv =  ctv.transform(xtrain) 
xvalid_ctv = ctv.transform(xvalid)
xtest_ctv=ctv.transform(xtest)
for clf in [LogisticRegression(C=1.0),RandomForestClassifier()]:
# clf = LogisticRegression(C=1.0)
    clf.fit(xtrain_ctv, ytrain)
    predictions = clf.predict_proba(xvalid_ctv)
    # print(classification_report(predictions,yvalid))
    score=roc_auc_score(yvalid,predictions[:,1])
    print("{} score: {}".format(clf.__class__.__name__,score))



LogisticRegression score: 0.8612815916200921




RandomForestClassifier score: 0.8416991809905445


In [79]:
clf=LogisticRegression(C=1.0)
clf.fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)
score=roc_auc_score(yvalid,predictions[:,1])
print("{} score: {}".format(clf.__class__.__name__,score))
# test["predict_lr_tfidf1"]=
thresholds=np.arange(0.1,1,0.05)
for threshold in thresholds:
    temp=predictions[:,1].copy()
    temp[temp>=threshold]=1
    temp[temp<threshold]=0
    score=f1_score(yvalid,temp)
    print("threshold {} score: {}".format(threshold,score))



LogisticRegression score: 0.8612815916200921
threshold 0.1 score: 0.6702470461868958
threshold 0.15000000000000002 score: 0.7058823529411764
threshold 0.20000000000000004 score: 0.7302798982188294
threshold 0.25000000000000006 score: 0.7381275440976934
threshold 0.30000000000000004 score: 0.757532281205165
threshold 0.3500000000000001 score: 0.7671641791044775
threshold 0.40000000000000013 score: 0.7580893682588598
threshold 0.45000000000000007 score: 0.7548387096774194
threshold 0.5000000000000001 score: 0.7373737373737373
threshold 0.5500000000000002 score: 0.7247386759581883
threshold 0.6000000000000002 score: 0.7198581560283689
threshold 0.6500000000000001 score: 0.7150635208711434
threshold 0.7000000000000002 score: 0.6902985074626865
threshold 0.7500000000000002 score: 0.6718146718146718
threshold 0.8000000000000002 score: 0.6234817813765183
threshold 0.8500000000000002 score: 0.5970772442588727
threshold 0.9000000000000002 score: 0.5122494432071268
threshold 0.9500000000000003 s

In [80]:
threshold=0.35
testpred=clf.predict_proba(xtest_ctv)
testpred=testpred[:,1].copy()
testpred[testpred>=threshold]=1
testpred[testpred<threshold]=0
test["pred1_ctv_lr"]=testpred
test["pred1_ctv_lr"]=test["pred1_ctv_lr"].astype(int)
test[["id","pred1_ctv_lr"]].rename(columns={"pred1_ctv_lr":"target"}).to_csv("./submissions/submission_2.csv",index=None)
test.head()

Unnamed: 0,id,keyword,location,text,pred1_tfidf_lr,pred1_ctv_lr
0,0,,,Just happened a terrible car crash,1,1
1,2,,,"Heard about #earthquake is different cities, s...",0,1
2,3,,,"there is a forest fire at spot pond, geese are...",1,1
3,9,,,Apocalypse lighting. #Spokane #wildfires,1,0
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,1,1


In [83]:
from sklearn.metrics import accuracy_score, log_loss,roc_curve,auc,roc_auc_score,confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# import lightgbm as lgb
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
# !pip install xgboost
import xgboost as xgb


In [92]:
classifiers = [
#     LogisticRegression(),
#     KNeighborsClassifier(3),
#     KNeighborsClassifier(9),
#     KNeighborsClassifier(27),
#     KNeighborsClassifier(51),
#     DecisionTreeClassifier(),
#     RandomForestClassifier(),
#     AdaBoostClassifier(),
#     GaussianNB(),
#     LinearDiscriminantAnalysis(),
#     QuadraticDiscriminantAnalysis(),
    xgb.XGBClassifier(objective="binary:logistic", random_state=42),
]
for clf in classifiers:
    scores=cross_val_score(clf, xtrain_tfv, ytrain, cv=5)
    print("TFIDF vectorizer{} mean score {} std {}".format(clf.__class__.__name__,np.mean(scores),np.std(scores)))
    scores=cross_val_score(clf, xtrain_ctv, ytrain, cv=5)
    print("Count vectorizer{} mean score {} std {}".format(clf.__class__.__name__,np.mean(scores),np.std(scores)))

TFIDF vectorizerXGBClassifier mean score 0.7252999295160576 std 0.012088962861161096
Count vectorizerXGBClassifier mean score 0.7222360447061843 std 0.021705400426741833


In [93]:
(xtrain_tfv.shape)
# list(xtrain_tfv)+list(xvalid_tfv)

(6851, 12740)

In [94]:
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
# Apply SVD, I chose 120 components. 120-200 components are good enough for SVM model.
svd = decomposition.TruncatedSVD(n_components=120)
svd.fit(xtrain_tfv)
xtrain_svd = svd.transform(xtrain_tfv)
xvalid_svd = svd.transform(xvalid_tfv)
xtest_svd = svd.transform(xtest_tfv)

# Scale the data obtained from SVD. Renaming variable to reuse without scaling.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)
xtest_svd_scl = scl.transform(xtest_svd)

In [95]:
clf = SVC(C=1.0, probability=True) # since we need probabilities
clf.fit(xtrain_svd_scl, ytrain)
predictions = clf.predict_proba(xvalid_svd_scl)
score=roc_auc_score(yvalid,predictions[:,1])
print("{} score: {}".format(clf.__class__.__name__,score))

SVC score: 0.8215438152483392


In [96]:
# Fitting a simple xgboost on tf-idf
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xtrain_tfv.tocsc(), ytrain)
predictions = clf.predict_proba(xvalid_tfv.tocsc())
score=roc_auc_score(yvalid,predictions[:,1])
print("{} score: {}".format(clf.__class__.__name__,score))

XGBClassifier score: 0.8466940841505852


In [100]:
# Fitting a simple xgboost on cv
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xtrain_ctv.tocsc(), ytrain)
predictions = clf.predict_proba(xvalid_ctv.tocsc())
score=roc_auc_score(yvalid,predictions[:,1])
print("{} score: {}".format(clf.__class__.__name__,score))


XGBClassifier score: 0.8602481633800837


In [98]:
# Fitting a simple xgboost on tf-idf svd features
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xtrain_svd, ytrain)
predictions = clf.predict_proba(xvalid_svd)

score=roc_auc_score(yvalid,predictions[:,1])
print("{} score: {}".format(clf.__class__.__name__,score))

XGBClassifier score: 0.8258146156279659


In [99]:
# Fitting a simple xgboost on tf-idf svd features
clf = xgb.XGBClassifier(nthread=10)
clf.fit(xtrain_svd, ytrain)
predictions = clf.predict_proba(xvalid_svd)
score=roc_auc_score(yvalid,predictions[:,1])
print("{} score: {}".format(clf.__class__.__name__,score))

XGBClassifier score: 0.8191992688670955


In [105]:
# Revisitng best model above== .86
# Fitting a simple xgboost on cv
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xtrain_ctv.tocsc(), ytrain)
predictions = clf.predict_proba(xvalid_ctv.tocsc())
score=roc_auc_score(yvalid,predictions[:,1])
print("{} score: {}".format(clf.__class__.__name__,score))
thresholds=np.arange(0.1,1,0.05)
for threshold in thresholds:
    temp=predictions[:,1].copy()
    temp[temp>=threshold]=1
    temp[temp<threshold]=0
    score=f1_score(yvalid,temp)
    print("threshold {} score: {}".format(threshold,score))
threshold=0.4
testpred=clf.predict_proba(xtest_ctv.tocsc())
testpred=testpred[:,1].copy()
testpred[testpred>=threshold]=1
testpred[testpred<threshold]=0
test["pred1_ctv_xg"]=testpred
test["pred1_ctv_xg"]=test["pred1_ctv_xg"].astype(int)
test[["id","pred1_ctv_xg"]].rename(columns={"pred1_ctv_xg":"target"}).to_csv("./submissions/submission_3.csv",index=None)
test.head()

Unnamed: 0,id,keyword,location,text,pred1_tfidf_lr,pred1_ctv_lr,pred1_ctv_xg
0,0,,,Just happened a terrible car crash,1,1,0
1,2,,,"Heard about #earthquake is different cities, s...",0,1,1
2,3,,,"there is a forest fire at spot pond, geese are...",1,1,1
3,9,,,Apocalypse lighting. #Spokane #wildfires,1,0,0
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,1,1,1


In [106]:
# Grid search

In [109]:
svd = TruncatedSVD()
    
# Initialize the standard scaler 
scl = preprocessing.StandardScaler()

# We will use logistic regression here..
lr_model = LogisticRegression()

# Create the pipeline 
clf = pipeline.Pipeline([('svd', svd),
                         ('scl', scl),
                         ('lr', lr_model)])
param_grid = {'svd__n_components' : [120, 180],
              'lr__C': [0.1, 1.0, 10], 
              'lr__penalty': ['l1', 'l2']}
# Initialize Grid Search Model
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='roc_auc',
                                 verbose=10, n_jobs=-1, iid=True, refit=True, cv=2)

# Fit Grid Search Model
model.fit(xtrain_tfv, ytrain)  # we can use the full data here but im only using xtrain
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 2 folds for each of 12 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done  12 out of  24 | elapsed:    9.4s remaining:    9.4s
[Parallel(n_jobs=-1)]: Done  15 out of  24 | elapsed:   11.2s remaining:    6.7s
[Parallel(n_jobs=-1)]: Done  18 out of  24 | elapsed:   11.9s remaining:    3.9s
[Parallel(n_jobs=-1)]: Done  21 out of  24 | elapsed:   13.8s remaining:    1.9s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   14.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   14.7s finished


Best score: 0.820
Best parameters set:
	lr__C: 10
	lr__penalty: 'l2'
	svd__n_components: 180


In [111]:
from sklearn.naive_bayes import MultinomialNB
nb_model = MultinomialNB()

# Create the pipeline 
clf = pipeline.Pipeline([('nb', nb_model)])

# parameter grid
param_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Initialize Grid Search Model
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='roc_auc',
                                 verbose=10, n_jobs=-1, iid=True, refit=True, cv=2)

# Fit Grid Search Model
model.fit(xtrain_tfv, ytrain)  # we can use the full data here but im only using xtrain. 
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 2 folds for each of 6 candidates, totalling 12 fits
Best score: 0.838
Best parameters set:
	nb__alpha: 1


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0479s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done   3 out of  12 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   5 out of  12 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    0.0s finished


In [113]:
clf = model.best_estimator_
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)
score=roc_auc_score(yvalid,predictions[:,1])
print("{} score: {}".format(clf.__class__.__name__,score))
thresholds=np.arange(0.1,1,0.05)
for threshold in thresholds:
    temp=predictions[:,1].copy()
    temp[temp>=threshold]=1
    temp[temp<threshold]=0
    score=f1_score(yvalid,temp)
    print("threshold {} score: {}".format(threshold,score))
threshold=0.3
testpred=clf.predict_proba(xtest_tfv)
testpred=testpred[:,1].copy()
testpred[testpred>=threshold]=1
testpred[testpred<threshold]=0
test["pred1_tfv_mnb"]=testpred
test["pred1_tfv_mnb"]=test["pred1_tfv_mnb"].astype(int)
test[["id","pred1_tfv_mnb"]].rename(columns={"pred1_tfv_mnb":"target"}).to_csv("./submissions/submission_4.csv",index=None)
test.head()

Pipeline score: 0.8585539034763963
threshold 0.1 score: 0.6352705410821644
threshold 0.15000000000000002 score: 0.6695938529088913
threshold 0.20000000000000004 score: 0.7091346153846153
threshold 0.25000000000000006 score: 0.7303664921465969
threshold 0.30000000000000004 score: 0.7613636363636362
threshold 0.3500000000000001 score: 0.7617602427921092
threshold 0.40000000000000013 score: 0.7472178060413355
threshold 0.45000000000000007 score: 0.734006734006734
threshold 0.5000000000000001 score: 0.7155322862129145
threshold 0.5500000000000002 score: 0.707182320441989
threshold 0.6000000000000002 score: 0.6969696969696969
threshold 0.6500000000000001 score: 0.6547619047619047
threshold 0.7000000000000002 score: 0.6229508196721312
threshold 0.7500000000000002 score: 0.5622317596566524
threshold 0.8000000000000002 score: 0.52
threshold 0.8500000000000002 score: 0.4522144522144522
threshold 0.9000000000000002 score: 0.36543209876543203
threshold 0.9500000000000003 score: 0.3084832904884318

Unnamed: 0,id,keyword,location,text,pred1_tfidf_lr,pred1_ctv_lr,pred1_ctv_xg,pred1_tfv_mnb
0,0,,,Just happened a terrible car crash,1,1,0,1
1,2,,,"Heard about #earthquake is different cities, s...",0,1,1,1
2,3,,,"there is a forest fire at spot pond, geese are...",1,1,1,1
3,9,,,Apocalypse lighting. #Spokane #wildfires,1,0,0,1
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,1,1,1,1
