In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.util import ngrams
import re
plt.style.use('ggplot')
stop=set(stopwords.words('english'))
## ADD STOPWORDS
stop = set(list(stop) + ["http","https", "s", "nt", "m"])

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
train_df = pd.read_csv('train.csv')
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
from sklearn.model_selection import train_test_split
x_train,x_val,y_train,y_val = train_test_split(train_df.text,train_df.target, test_size=0.33, random_state=42)

In [5]:
tfidf = TfidfVectorizer(analyzer = 'word', stop_words = stop, ngram_range=(1, 3), min_df = 5, sublinear_tf=True)

In [6]:
train_tfidf = tfidf.fit_transform(x_train)
val_tfidf = tfidf.transform(x_val)

In [17]:
joblib.dump(tfidf, 'tfidfvectorizer.pkl')

['tfidfvectorizer.pkl']

In [7]:
from sklearn import linear_model
from sklearn.svm import LinearSVC
from sklearn import model_selection

clf_linsvm = linear_model.SGDClassifier(loss='hinge', random_state = 42, class_weight = 'balanced')
clf_logit = linear_model.SGDClassifier(loss='log', random_state = 42, class_weight = 'balanced')


In [8]:
import pprint
svmscores = model_selection.cross_validate(clf_linsvm, train_tfidf, y_train, cv=3, scoring=("f1", "accuracy","roc_auc"))
print('SVM scores')
pprint.pprint(svmscores)
logitscores = model_selection.cross_validate(clf_logit, train_tfidf, y_train, cv=3, scoring=("f1", "accuracy","roc_auc"))
print('Logitistic Scores')
pprint.pprint(logitscores)

SVM scores
{'fit_time': array([0.65231514, 0.00799727, 0.00900531]),
 'score_time': array([0.00800109, 0.00800347, 0.00699353]),
 'test_accuracy': array([0.7542622 , 0.76411765, 0.74749853]),
 'test_f1': array([0.71832884, 0.71620665, 0.71720501]),
 'test_roc_auc': array([0.81612372, 0.8215396 , 0.80804026]),
 'train_accuracy': array([0.91644601, 0.92058824, 0.91943546]),
 'train_f1': array([0.90320382, 0.90689655, 0.90743243]),
 'train_roc_auc': array([0.96563099, 0.96375922, 0.96793139])}
Logitistic Scores
{'fit_time': array([0.01100039, 0.00699711, 0.00999832]),
 'score_time': array([0.00500083, 0.00600147, 0.00500083]),
 'test_accuracy': array([0.75720165, 0.77647059, 0.76397881]),
 'test_f1': array([0.7241149 , 0.73163842, 0.73032952]),
 'test_roc_auc': array([0.83242067, 0.84242995, 0.82730443]),
 'train_accuracy': array([0.90556046, 0.90764706, 0.90914437]),
 'train_f1': array([0.89188279, 0.89202201, 0.89514761]),
 'train_roc_auc': array([0.96899198, 0.96789933, 0.97023396])}


In [9]:
clf_logistic = linear_model.LogisticRegression(class_weight = 'balanced',random_state = 42)
clf_svm = LinearSVC(class_weight = 'balanced',random_state = 42, dual = False )

In [10]:
svmscore = model_selection.cross_validate(clf_svm, train_tfidf, y_train, cv=3, scoring=("f1", "accuracy","roc_auc"))
print('SVM')
pprint.pprint(svmscore)
logitscore = model_selection.cross_validate(clf_logistic, train_tfidf, y_train, cv=3, scoring=("f1", "accuracy","roc_auc"))
print('LOGIT')
pprint.pprint(logitscore)

SVM
{'fit_time': array([0.1530664 , 0.02700186, 0.02300143]),
 'score_time': array([0.00700092, 0.00500083, 0.00500035]),
 'test_accuracy': array([0.74779541, 0.75529412, 0.75632725]),
 'test_f1': array([0.71072151, 0.70662906, 0.72027027]),
 'test_roc_auc': array([0.81477303, 0.82259984, 0.8137581 ]),
 'train_accuracy': array([0.93821712, 0.93617647, 0.93737136]),
 'train_f1': array([0.92822967, 0.92616536, 0.92677896]),
 'train_roc_auc': array([0.98409372, 0.98352875, 0.98502239])}
LOGIT
{'fit_time': array([0.0180161 , 0.01700377, 0.01700091]),
 'score_time': array([0.00498605, 0.00800133, 0.00500011]),
 'test_accuracy': array([0.77307466, 0.78882353, 0.77339612]),
 'test_f1': array([0.73342541, 0.74593064, 0.73429952]),
 'test_roc_auc': array([0.84163462, 0.85414191, 0.83315215]),
 'train_accuracy': array([0.87878788, 0.87882353, 0.88356366]),
 'train_f1': array([0.85763649, 0.85743945, 0.86307054]),
 'train_roc_auc': array([0.94881295, 0.94533354, 0.95010798])}


In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

clf_dt = DecisionTreeClassifier( max_features='auto',random_state=42,class_weight = 'balanced')
clf_rf = RandomForestClassifier(max_features='auto',random_state=42,class_weight = 'balanced')

In [12]:
svmscore = model_selection.cross_validate(clf_dt, train_tfidf, y_train, cv=3, scoring=("f1", "accuracy","roc_auc"))
print('Decision Trees')
pprint.pprint(svmscore)
logitscore = model_selection.cross_validate(clf_rf, train_tfidf, y_train, cv=3, scoring=("f1", "accuracy","roc_auc"))
print('Random Forest')
pprint.pprint(logitscore)

Decision Trees
{'fit_time': array([0.07400012, 0.04800081, 0.04300404]),
 'score_time': array([0.01598835, 0.01000023, 0.00999951]),
 'test_accuracy': array([0.69782481, 0.69058824, 0.70453208]),
 'test_f1': array([0.63700565, 0.63268156, 0.64295875]),
 'test_roc_auc': array([0.68948254, 0.689693  , 0.70207113]),
 'train_accuracy': array([0.98734922, 0.98676471, 0.98500441]),
 'train_f1': array([0.98536917, 0.98474059, 0.98276445]),
 'train_roc_auc': array([0.9995711 , 0.99957238, 0.99949376])}
Random Forest
{'fit_time': array([0.25603962, 0.28773737, 0.29781413]),
 'score_time': array([0.0660069 , 0.04600549, 0.04499459]),
 'test_accuracy': array([0.74779541, 0.74058824, 0.74749853]),
 'test_f1': array([0.66510539, 0.66155027, 0.66872587]),
 'test_roc_auc': array([0.80572245, 0.79169786, 0.80686776]),
 'train_accuracy': array([0.97146219, 0.97088235, 0.96853867]),
 'train_f1': array([0.96628432, 0.96570835, 0.96311617]),
 'train_roc_auc': array([0.99730122, 0.9967636 , 0.99600239])}


In [13]:
clf_svm.fit(train_tfidf,y_train)

LinearSVC(C=1.0, class_weight='balanced', dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0)

In [14]:
from sklearn import metrics

y_pred = clf_svm.predict(val_tfidf)

print('Accuracy:' , metrics.accuracy_score(y_val,y_pred))
print('F1 Score:' , metrics.f1_score(y_val,y_pred))
print('AUC:' , metrics.roc_auc_score(y_val,y_pred))


Accuracy: 0.7747711898129725
F1 Score: 0.7367441860465116
AUC: 0.7705116139795526


In [15]:
from sklearn.externals import joblib
joblib.dump(clf_svm, 'model.pkl')

['model.pkl']

In [16]:
model = joblib.load('model.pkl')
y_pred = model.predict(val_tfidf)

print('Accuracy:' , metrics.accuracy_score(y_val,y_pred))
print('F1 Score:' , metrics.f1_score(y_val,y_pred))
print('AUC:' , metrics.roc_auc_score(y_val,y_pred))

Accuracy: 0.7747711898129725
F1 Score: 0.7367441860465116
AUC: 0.7705116139795526


In [18]:
vect = joblib.load('tfidfvectorizer.pkl')
val_tfidf = vect.transform(x_val)
y_pred = model.predict(val_tfidf)

print('Accuracy:' , metrics.accuracy_score(y_val,y_pred))
print('F1 Score:' , metrics.f1_score(y_val,y_pred))
print('AUC:' , metrics.roc_auc_score(y_val,y_pred))

Accuracy: 0.7747711898129725
F1 Score: 0.7367441860465116
AUC: 0.7705116139795526
