In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
df = pd.read_csv('data/nlp-getting-started/train.csv')
df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [3]:
df.keyword.value_counts()

fatalities               45
deluge                   42
armageddon               42
sinking                  41
body%20bags              41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: keyword, Length: 221, dtype: int64

In [4]:
df.location.value_counts()

USA                          104
New York                      71
United States                 50
London                        45
Canada                        29
                            ... 
Livonia, MI                    1
A small federal enclave        1
amsterdayum 120615 062415      1
Des Moines, Iowa               1
D(M)V                          1
Name: location, Length: 3341, dtype: int64

In [5]:
df.location.replace('United States', 'USA', inplace=True)

In [6]:
df.location.nunique()

3340

In [7]:
# df['location']='USA'
df.loc[df['location'] == 'USA', 'target'].value_counts()

1    94
0    60
Name: target, dtype: int64

In [8]:
df.id.nunique()

7613

In [9]:
df.drop('id', axis=1, inplace=True)

In [10]:
df.target.value_counts(normalize=True)

0    0.57034
1    0.42966
Name: target, dtype: float64

In [11]:
df.location.isna().value_counts()

False    5080
True     2533
Name: location, dtype: int64

In [12]:
df.text[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [13]:
df.loc[df['keyword'] == 'harm', 'target'].value_counts()

0    37
1     4
Name: target, dtype: int64

In [15]:
df.head()

Unnamed: 0,keyword,location,text,target
0,,,Our Deeds are the Reason of this #earthquake M...,1
1,,,Forest fire near La Ronge Sask. Canada,1
2,,,All residents asked to 'shelter in place' are ...,1
3,,,"13,000 people receive #wildfires evacuation or...",1
4,,,Just got sent this photo from Ruby #Alaska as ...,1


In [17]:
df.location.unique()[1:]


array(['Birmingham', 'Est. September 2012 - Bristol', 'AFRICA', ...,
       'Vancouver, Canada', 'London ', 'Lincoln'], dtype=object)

In [18]:
df.drop(columns=['location',
                 'keyword',
                 ], inplace=True)


In [19]:
df

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...
7608,Two giant cranes holding a bridge collapse int...,1
7609,@aria_ahrary @TheTawniest The out of control w...,1
7610,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,Police investigating after an e-bike collided ...,1


In [20]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk
import re

corpus = []
ps = PorterStemmer()

for i in range(7613):
    review = re.sub('[^a-zA-Z]', ' ', df['text'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word)
              for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

Wall time: 1min 38s


In [21]:
corpus[0]

'deed reason earthquak may allah forgiv us'

In [71]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
cv = CountVectorizer(max_features=2000)
tfidf = TfidfVectorizer(max_features=3000)

In [72]:
x_bog = cv.fit_transform(corpus).toarray()
x_tfidf = tfidf.fit_transform(corpus).toarray()

In [73]:
x_bog.shape, x_tfidf.shape

((7613, 2000), (7613, 3000))

In [25]:
# from sklearn.naive_bayes import GaussianNB,MultinomialNB
# from sklearn.metrics import classification_report,accuracy_score

# gb=GaussianNB()
# mb=MultinomialNB()

In [26]:
# gb.fit(x_bog,df.target.ravel())

In [27]:
# mb.fit(x_bog,df.target.ravel())
# mb.score(x_bog,df.target.ravel())

In [28]:
# gb.score(x_bog,df.target.ravel())

In [29]:
df_test = pd.read_csv('data/nlp-getting-started/test.csv')
df_test

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [30]:
df_test.drop(columns=['keyword', 'location'], inplace=True)
test_corpus = []

In [31]:
df_test

Unnamed: 0,id,text
0,0,Just happened a terrible car crash
1,2,"Heard about #earthquake is different cities, s..."
2,3,"there is a forest fire at spot pond, geese are..."
3,9,Apocalypse lighting. #Spokane #wildfires
4,11,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...
3258,10861,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,Storm in RI worse than last hurricane. My city...
3260,10868,Green Line derailment in Chicago http://t.co/U...
3261,10874,MEG issues Hazardous Weather Outlook (HWO) htt...


In [32]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk
import re

ps = PorterStemmer()

for i in range(3263):
    review = re.sub('[^a-zA-Z]', ' ', df_test['text'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word)
              for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)
    test_corpus.append(review)


Wall time: 44.6 s


In [74]:
x_bog_test = cv.transform(test_corpus).toarray()
x_tfidf_test = tfidf.transform(test_corpus).toarray()

In [34]:
# y_pred=mb.predict(x_test)
# y_pred

In [45]:
# output = pd.DataFrame({'id': df_test.id, 'target': y_pred})
# output.to_csv('sample_submission.csv', index=False)


## Try other models


In [46]:
# y=df.target.ravel()
# from sklearn.svm import SVC
# svc=SVC()
# svc.fit(x_bog,y)
# svc.score(x_bog,y)


0.9158019177722317

In [48]:
# y_svc_pred=svc.predict(x_test)
# output =pd.DataFrame({'id':df_test.id,'target':y_svc_pred})
# output.to_csv('sample_submission.csv', index=False)


In [75]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier


In [56]:
# parameters = {'kernel':('linear','sigmoid','poly','rbf'), 'C':[1, 5,10,20]}
# clf = GridSearchCV(svc, parameters)
# clf.fit(x_bog,y)


GridSearchCV(estimator=SVC(),
             param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')})

In [57]:
# clf.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_C', 'param_kernel', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [59]:
# clf.best_params_

{'C': 1, 'kernel': 'rbf'}

In [35]:
# clf.best_estimator_

In [62]:
# pd.DataFrame(clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,23.857344,0.763693,4.595492,0.182498,1,linear,"{'C': 1, 'kernel': 'linear'}",0.70847,0.597505,0.646093,0.637319,0.744415,0.666761,0.05265,3
1,33.920913,3.568469,10.017005,0.560905,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.726855,0.663165,0.685489,0.698423,0.798949,0.714576,0.046946,1
2,32.251862,2.222124,3.874929,0.375307,10,linear,"{'C': 10, 'kernel': 'linear'}",0.655942,0.593565,0.619173,0.629435,0.675427,0.634708,0.028534,4
3,37.615784,0.970159,10.11727,0.608552,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.709783,0.631648,0.653316,0.670171,0.749671,0.682918,0.04206,2


In [96]:
svc = SVC(C=1, kernel='rbf')
rf = RandomForestClassifier(n_estimators=200, max_depth=100)
xgb = XGBClassifier()
mb = MultinomialNB()
lr = LogisticRegression(max_iter=1000)
clf = VotingClassifier([('svc', svc), ('rf', rf), ('xgb', xgb)])

In [43]:
# rf=RandomForestClassifier(n_estimators=500,n_jobs=-1)
# rf.fit(x_bog,y)
y = df.target.ravel()
xgb.fit(x_bog, y)
xgb.score(x_bog, y)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [77]:
y = df.target.ravel()
xgb.fit(x_tfidf, y)
xgb.score(x_tfidf, y)





0.861289898857218

In [78]:
y_xgb_pred = xgb.predict(x_tfidf_test)
output = pd.DataFrame({'id': df_test.id, 'target': y_xgb_pred})
output.to_csv('sample_submission.csv', index=False)

In [97]:
clf.fit(x_tfidf, y)





VotingClassifier(estimators=[('svc', SVC(C=1)),
                             ('rf',
                              RandomForestClassifier(max_depth=100,
                                                     n_estimators=200)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None, gamma=None,
                                            gpu_id=None, importance_type='gain',
                                            interaction_constraints=None,
                                            learning_rate=None,
                                            max_delta_step=None, max_depth=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None

In [98]:
clf.score(x_tfidf, y)

0.9146197294102194

In [99]:
y_best = clf.predict(x_tfidf_test)
output = pd.DataFrame({'id': df_test.id, 'target': y_best})
output.to_csv('sample_submission.csv', index=False)

In [94]:
mb.fit(x_bog, y)

MultinomialNB()

In [95]:
mb.score(x_bog, y)

0.8295021673453303