In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk

nltk.download('stopwords')
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier,ExtraTreesRegressor , RandomForestRegressor
from xgboost import XGBClassifier

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\master\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
df = pd.read_csv("../train.csv", delimiter=',')
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [15]:
df.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [17]:
df['text'][0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [46]:
corpus = []

for i in range(0,7613):
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', df['text'][i], flags=re.MULTILINE)
    text = re.sub('[^a-zA-Z0-9]', ' ', df['text'][i])
    text = text.lower()
    text = text.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    text = [ps.stem(word) for word in text if not word in set(all_stopwords)]
    text = ' '.join(text)
    corpus.append(text)
    

In [47]:
for i in range(10):
    print(corpus[i])

deed reason earthquak may allah forgiv us
forest fire near la rong sask canada
resid ask shelter place notifi offic evacu shelter place order expect
13 000 peopl receiv wildfir evacu order california
got sent photo rubi alaska smoke wildfir pour school
rockyfir updat california hwi 20 close direct due lake counti fire cafir wildfir
flood disast heavi rain caus flash flood street manit colorado spring area
top hill see fire wood
emerg evacu happen build across street
afraid tornado come area


In [48]:
for i in range(10):
    print(df['text'][i])

Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
Forest fire near La Ronge Sask. Canada
All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected
13,000 people receive #wildfires evacuation orders in California 
Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school 
#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires
#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas
I'm on top of the hill and I can see a fire in the woods...
There's an emergency evacuation happening now in the building across the street
I'm afraid that the tornado is coming to our area...


In [49]:
cv = CountVectorizer(max_features=2000)
X = cv.fit_transform(corpus).toarray()
y = df.iloc[:, -1].values
len(X[0])

2000

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 1)

In [51]:
classifiernb = GaussianNB()
classifiernb.fit(X_train, y_train)

GaussianNB()

In [52]:
y_pred = classifiernb.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[1 1]
 [1 0]
 [0 0]
 ...
 [1 0]
 [0 1]
 [0 0]]


In [53]:
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[762 120]
 [222 419]]


0.7754432042022325

In [54]:
corpus[-2]

'polic investig e bike collid car littl portug e bike rider suffer seriou non life threaten injuri'

In [38]:
filename = 'GaussianNB_modelv2.sav'
pickle.dump(classifiernb, open(filename, 'wb'))

In [55]:
classifierlogreg = LogisticRegression()
classifierlogreg.fit(X_train, y_train)

LogisticRegression()

In [56]:
y_pred = classifierlogreg.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[0 1]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 1]
 [0 0]]


In [57]:
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[758 124]
 [199 442]]


0.7879185817465528

In [58]:
classifierknn = KNeighborsClassifier()
classifierknn.fit(X_train, y_train)

KNeighborsClassifier()

In [59]:
y_pred = classifierknn.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[0 1]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 1]
 [0 0]]


In [60]:
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[856  26]
 [407 234]]


0.7156927117531189

In [45]:
filename = 'KNeighborsClassifier_modelv2.sav'
pickle.dump(classifierknn, open(filename, 'wb'))