### Import and read data

In [1]:
import numpy as np
import pandas as pd

In [2]:
import nltk
import re
import sys
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
df_train = pd.read_csv("train.csv")
df_train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [5]:
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [6]:
df_train.drop(['id','keyword','location'], axis = 1, inplace=True)

In [7]:
#Average Tweet Length
def avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

In [8]:
df_train['avg_word'] = df_train['text'].apply(lambda x: avg_word(x))

In [12]:
#Disaster Tweets

In [11]:
Disaster = df_train[df_train.target==1]
Disaster.head()

Unnamed: 0,text,target,avg_word
0,Our Deeds are the Reason of this #earthquake M...,1,4.384615
1,Forest fire near La Ronge Sask. Canada,1,4.571429
2,All residents asked to 'shelter in place' are ...,1,5.090909
3,"13,000 people receive #wildfires evacuation or...",1,7.125
4,Just got sent this photo from Ruby #Alaska as ...,1,4.5


In [13]:
#Non-Disaster Tweets

In [14]:
Non_Disaster = df_train[df_train.target==0]
Non_Disaster.head()

Unnamed: 0,text,target,avg_word
15,What's up man?,0,4.0
16,I love fruits,0,3.666667
17,Summer is lovely,0,4.666667
18,My car is so fast,0,2.6
19,What a goooooooaaaaaal!!!!!!,0,8.666667


In [15]:
#Class Distribution
classes = df_train.loc[:,'target']
print(classes.value_counts())

0    4342
1    3271
Name: target, dtype: int64


### Preprocessing

In [16]:
#Lowercase
df_train['text'] = df_train['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df_train['text'].head()

0    our deeds are the reason of this #earthquake m...
1               forest fire near la ronge sask. canada
2    all residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    just got sent this photo from ruby #alaska as ...
Name: text, dtype: object

In [17]:
#Special Characters Removal
df_train['text']= df_train['text'].str.replace('rt ',"").str.replace('@','').str.replace('#','').str.replace('[^\w\s]','').str.replace('[1-9]','')
df_train['text'].head()

  df_train['text']= df_train['text'].str.replace('rt ',"").str.replace('@','').str.replace('#','').str.replace('[^\w\s]','').str.replace('[1-9]','')


0    our deeds are the reason of this earthquake ma...
1                forest fire near la ronge sask canada
2    all residents asked to shelter in place are be...
3    000 people receive wildfires evacuation orders...
4    just got sent this photo from ruby alaska as s...
Name: text, dtype: object

In [18]:
#Removal of Numbers
df_train['text'] = df_train['text'].str.replace(r'\d+(\.\d+)?','')
df_train['text'].head()

  df_train['text'] = df_train['text'].str.replace(r'\d+(\.\d+)?','')


0    our deeds are the reason of this earthquake ma...
1                forest fire near la ronge sask canada
2    all residents asked to shelter in place are be...
3     people receive wildfires evacuation orders in...
4    just got sent this photo from ruby alaska as s...
Name: text, dtype: object

In [19]:
# Removing Stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

df_train['text'] = df_train['text'].apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))
df_train['text'].head()

0         deeds reason earthquake may allah forgive us
1                forest fire near la ronge sask canada
2    residents asked shelter place notified officer...
3    people receive wildfires evacuation orders cal...
4    got sent photo ruby alaska smoke wildfires pou...
Name: text, dtype: object

In [20]:
#Stemming
from nltk.stem import PorterStemmer

st = PorterStemmer()
df_train['text']=df_train['text'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
df_train['text'].head()

0            deed reason earthquak may allah forgiv us
1                 forest fire near la rong sask canada
2    resid ask shelter place notifi offic evacu she...
3          peopl receiv wildfir evacu order california
4    got sent photo rubi alaska smoke wildfir pour ...
Name: text, dtype: object

In [21]:
#Converta Collection of Text Documents to a Matrix of Token Counts
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(df_train.text).toarray()
y = df_train.iloc[:, 1].values

In [22]:
print(X)
print(y)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]]
[1 1 1 ... 1 1 1]


### Modeling

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [25]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [26]:
#Define Models to Train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

In [27]:
classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(names, classifiers)

In [28]:
#Comparing Accuracy of Different Models
for name, model in models:
    nltk_model = model
    nltk_model.fit(X_train,y_train)
    accuracy = nltk_model.score(X_test, y_test)*100
    print("{} Accuracy: {}".format(name, accuracy))

K Nearest Neighbors Accuracy: 72.42284963887064
Decision Tree Accuracy: 74.65528562048588
Random Forest Accuracy: 77.67564018384768
Logistic Regression Accuracy: 79.05449770190414
SGD Classifier Accuracy: 77.34734077478662
Naive Bayes Accuracy: 77.8726198292843
SVM Linear Accuracy: 77.67564018384768


In [29]:
#Selected Model
selected_classifier =  LogisticRegression()
selected_classifier.fit(X_train, y_train)

LogisticRegression()

In [30]:
prediction = selected_classifier.predict(X_test)

In [32]:
#Print Classification Report and Confusion Matrix
print(classification_report(y_test, prediction))

pd.DataFrame(
    confusion_matrix(y_test, prediction),
    index = [['actual', 'actual'], ['Non_Disaster', 'Disaster']],
    columns = [['predicted', 'predicted'], ['Non_Disaster', 'Disaster']])

              precision    recall  f1-score   support

           0       0.79      0.87      0.83       886
           1       0.79      0.68      0.73       637

    accuracy                           0.79      1523
   macro avg       0.79      0.77      0.78      1523
weighted avg       0.79      0.79      0.79      1523



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,Non_Disaster,Disaster
actual,Non_Disaster,771,115
actual,Disaster,204,433


### Test Dataset

In [33]:
df_test = pd.read_csv('test.csv')

In [34]:
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [35]:
df_test.drop(['id','keyword','location'], axis = 1, inplace=True)

In [37]:
df_test = cv.fit_transform(df_test.text).toarray()

In [40]:
final_predictions = selected_classifier.predict(df_test)