# Disaster_Tweets_Classification

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import nltk
from wordcloud import WordCloud
from matplotlib import pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

In [2]:
# Load the labelled training tweets from disk and preview the first rows
train_csv_path = "nlp-getting-started/train.csv"
tweets = pd.read_csv(train_csv_path)
tweets.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
# (rows, columns) of the training data as loaded, before any cleaning
tweets.shape

(7613, 5)

In [4]:
# Promote the tweet id to the row index so it is no longer treated as a data column
tweets = tweets.set_index('id')
tweets.head()

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# Remove rows whose column values are all identical to an earlier row
# (the id index is not part of the comparison)
tweets = tweets.drop_duplicates()

In [6]:
# (rows, columns) after moving id into the index and dropping duplicate rows
tweets.shape

(7561, 4)

In [7]:
# Number of missing values in each column
tweets.isnull().sum()

keyword       61
location    2500
text           0
target         0
dtype: int64

### We can see that Location and Keywords are missing in some tweets
### Location is missing in about a third of the tweets (2,500 of 7,561), so I will not use location as a feature in my model

### I will replace the missing keywords with the placeholder 'no_keyword' during data cleaning

In [8]:
# Substitute the placeholder 'no_keyword' wherever a tweet has no keyword
tweets['keyword'] = tweets['keyword'].fillna('no_keyword')

In [9]:
# Discard the location column, which is not used as a feature, and preview the result
tweets = tweets.drop(columns='location')
tweets.head()

Unnamed: 0_level_0,keyword,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,no_keyword,Our Deeds are the Reason of this #earthquake M...,1
4,no_keyword,Forest fire near La Ronge Sask. Canada,1
5,no_keyword,All residents asked to 'shelter in place' are ...,1
6,no_keyword,"13,000 people receive #wildfires evacuation or...",1
7,no_keyword,Just got sent this photo from Ruby #Alaska as ...,1


In [10]:
# Re-check missing values now that keywords are filled and location is dropped
tweets.isnull().sum()

keyword    0
text       0
target     0
dtype: int64

In [11]:
# Tokenizer shared by clean_txt. TweetTokenizer keeps a hashtag such as
# "#earthquake" together as one token instead of splitting off the "#".
tweet_tok = TweetTokenizer()

In [12]:
# Text cleaning: lower-case, tokenize, drop stop words / punctuation / very short
# tokens, then stem what is left.

# The stemmer and the exclusion set do not depend on the tweet being cleaned, so
# they are built once here instead of on every call. A frozenset also makes the
# per-token membership test constant-time rather than a scan of a list.
_STEMMER = SnowballStemmer("english")
_EXCLUDED_TERMS = (
    frozenset(stopwords.words("english")) | frozenset(punctuation) | frozenset(["..."])
)


def clean_txt(sent):
    """Normalise one tweet into a space-separated string of stemmed tokens.

    The tweet is lower-cased and split with the notebook-level ``tweet_tok``
    tokenizer. A token is discarded when it is an English stop word, a single
    punctuation character, the literal ``"..."``, or shorter than three
    characters. Every surviving token is reduced to its Snowball (English) stem.

    Parameters
    ----------
    sent : str
        Raw tweet text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    stems = [
        _STEMMER.stem(term)
        for term in tweet_tok.tokenize(sent.lower())
        # Cheap length check first; both conditions are side-effect free.
        if len(term) > 2 and term not in _EXCLUDED_TERMS
    ]
    return " ".join(stems)

In [13]:
# Run the cleaning function over every tweet and keep the result in a new column
tweets['clean_text'] = tweets['text'].map(clean_txt)

In [14]:
# Preview the frame, now including the clean_text column
tweets.head()

Unnamed: 0_level_0,keyword,text,target,clean_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,no_keyword,Our Deeds are the Reason of this #earthquake M...,1,deed reason #earthquak may allah forgiv
4,no_keyword,Forest fire near La Ronge Sask. Canada,1,forest fire near rong sask canada
5,no_keyword,All residents asked to 'shelter in place' are ...,1,resid ask shelter place notifi offic evacu she...
6,no_keyword,"13,000 people receive #wildfires evacuation or...",1,"13,000 peopl receiv #wildfir evacu order calif..."
7,no_keyword,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo rubi #alaska smoke #wildfir pou...


In [15]:
# Separate the model inputs (cleaned tweet text) from the labels (target)
y = tweets['target'].values
X = tweets['clean_text'].values

In [16]:
# Hold out 30% of the tweets for evaluation; the fixed random_state makes the
# split reproducible across runs
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

In [17]:
# Turn each cleaned tweet into a TF-IDF weighted vector. The vocabulary (capped at
# 3500 terms) and the IDF weights are learned from the training tweets only and
# then applied unchanged to the test tweets.
# NOTE(review): TfidfVectorizer's default token_pattern treats punctuation as a
# separator, so the "#" that clean_txt preserved on hashtags is dropped at this
# step -- confirm that is intended.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=3500)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [18]:
# Train a multinomial Naive Bayes classifier on the TF-IDF features of the training tweets
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training can be chained
classifier = MultinomialNB().fit(X_train_tfidf, y_train)
classifier

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [19]:
# Predict a class label for every tweet in the held-out test split
y_pred = classifier.predict(X_test_tfidf)

In [20]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [21]:
# Fraction of test tweets classified correctly, shown as a percentage with two decimals
test_accuracy = accuracy_score(y_test, y_pred)
round(test_accuracy * 100, 2)

81.75

### The model achieves 81.75% accuracy on the held-out test data