<a href="https://colab.research.google.com/github/joshi14/NLP_Disaster_tweet/blob/main/NLP_Disaster_tweet_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Processing with Disaster Tweets
Predict which Tweets are about real disasters and which ones are not

---




In [None]:
pip install emoji



In [None]:
import pandas as pd
import numpy as np       
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
nltk.download('wordnet')
nltk.download('words')
nltk.download('stopwords')

import re
import string
import emoji

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Loading the training and testing datasets

In [None]:
# Read the labelled training file into a DataFrame and preview the first rows.
train_df = pd.read_csv(filepath_or_buffer='/content/PMC1377868.txt')
train_df.head(5)

In [None]:
# Read the test file into a DataFrame and preview the first rows.
test_df = pd.read_csv(filepath_or_buffer='/content/test.csv')
test_df.head(5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


# Cleaning and processing the tweets

Functions to clean tweets and remove punctuation.

In [None]:
def cleanTweet(txt):
    """Strip Twitter-specific noise from a single tweet.

    Removes, in this order: @mentions, '#' characters, the "RT : " residue
    that is left once a retweet's mention has been removed, line breaks,
    digits, emojis, URLs and HTML-like tags. Punctuation is left in place.

    Parameters
    ----------
    txt : str
        Tweet text (any case).

    Returns
    -------
    str
        The cleaned text.
    """
    txt = re.sub(r'@[A-Za-z0-9_]+', '', txt)
    txt = re.sub(r'#', '', txt)
    # The text is lower-cased before this function is applied, so the retweet
    # marker must be matched case-insensitively; \b keeps words that merely
    # end in "rt" (e.g. "start : ") intact.
    txt = re.sub(r'\bRT : ', '', txt, flags=re.IGNORECASE)
    # Replace line breaks with a space so that words on adjacent lines are
    # not glued together into one token.
    txt = re.sub(r'\n', ' ', txt)
    txt = ''.join(ch for ch in txt if not ch.isdigit())
    # emoji.get_emoji_regexp() no longer exists in emoji >= 2.0; prefer
    # replace_emoji() and only fall back to the old regexp on older releases.
    if hasattr(emoji, 'replace_emoji'):
        txt = emoji.replace_emoji(txt, replace='')
    else:
        txt = re.sub(emoji.get_emoji_regexp(), r"", txt)
    # A single pattern consumes the whole URL up to the next whitespace, so no
    # query-string or fragment remnants are left behind.
    txt = re.sub(r"https?://\S+|www\.\S+", "", txt)
    txt = re.sub(r"<.*?>", "", txt)
    return txt

def remove_punct(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    drop_map = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_map)

Cleaning the training dataset

In [None]:
# Lower-case the tweets, strip tweet artefacts, then drop punctuation.
train_df['text'] = (
    train_df['text']
    .str.lower()
    .apply(cleanTweet)
    .apply(remove_punct)
)
train_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this earthquake ma...,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all residents asked to shelter in place are be...,1
3,6,,,people receive wildfires evacuation orders in...,1
4,7,,,just got sent this photo from ruby alaska as s...,1
...,...,...,...,...,...
7608,10869,,,two giant cranes holding a bridge collapse int...,1
7609,10870,,,the out of control wild fires in california ...,1
7610,10871,,,m utckm s of volcano hawaii,1
7611,10872,,,police investigating after an ebike collided w...,1


Cleaning the testing dataset

In [None]:
# Same cleaning steps as for the training tweets: lower-case, strip tweet
# artefacts, then drop punctuation.
test_df['text'] = (
    test_df['text']
    .str.lower()
    .apply(cleanTweet)
    .apply(remove_punct)
)
test_df

Unnamed: 0,id,keyword,location,text
0,0,,,just happened a terrible car crash
1,2,,,heard about earthquake is different cities sta...
2,3,,,there is a forest fire at spot pond geese are ...
3,9,,,apocalypse lighting spokane wildfires
4,11,,,typhoon soudelor kills in china and taiwan
...,...,...,...,...
3258,10861,,,earthquake safety los angeles ûò safety faste...
3259,10865,,,storm in ri worse than last hurricane my citya...
3260,10868,,,green line derailment in chicago
3261,10874,,,meg issues hazardous weather outlook hwo


Lemmatizing and removing stopwords

In [None]:
from nltk.stem import WordNetLemmatizer

# Shared tokenizer / lemmatizer instances used by lemmatize_text.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
  """Split ``text`` on whitespace and return the list of lemmatized tokens.

  Every token is lemmatized with part-of-speech tag 'v' (verb).
  """
  lemmas = []
  for token in w_tokenizer.tokenize(text):
    lemmas.append(lemmatizer.lemmatize(token, 'v'))
  return lemmas

In [None]:
from nltk.corpus import stopwords

# English stop words, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Lemmatize each training tweet, then discard the stop words.
train_df['processed_text'] = (
    train_df.text
    .apply(lemmatize_text)
    .apply(lambda tokens: [tok for tok in tokens if tok not in stop_words])
)
train_df

Unnamed: 0,id,keyword,location,text,target,processed_text
0,1,,,our deeds are the reason of this earthquake ma...,1,"[deeds, reason, earthquake, may, allah, forgiv..."
1,4,,,forest fire near la ronge sask canada,1,"[forest, fire, near, la, ronge, sask, canada]"
2,5,,,all residents asked to shelter in place are be...,1,"[residents, ask, shelter, place, notify, offic..."
3,6,,,people receive wildfires evacuation orders in...,1,"[people, receive, wildfires, evacuation, order..."
4,7,,,just got sent this photo from ruby alaska as s...,1,"[get, send, photo, ruby, alaska, smoke, wildfi..."
...,...,...,...,...,...,...
7608,10869,,,two giant cranes holding a bridge collapse int...,1,"[two, giant, crane, hold, bridge, collapse, ne..."
7609,10870,,,the out of control wild fires in california ...,1,"[control, wild, fire, california, even, northe..."
7610,10871,,,m utckm s of volcano hawaii,1,"[utckm, volcano, hawaii]"
7611,10872,,,police investigating after an ebike collided w...,1,"[police, investigate, ebike, collide, car, lit..."


In [None]:
from nltk.corpus import stopwords

# English stop words, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Lemmatize each test tweet, then discard the stop words.
test_df['processed_text'] = (
    test_df.text
    .apply(lemmatize_text)
    .apply(lambda tokens: [tok for tok in tokens if tok not in stop_words])
)
test_df

Unnamed: 0,id,keyword,location,text,processed_text
0,0,,,just happened a terrible car crash,"[happen, terrible, car, crash]"
1,2,,,heard about earthquake is different cities sta...,"[hear, earthquake, different, cities, stay, sa..."
2,3,,,there is a forest fire at spot pond geese are ...,"[forest, fire, spot, pond, geese, flee, across..."
3,9,,,apocalypse lighting spokane wildfires,"[apocalypse, light, spokane, wildfires]"
4,11,,,typhoon soudelor kills in china and taiwan,"[typhoon, soudelor, kill, china, taiwan]"
...,...,...,...,...,...
3258,10861,,,earthquake safety los angeles ûò safety faste...,"[earthquake, safety, los, angeles, ûò, safety..."
3259,10865,,,storm in ri worse than last hurricane my citya...,"[storm, ri, worse, last, hurricane, cityampoth..."
3260,10868,,,green line derailment in chicago,"[green, line, derailment, chicago]"
3261,10874,,,meg issues hazardous weather outlook hwo,"[meg, issue, hazardous, weather, outlook, hwo]"


Removing words that appear only once in the training corpus, and removing single-letter tokens if present

In [None]:
from collections import Counter
from itertools import chain

# Corpus-wide token frequencies, accumulated over the token list of every
# training tweet.
c = Counter()
for tokens in train_df['processed_text']:
    c.update(tokens)

In [None]:
# Keep tokens that are longer than one character and counted more than once
# in `c`; re-join the survivors into a space-separated string.
train_df['processed_text'] = train_df['processed_text'].apply(
    lambda tokens: ' '.join(tok for tok in tokens if len(tok) > 1 and c[tok] > 1)
)
train_df

Unnamed: 0,id,keyword,location,text,target,processed_text
0,1,,,our deeds are the reason of this earthquake ma...,1,deeds reason earthquake may allah forgive us
1,4,,,forest fire near la ronge sask canada,1,forest fire near la canada
2,5,,,all residents asked to shelter in place are be...,1,residents ask shelter place officer evacuation...
3,6,,,people receive wildfires evacuation orders in...,1,people receive wildfires evacuation order cali...
4,7,,,just got sent this photo from ruby alaska as s...,1,get send photo alaska smoke wildfires pour school
...,...,...,...,...,...,...
7608,10869,,,two giant cranes holding a bridge collapse int...,1,two giant crane hold bridge collapse nearby home
7609,10870,,,the out of control wild fires in california ...,1,control wild fire california even northern par...
7610,10871,,,m utckm s of volcano hawaii,1,utckm volcano hawaii
7611,10872,,,police investigating after an ebike collided w...,1,police investigate ebike collide car little po...


In [None]:
# Keep tokens that are longer than one character and counted more than once
# in `c`; re-join the survivors into a space-separated string.
test_df['processed_text'] = test_df['processed_text'].apply(
    lambda tokens: ' '.join(tok for tok in tokens if len(tok) > 1 and c[tok] > 1)
)
test_df

Unnamed: 0,id,keyword,location,text,processed_text
0,0,,,just happened a terrible car crash,happen terrible car crash
1,2,,,heard about earthquake is different cities sta...,hear earthquake different cities stay safe eve...
2,3,,,there is a forest fire at spot pond geese are ...,forest fire spot pond flee across street canno...
3,9,,,apocalypse lighting spokane wildfires,apocalypse light wildfires
4,11,,,typhoon soudelor kills in china and taiwan,typhoon soudelor kill china taiwan
...,...,...,...,...,...
3258,10861,,,earthquake safety los angeles ûò safety faste...,earthquake safety los angeles ûò safety
3259,10865,,,storm in ri worse than last hurricane my citya...,storm worse last hurricane hit look like bomb ...
3260,10868,,,green line derailment in chicago,green line derailment chicago
3261,10874,,,meg issues hazardous weather outlook hwo,meg issue hazardous weather outlook hwo


Removing non-English words from tweets

In [None]:
# Vocabulary of the NLTK `words` corpus, used as the lookup for known words.
words = set(nltk.corpus.words.words())

def clean_sentence(sent):
    """Drop alphabetic tokens of ``sent`` whose lower-cased form is not in ``words``.

    Tokens that are not purely alphabetic are always kept. The surviving
    tokens are returned joined by single spaces.
    """
    kept = []
    for w in nltk.wordpunct_tokenize(sent):
        if not w.isalpha() or w.lower() in words:
            kept.append(w)
    return " ".join(kept)

In [None]:
# Run the vocabulary filter over every processed training tweet.
train_df['processed_text'] = train_df['processed_text'].map(clean_sentence)
train_df

Unnamed: 0,id,keyword,location,text,target,processed_text
0,1,,,our deeds are the reason of this earthquake ma...,1,reason earthquake may forgive us
1,4,,,forest fire near la ronge sask canada,1,forest fire near la canada
2,5,,,all residents asked to shelter in place are be...,1,ask shelter place officer evacuation shelter p...
3,6,,,people receive wildfires evacuation orders in...,1,people receive evacuation order
4,7,,,just got sent this photo from ruby alaska as s...,1,get send photo smoke pour school
...,...,...,...,...,...,...
7608,10869,,,two giant cranes holding a bridge collapse int...,1,two giant crane hold bridge collapse nearby home
7609,10870,,,the out of control wild fires in california ...,1,control wild fire even northern part state tro...
7610,10871,,,m utckm s of volcano hawaii,1,volcano
7611,10872,,,police investigating after an ebike collided w...,1,police investigate collide car little rider su...


In [None]:
# Run the vocabulary filter over every processed test tweet.
test_df['processed_text'] = test_df['processed_text'].map(clean_sentence)
test_df

Unnamed: 0,id,keyword,location,text,processed_text
0,0,,,just happened a terrible car crash,happen terrible car crash
1,2,,,heard about earthquake is different cities sta...,hear earthquake different stay safe everyone
2,3,,,there is a forest fire at spot pond geese are ...,forest fire spot pond flee across street canno...
3,9,,,apocalypse lighting spokane wildfires,apocalypse light
4,11,,,typhoon soudelor kills in china and taiwan,typhoon kill china
...,...,...,...,...,...
3258,10861,,,earthquake safety los angeles ûò safety faste...,earthquake safety  safety
3259,10865,,,storm in ri worse than last hurricane my citya...,storm worse last hurricane hit look like bomb ...
3260,10868,,,green line derailment in chicago,green line derailment
3261,10874,,,meg issues hazardous weather outlook hwo,issue hazardous weather outlook


# Building the model

Splitting the training data 

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled tweets for validation; the fixed random_state
# makes the shuffle repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    train_df['processed_text'],
    train_df['target'],
    test_size=0.20,
    random_state=42,
    shuffle=True,
)

Vectorizing the dataset

In [None]:
tfidfvectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

# Fit on the training split only; the validation and test texts are merely
# transformed with the fitted vectorizer.
tfidf_train = tfidfvectorizer.fit_transform(x_train.values.astype('U'))
tfidf_val, tfidf_test = (
    tfidfvectorizer.transform(texts.values.astype('U'))
    for texts in (x_val, test_df['processed_text'])
)

Using logistic regression to train the model

In [None]:
from sklearn.linear_model import LogisticRegression

# Build the classifier first, then fit it on the TF-IDF training matrix.
logistic_model = LogisticRegression(random_state=0)
logistic_model = logistic_model.fit(tfidf_train, y_train)

Performing cross validation on the model

In [None]:
from sklearn.model_selection import GridSearchCV

# Penalty names use the letter "l" ('l1', 'l2'), not the digit one.
penalty = ['l1', 'l2']
C = np.logspace(0, 4, 18)
# Actually pass the grid to the search (an empty dict evaluates a single
# candidate). The solver is fixed to liblinear because it supports both the
# L1 and the L2 penalty.
hyperparameters = dict(penalty=penalty, C=C, solver=['liblinear'])
# verbose=1 prints a summary only; the grid now holds many candidates and a
# per-fit log would flood the cell output.
clf = GridSearchCV(logistic_model, hyperparameters, cv=5, verbose=1)
best_model = clf.fit(tfidf_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................
[CV] .................................... , score=0.785, total=   0.1s
[CV]  ................................................................
[CV] .................................... , score=0.777, total=   0.1s
[CV]  ................................................................
[CV] .................................... , score=0.796, total=   0.1s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s


[CV] .................................... , score=0.773, total=   0.1s
[CV]  ................................................................
[CV] .................................... , score=0.767, total=   0.1s


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.3s finished


In [None]:
# Scored on tfidf_train / y_train, i.e. the same data passed to the scorer's
# training call above, so this is the accuracy on the training split.
train_accuracy = clf.score(tfidf_train, y_train)
print("Model's accuracy ", train_accuracy)

Model's accuracy  0.851888341543514


Predicting on the held-out validation split of the training data to check accuracy

In [None]:
from sklearn.metrics import accuracy_score

# Predict labels for the validation matrix and compare them with the truth.
y_pred = logistic_model.predict(tfidf_val)
accuracy_score(y_true=y_val, y_pred=y_pred)

0.7826657912015759

Using the model to predict for testing data

In [None]:
# Load the submission template and overwrite its "target" column with the
# model's predictions for the test matrix.
submission_df = pd.read_csv("/content/sample_submission.csv").assign(
    target=logistic_model.predict(tfidf_test)
)
submission_df

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [None]:
# Write the predictions to disk without the DataFrame index column.
submission_df.to_csv(path_or_buf="submission.csv", index=False)