In [1]:
import pandas as pd

In [2]:
data=pd.read_csv("disaster-tweet.csv")
data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
data.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [4]:
data.drop(["location"],axis=1,inplace=True)
data.head()

Unnamed: 0,id,keyword,text,target
0,1,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,Forest fire near La Ronge Sask. Canada,1
2,5,,All residents asked to 'shelter in place' are ...,1
3,6,,"13,000 people receive #wildfires evacuation or...",1
4,7,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
data=data.dropna()
data=data.reset_index()

In [6]:
data["text"]=data["text"].apply(lambda x:x.lower())

In [7]:
data.head()

Unnamed: 0,index,id,keyword,text,target
0,31,48,ablaze,@bbcmtd wholesale markets ablaze http://t.co/l...,1
1,32,49,ablaze,we always try to bring the heavy. #metal #rt h...,0
2,33,50,ablaze,#africanbaze: breaking news:nigeria flag set a...,1
3,34,52,ablaze,crying out for more! set me ablaze,0
4,35,53,ablaze,on plus side look at the sky last night it was...,0


In [8]:
data["text"][1]

'we always try to bring the heavy. #metal #rt http://t.co/yao1e0xngw'

In [9]:
data["keyword"][1]

'ablaze'

In [10]:
data["keyword"].value_counts()

fatalities               45
deluge                   42
armageddon               42
damage                   41
harm                     41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: keyword, Length: 221, dtype: int64

In [11]:
!pip install nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.stem import PorterStemmer
import string
import re



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
def data_processing(text):
    text= text.lower()
    text = re.sub('<br />', '', text)
    text = re.sub(r"https\S+|www\S+|http\S+", '', text, flags = re.MULTILINE)
    text = re.sub(r'\@w+|\#', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text_tokens = word_tokenize(text)
    stop_words=stopwords.words("english")
    filtered_text = [w for w in text_tokens if not w in stop_words]
    return " ".join(filtered_text)

In [13]:
data.text = data['text'].apply(data_processing)

In [14]:
data.duplicated().sum()

0

In [15]:
stemmer = PorterStemmer()
def stemming(data):
    text = [stemmer.stem(word) for word in data]
    return data
data.text = data['text'].apply(lambda x: stemming(x))

In [16]:
data.head()

Unnamed: 0,index,id,keyword,text,target
0,31,48,ablaze,bbcmtd wholesale markets ablaze,1
1,32,49,ablaze,always try bring heavy metal rt,0
2,33,50,ablaze,africanbaze breaking newsnigeria flag set abla...,1
3,34,52,ablaze,crying set ablaze,0
4,35,53,ablaze,plus side look sky last night ablaze,0


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [18]:
data.drop(["index","id","keyword"],axis=1,inplace=True)
data.head()

Unnamed: 0,text,target
0,bbcmtd wholesale markets ablaze,1
1,always try bring heavy metal rt,0
2,africanbaze breaking newsnigeria flag set abla...,1
3,crying set ablaze,0
4,plus side look sky last night ablaze,0


In [19]:
vect = TfidfVectorizer()
X=vect.fit_transform(data["text"])
y=data["target"]

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [22]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
logreg_pred = logreg.predict(X_test)
logreg_acc = accuracy_score(logreg_pred, y_test)
print("Test accuracy: {:.2f}%".format(logreg_acc*100))

Test accuracy: 80.41%


In [23]:
svc = LinearSVC()
svc.fit(X_train, y_train)
svc_pred = svc.predict(X_test)
svc_acc = accuracy_score(svc_pred, y_test)
print("Test accuracy: {:.2f}%".format(svc_acc*100))

Test accuracy: 79.26%


In [24]:
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
mnb_pred = mnb.predict(X_test)
mnb_acc = accuracy_score(mnb_pred, y_test)
print("Test accuracy: {:.2f}%".format(mnb_acc*100))

Test accuracy: 80.19%


In [25]:
from sklearn.model_selection import GridSearchCV

In [26]:
param_grid = {'C':[0.1, 1, 10, 100], 'loss':['hinge', 'squared_hinge']}
grid = GridSearchCV(svc, param_grid, refit=True)
grid.fit(X_train, y_train)



GridSearchCV(estimator=LinearSVC(),
             param_grid={'C': [0.1, 1, 10, 100],
                         'loss': ['hinge', 'squared_hinge']})

In [27]:
grid.best_params_

{'C': 1, 'loss': 'hinge'}

In [28]:
svc = LinearSVC(C = 1, loss='hinge')
svc.fit(X_train, y_train)
svc_pred = svc.predict(X_test)
svc_acc = accuracy_score(svc_pred, y_test)
print("Test accuracy: {:.2f}%".format(svc_acc*100))

Test accuracy: 80.23%


In [42]:
test_data=pd.read_csv("disaster-tweet-test.csv")
sample=test_data.copy()
test_data.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [43]:
test_data.drop(["id","keyword","location"],axis=1,inplace=True)
test_data.text =test_data['text'].apply(data_processing)
test_data.text = test_data['text'].apply(lambda x: stemming(x))
test_data=vect.transform(test_data["text"])
prediction=svc.predict(test_data)

In [44]:
prediction

array([1, 1, 1, ..., 1, 1, 0], dtype=int64)

In [45]:
sample["outcome"]=prediction
sample

Unnamed: 0,id,keyword,location,text,outcome
0,0,,,Just happened a terrible car crash,1
1,2,,,"Heard about #earthquake is different cities, s...",1
2,3,,,"there is a forest fire at spot pond, geese are...",1
3,9,,,Apocalypse lighting. #Spokane #wildfires,0
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,1
...,...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,1
3259,10865,,,Storm in RI worse than last hurricane. My city...,1
3260,10868,,,Green Line derailment in Chicago http://t.co/U...,1
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...,1
