# Kaggle - Natural Language Processing with Disaster Tweets

This Python notebook trains and validates a model that detects whether or
not a tweet from Twitter is about a real disaster.

Model: Sentence Transformer embeddings + SVM classifier

Author: Han-Elliot Nguyen<br>Email: hanelliotn@gmail.com

Start date: July 5, 2023<br>End date: July 8, 2023

In [1]:
# Import libraries

import numpy as np
import pandas as pd

from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [2]:
# Load the labelled training tweets from CSV and preview the resulting frame

train_data = pd.read_csv(filepath_or_buffer="./datasets/train.csv")
train_data

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [3]:
# Load the unlabelled competition test tweets from CSV and preview the frame

test_data = pd.read_csv(filepath_or_buffer="./datasets/test.csv")
test_data

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [4]:
# Extract the tweet text and the labels from the data frames as NumPy arrays

y_train = train_data["target"].to_numpy()
X_train = train_data["text"].to_numpy()

# Raw text of the competition test file (this file has no `target` column)
X_test_val = test_data["text"].to_numpy()

In [5]:
# Split the labelled data into a training subset and a held-out test subset.
# The arrays are taken straight from `train_data` instead of from the
# `X_train` / `y_train` variables this cell overwrites, so re-running the cell
# always reproduces the same 80/20 partition rather than re-splitting data
# that has already been split (or already been encoded).

X_train, X_test, y_train, y_test = train_test_split(
    train_data.text.to_numpy(),
    train_data.target.to_numpy(),
    test_size=0.2,    # 20% of the labelled tweets are held out
    random_state=42,  # fixed seed keeps the split reproducible
)

In [6]:
# Transform sentences into embeddings using Sentence Transformers
# (SentenceTransformer is imported with the other libraries in the import cell)

transformer = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# NOTE: each variable below is replaced by its encoded form, so this cell
# expects raw text as input -- re-run the split cell first if it must be re-run.
X_train = transformer.encode(X_train)
X_test = transformer.encode(X_test)
# X_val is the encoded competition test text (X_test_val), not a validation split
X_val = transformer.encode(X_test_val)

In [7]:
# Create the support vector classifier and fit it on the training features

model = SVC(C=1.0)
model.fit(X=X_train, y=y_train)

In [8]:
# Predict the held-out split and display the accuracy score.
# The score is stored under its own name (`accuracy`): binding it to
# `accuracy_score` would shadow the function imported from sklearn.metrics,
# and re-running this cell would then fail with a "not callable" TypeError.

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8279711096520026

In [9]:
# Classify the competition test tweets and pair each prediction with its tweet id

y_pred = model.predict(X_val)
result = test_data[["id"]].assign(target=y_pred)
result

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [10]:
# Write the predictions to the submission CSV, leaving out the row index

result.to_csv(path_or_buf="datasets/submission.csv", index=False)