In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK resources used later in the notebook: the stop-word list and
# the Punkt sentence-tokenizer data that word_tokenize relies on.
# NLTK >= 3.9 looks for the tokenizer data under the id 'punkt_tab', older
# releases under 'punkt', so both are requested. With the default
# raise_on_error=False an id unknown to the installed release is only
# reported, not raised.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re


# Read in files

In [None]:
# Load the training and test CSV files into pandas DataFrames
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [None]:
# Preview 10 random training rows; the fixed seed keeps the preview identical
# across Restart & Run All instead of changing on every execution.
train.sample(10, random_state=42)

Unnamed: 0,id,keyword,location,text,target
2871,4127,drought,Meereen,Pizza drought is over I just couldn't anymore...,0
5378,7675,panic,"Elsewhere, NZ",Lose bus card.\nPanic.\nKind bus driver.\nRepl...,0
2592,3721,destroyed,,Russian customs destroyed a total of 319 tons ...,0
4765,6781,lightning,Elchilicitanierraversal,#NowPlaying 'The Lightning Strike' de Snow Pat...,0
5084,7251,nuclear%20disaster,,If i tweet daily #Fukushima #Japan global nucl...,1
4180,5937,hazard,Australia,#Lifestyle Û÷It makes me sickÛª: Baby clothe...,0
5156,7355,obliterate,United Kingdom,@klavierstuk doesn't so LVG is forced into the...,0
484,700,attacked,"Los Angeles, CA",@envw98 @NickCoCoFree @JulieDiCaro @jdabe80 I ...,0
4716,6706,lava,"Medan,Indonesia",@YoungHeroesID Lava Blast &amp; Power Red #Pan...,0
5249,7505,oil%20spill,"Las Vegas, Nevada",Refugio oil spill may have been costlier bigge...,1


In [None]:
# Preview 10 random test rows; the fixed seed keeps the preview identical
# across Restart & Run All instead of changing on every execution.
test.sample(10, random_state=42)

Unnamed: 0,id,keyword,location,text
2425,8107,rescued,"London, UK",So @edsheeran just rescued this year's #Fusion...
83,286,ambulance,World,2 held with heroin in ambulance http://t.co/d9...
2934,9720,tragedy,"SÌ©te, France, (foto c.1968)",@TEDMED\nI remember my friend Jeff Weisberg sa...
1886,6357,hostages,China,#hot C-130 specially modified to land in a st...
719,2338,collapse,Los Angeles,Technical Collapse -&gt; http://t.co/BfJB5H4t...
1273,4190,drown,East Carolina University'19 ??,Fuck around and drown ???? https://t.co/fr5z9W...
2200,7364,obliterate,trashcan somewhere in hell,@MilesWithTate but seeing the writers oblitera...
2744,9138,suicide%20bomb,Nigeria,#Bestnaijamade: 16yr old PKK suicide bomber wh...
2041,6865,mass%20murder,The Three Broomsticks,Watching Murder in the First and you get the p...
2800,9312,survive,"Lima, Peru",#ScienceDaily Parental experience may help cor...


# Preprocessing

In [None]:
def preprocess_text(text):
    """Lowercase a tweet and strip everything except ASCII letters and whitespace.

    Args:
        text: Raw tweet text. A missing value (None, or a float NaN as pandas
            uses for empty CSV fields) is treated as an empty tweet.

    Returns:
        str: The lowercased text with digits, punctuation and every other
        non-letter, non-whitespace character deleted. Whitespace is kept
        exactly as it was, so runs of spaces and newlines survive.
    """
    # A missing value is not a str and has no .lower(); map it to "" so one
    # empty row cannot abort a whole DataFrame.apply pass. `text != text` is
    # true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    # After lowercasing, a-z is the only letter range that can still match.
    # Characters are deleted rather than replaced by a space, so a URL such as
    # "http://t.co/ab1" collapses into the single token "httptcoab".
    return re.sub(r'[^a-z\s]', '', text.lower())

# Use the NLTK English stop-word list to remove generally meaningless words
_ENGLISH_STOP_WORDS = None  # loaded lazily by _english_stop_words()


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOP_WORDS


def remove_stopwords(tokens, stop_words=None):
    """Drop stop words from a list of tokens, preserving order.

    Args:
        tokens: Iterable of word tokens (compared as-is, so they should
            already be lowercased to match the NLTK list).
        stop_words: Optional collection of words to remove. When omitted,
            the NLTK English stop-word list is used; it is loaded on the
            first call and reused afterwards instead of being re-read for
            every tweet.

    Returns:
        list: The tokens that are not in ``stop_words``.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    return [word for word in tokens if word not in stop_words]

# Reduce every token to its Porter stem
def perform_stemming(tokens):
    """Return a new list with the Porter stem of each token, in the same order."""
    stem = PorterStemmer().stem
    return list(map(stem, tokens))

In [None]:
# Preprocessing: run the same cleaning pipeline over the train & test text.
# Missing-value handling of the other columns is kept as before: empty strings
# become NaN in train, NaN becomes "" in test.
train = train.replace('', np.nan)
test = test.fillna("")

# One loop instead of eight copy-pasted assignments, so both frames are
# guaranteed to go through identical steps. The text column is filled first so
# the text helpers always receive a str; an empty tweet simply yields an empty
# token list.
for frame in (train, test):
    frame["text"] = (
        frame["text"]
        .fillna("")
        .apply(preprocess_text)   # lowercase, keep letters/whitespace only
        .apply(word_tokenize)     # str -> list of word tokens
        .apply(remove_stopwords)  # drop English stop words
        .apply(perform_stemming)  # Porter-stem each remaining token
    )

In [None]:
# First five training rows; the text column now holds lists of stemmed tokens
train.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,"[deed, reason, earthquak, may, allah, forgiv, us]",1
1,4,,,"[forest, fire, near, la, rong, sask, canada]",1
2,5,,,"[resid, ask, shelter, place, notifi, offic, ev...",1
3,6,,,"[peopl, receiv, wildfir, evacu, order, califor...",1
4,7,,,"[got, sent, photo, rubi, alaska, smoke, wildfi...",1


In [None]:
# First five test rows; the text column now holds lists of stemmed tokens
test.head(n=5)

Unnamed: 0,id,keyword,location,text
0,0,,,"[happen, terribl, car, crash]"
1,2,,,"[heard, earthquak, differ, citi, stay, safe, e..."
2,3,,,"[forest, fire, spot, pond, gees, flee, across,..."
3,9,,,"[apocalyps, light, spokan, wildfir]"
4,11,,,"[typhoon, soudelor, kill, china, taiwan]"


Change the text column from word tokens back into a string

In [None]:
# Collapse each token list back into a single space-separated string
for frame in (train, test):
    frame["text"] = frame["text"].map(" ".join)

In [None]:
# The text column holds plain strings again (as in the raw data), now in preprocessed form.
train.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,deed reason earthquak may allah forgiv us,1
1,4,,,forest fire near la rong sask canada,1
2,5,,,resid ask shelter place notifi offic evacu she...,1
3,6,,,peopl receiv wildfir evacu order california,1
4,7,,,got sent photo rubi alaska smoke wildfir pour ...,1


# Set up TF-IDF Vectorization

Use term frequency–inverse document frequency (TF-IDF), a common NLP weighting scheme for text data, to capture how important each word is within the tweets, so that those word weights can be related to each tweet's classification (target).

Then use Multinomial Naive Bayes for the final classification: fit the model on the training split and evaluate it on the validation split.

In [None]:
# Cap the TF-IDF vocabulary at 5000 terms
MAX_FEATURES = 5000
vectorizer = TfidfVectorizer(max_features=MAX_FEATURES)

# y holds the target labels; x is the TF-IDF matrix of the preprocessed tweets.
# NOTE(review): the vectorizer is fit on every training row before the
# train/validation split, so validation text contributes to the vocabulary and
# IDF weights - validation scores may be slightly optimistic.
y = train["target"]
x = vectorizer.fit_transform(train["text"])

In [None]:
# x and y are already built in the previous cell; re-fitting the vectorizer
# here only repeated that work. Check instead that every label has exactly one
# feature row, then show the matrix dimensions (rows, vocabulary size).
assert x.shape[0] == len(y), "feature matrix and labels are misaligned"
x.shape

In [None]:
# Carve a validation set out of the training data: 20% of the rows are held
# out, and the fixed random_state makes the shuffle repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Train model with Naive Bayes

In [None]:
# Fit a multinomial Naive Bayes classifier (library defaults) on the training split
model = MultinomialNB()
model.fit(X=x_train, y=y_train)


In [None]:
# Label the held-out validation rows with the fitted model
y_pred = model.predict(X=x_val)

# Model Evaluation

In [None]:
# Compute both validation metrics first: overall accuracy and the
# per-class precision / recall / F1 report.
accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)

# Then print them together
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", report)

Accuracy: 0.80
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.89      0.84       874
           1       0.82      0.69      0.75       649

    accuracy                           0.80      1523
   macro avg       0.81      0.79      0.80      1523
weighted avg       0.81      0.80      0.80      1523



# Predict on Test Data

In [None]:
# Reuse the already-fitted vectorizer (transform only, no re-fit) so the test
# tweets are encoded with the vocabulary learned from the training text,
# then classify them.
x_test = vectorizer.transform(test["text"])
test_predictions = model.predict(X=x_test)

In [None]:
# Assemble the prediction table: tweet id, preprocessed tweet text, predicted label
test_predictions_submission = (
    test[["id", "text"]]
    .rename(columns={"text": "tweet"})
    .assign(target=test_predictions)
)

In [None]:
# First five rows of the prediction table
test_predictions_submission.head(n=5)

Unnamed: 0,id,tweet,target
0,0,happen terribl car crash,1
1,2,heard earthquak differ citi stay safe everyon,1
2,3,forest fire spot pond gees flee across street ...,1
3,9,apocalyps light spokan wildfir,1
4,11,typhoon soudelor kill china taiwan,1
