<a href="https://www.kaggle.com/code/varrvinter/disaster-tweets-prediction?scriptVersionId=134919928" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import nltk, re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)
df = df.dropna()

df.head()

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,birmingham,@bbcmtd wholesale markets ablaze http://t.co/l...,1
32,49,ablaze,est. september 2012 - bristol,we always try to bring the heavy. #metal #rt h...,0
33,50,ablaze,africa,#africanbaze: breaking news:nigeria flag set a...,1
34,52,ablaze,"philadelphia, pa",crying out for more! set me ablaze,0
35,53,ablaze,"london, uk",on plus side look at the sky last night it was...,0


In [5]:
patterns = [
    r"@[a-zA-Z]+", # User mentions
    r"#+", # Hashtag character
    r"https?://[\S]+", # URLs
    r"[\S]+©[\S]+", # Weird token
    r"[\.\,\?\!\:\;\'\"]{2,}", # Punctuation
    r"[\`\~\%\^\&\*\(\)\-\+\=\_\[\]\{\}\|\\\<\>]+", # Other characters
    r"&amp" # &amp character
]

for pattern in patterns:
    df['text'] = df['text'].apply(lambda x: re.sub(pattern, "", x))

df.head()

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,birmingham,wholesale markets ablaze,1
32,49,ablaze,est. september 2012 - bristol,we always try to bring the heavy. metal rt,0
33,50,ablaze,africa,africanbaze: breaking news:nigeria flag set ab...,1
34,52,ablaze,"philadelphia, pa",crying out for more! set me ablaze,0
35,53,ablaze,"london, uk",on plus side look at the sky last night it was...,0


In [6]:
df['text'] = df['text'].apply(lambda x: word_tokenize(x))

In [7]:
df['keyword'] = df['keyword'].apply(lambda x: word_tokenize(x))

In [8]:
stop_words = set(stopwords.words('english'))

df['text'] = df['text'].apply(lambda tokens: [token for token in tokens if token not in stop_words])

In [9]:
feature = df[['keyword', 'text']]
target = df[['target']]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(feature, target, test_size=0.2, random_state=49)

In [11]:
X_train['combined'] = X_train['keyword'].astype(str) + ' ' + X_train['text'].astype(str)
X_test['combined'] = X_test['keyword'].astype(str) + ' ' + X_test['text'].astype(str)

In [12]:
vectorizer = TfidfVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train['combined'])
y_train_1d = y_train.values.ravel()
nb_model = MultinomialNB(alpha=1.2, fit_prior=False, class_prior=[1.0, 0.8])
nb_model.fit(X_train_vectors, y_train_1d)

In [13]:
X_test_vectors = vectorizer.transform(X_test['combined'])
y_pred = nb_model.predict(X_test_vectors)
y_proba = nb_model.predict_proba(X_test_vectors)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print('Mean Squared Error:', mse)
print('Mean Absolute Error:', mae)
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1)
print('ROC AUC:', roc_auc)

Mean Squared Error: 0.1830708661417323
Mean Absolute Error: 0.1830708661417323
Accuracy: 0.8169291338582677
Precision: 0.8371428571428572
Recall: 0.6943127962085308
F1-score: 0.7590673575129534
ROC AUC: 0.8640692070786858


In [14]:
test_df = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

X_test = test_df[['keyword', 'text']].copy()
X_test.loc[:, 'combined'] = X_test['keyword'].fillna('').values + ' ' + X_test['text'].values

X_test_vectors = vectorizer.transform(X_test['combined'])

y_pred = nb_model.predict(X_test_vectors)

test_pred = pd.DataFrame({'id': test_df['id'], 'target': y_pred})
test_pred.to_csv('predictions.csv', index=False)