In [14]:
import jupyter_black

jupyter_black.load(line_length=120)

In [15]:
import pandas as pd
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# import nltk
# nltk.download("stopwords")

# Data preparation

In [16]:
stop_words = set(stopwords.words("english"))


def count_stop_words(text):
    words = text.split()
    stop_words_count = sum(1 for word in words if word in stop_words)
    return stop_words_count


def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "<url>", text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r"\@\w+", "<mention>", text)  # Remove mentions
    text = re.sub(r"|\#", "", text)  # Remove hashtags
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text_tokens = text.split()
    filtered_words = [w for w in text_tokens if not w in stop_words]
    return " ".join(filtered_words)


def has_url(text):
    return 1 if re.search(r"http\S+|www\S+|https\S+", text) else 0

In [17]:
df = pd.read_csv("datasets/train.csv")

df["text_length"] = df["text"].astype(str).apply(len)
df["word_count"] = df["text"].astype(str).apply(lambda x: len(x.split()))
df["stop_words_count"] = df["text"].apply(count_stop_words)
df["has_url"] = df["text"].apply(has_url)
df["clean_text"] = df["text"].apply(clean_text)

In [18]:
df.describe()

Unnamed: 0,id,target,text_length,word_count,stop_words_count,has_url
count,7613.0,7613.0,7613.0,7613.0,7613.0,7613.0
mean,5441.934848,0.42966,101.037436,14.903586,3.761198,0.522265
std,3137.11609,0.49506,33.781325,5.732604,3.20863,0.499537
min,1.0,0.0,7.0,1.0,0.0,0.0
25%,2734.0,0.0,78.0,11.0,1.0,0.0
50%,5408.0,0.0,107.0,15.0,3.0,1.0
75%,8146.0,1.0,133.0,19.0,6.0,1.0
max,10873.0,1.0,157.0,31.0,18.0,1.0


In [19]:
df.head()

Unnamed: 0,id,keyword,location,text,target,text_length,word_count,stop_words_count,has_url,clean_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,69,13,5,0,deeds reason earthquake may allah forgive us
1,4,,,Forest fire near La Ronge Sask. Canada,1,38,7,0,0,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,133,22,9,0,residents asked shelter place notified officer...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,65,8,1,0,13000 people receive wildfires evacuation orde...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,88,16,6,0,got sent photo ruby alaska smoke wildfires pou...


# Vectorization

In [20]:
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

## count vectorizer

In [21]:
count_vectorizer = CountVectorizer(
    lowercase=True,
    stop_words="english",
    max_features=1000,
    ngram_range=(1, 2),
)
X_train = count_vectorizer.fit_transform(df_train["clean_text"])
X_test = count_vectorizer.transform(df_test["clean_text"])

## TF-IDF

In [22]:
tfidf_vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    max_features=1000,
    ngram_range=(1, 2),
)
X_train_tfidf = tfidf_vectorizer.fit_transform(df_train["clean_text"])
X_test_tfidf = tfidf_vectorizer.transform(df_test["clean_text"])

# Training

## SVC - CountVectorizer

In [23]:
svm = SVC(kernel="linear", C=1.0, random_state=42)
svm.fit(X_train, df_train["target"])
y_pred = svm.predict(X_test)

print(classification_report(df_test["target"], y_pred))
print("Accuracy:", accuracy_score(df_test["target"], y_pred))

              precision    recall  f1-score   support

           0       0.79      0.86      0.82       874
           1       0.79      0.68      0.73       649

    accuracy                           0.79      1523
   macro avg       0.79      0.77      0.78      1523
weighted avg       0.79      0.79      0.78      1523

Accuracy: 0.7872619829284307


## XGBoost - CountVectorizer

In [24]:
bst = XGBClassifier()
bst.fit(X_train, df_train["target"])
preds = bst.predict(X_test)

print(classification_report(df_test["target"], preds))
print("Accuracy:", accuracy_score(df_test["target"], preds))

              precision    recall  f1-score   support

           0       0.76      0.90      0.82       874
           1       0.82      0.61      0.70       649

    accuracy                           0.78      1523
   macro avg       0.79      0.75      0.76      1523
weighted avg       0.78      0.78      0.77      1523

Accuracy: 0.7760998030203545


## SVC - TF-IDF

In [25]:
svm = SVC(kernel="linear", C=1.0, random_state=42)
svm.fit(X_train_tfidf, df_train["target"])
y_pred = svm.predict(X_test_tfidf)

print(classification_report(df_test["target"], y_pred))
print("Accuracy:", accuracy_score(df_test["target"], y_pred))

              precision    recall  f1-score   support

           0       0.78      0.86      0.82       874
           1       0.78      0.68      0.73       649

    accuracy                           0.78      1523
   macro avg       0.78      0.77      0.77      1523
weighted avg       0.78      0.78      0.78      1523

Accuracy: 0.7820091923834537


## XGBoost - TF-IDF

In [26]:
bst = XGBClassifier()
bst.fit(X_train_tfidf, df_train["target"])
preds = bst.predict(X_test_tfidf)

print(classification_report(df_test["target"], preds))
print("Accuracy:", accuracy_score(df_test["target"], preds))

              precision    recall  f1-score   support

           0       0.76      0.88      0.82       874
           1       0.80      0.63      0.70       649

    accuracy                           0.77      1523
   macro avg       0.78      0.75      0.76      1523
weighted avg       0.78      0.77      0.77      1523

Accuracy: 0.7728168089297439
