In [1]:
pip install nltk

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import StratifiedKFold
import re
import nltk
from nltk.corpus import stopwords

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
df = pd.read_csv('kaggle_parsed_dataset.csv')

In [5]:
def preprocess_text(text):
    text = text.lower()  # Lowercase the text
    text = re.sub(r'\W', ' ', text)  # Remove non-word characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'\d', '', text)  # Remove digits
    return text.strip()

In [6]:
df['cleaned_text'] = df['Text'].apply(preprocess_text)

In [7]:
X = df['cleaned_text']
y = df['oh_label']

In [8]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words=stopwords.words('english'))
X_tfidf = tfidf_vectorizer.fit_transform(X)

In [9]:
model = LogisticRegression()

In [10]:
cv = StratifiedKFold(n_splits=5)
cross_val_scores = cross_val_score(model, X_tfidf, y, cv=cv, scoring='accuracy')

In [11]:
print("Cross-Validation Accuracy Scores:", cross_val_scores)
print("Mean Accuracy:", np.mean(cross_val_scores))

Cross-Validation Accuracy Scores: [0.80625    0.80681818 0.77556818 0.77954545 0.7998863 ]
Mean Accuracy: 0.793613623443072


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [14]:
print("\nFinal Model Evaluation:")
print("Accuracy on test data:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test,y_pred))


Final Model Evaluation:
Accuracy on test data: 0.7869318181818182

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.96      0.86      1178
           1       0.84      0.44      0.58       582

    accuracy                           0.79      1760
   macro avg       0.81      0.70      0.72      1760
weighted avg       0.80      0.79      0.76      1760

