In [4]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
import gc

# Input locations (absolute paths on the author's machine)
train_file_path = r"C:/Users/harid/Downloads/TRAINING_DATA.txt"
real_file_path = r"C:/Users/harid/Downloads/REAL_DATA.txt"

# Training set: headerless, tab-separated rows read into "Label" and "Text"
# columns; the labels are then cast to integers
train_data = pd.read_csv(
    train_file_path,
    sep="\t",
    header=None,
    names=["Label", "Text"],
    encoding="utf-8",
)
train_data["Label"] = train_data["Label"].astype(int)

# Unlabelled set: read the raw lines, substituting undecodable bytes rather
# than failing on them
with open(real_file_path, "r", encoding="utf-8", errors="replace") as file:
    lines = list(file)

# Trim surrounding whitespace and discard lines that end up empty
cleaned_lines = [stripped for stripped in map(str.strip, lines) if stripped]

# Keep only the text of each row: everything after the first tab, or the
# whole line when it contains no tab at all
real_texts = [entry.split("\t", 1)[-1] for entry in cleaned_lines]

# One-column frame holding the sentences to classify
real_data = pd.DataFrame({"Text": real_texts})

# Text preprocessing function
# Deletion table covering every character of string.punctuation. Built once so
# each call is a single pass, and so no character has to be escaped for a
# regex character class (an unescaped class silently keeps the backslash).
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)
_DIGIT_RUN = re.compile(r"\d+")


def preprocess_text(text):
    """Lower-case *text* and remove ASCII punctuation and digit runs.

    Only the characters listed in ``string.punctuation`` are removed, so
    non-ASCII marks are left untouched. Whitespace is not collapsed: removing
    a token can leave adjacent or trailing spaces behind.

    Args:
        text: The sentence to normalise. Anything that is not a ``str``
            (for example ``None`` or a NaN cell from pandas) is treated as
            missing.

    Returns:
        The normalised string, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = text.translate(_PUNCTUATION_TABLE)
    return _DIGIT_RUN.sub("", text)

# Normalise every training sentence and store the result in its own column
train_data["Cleaned_Text"] = train_data["Text"].apply(preprocess_text)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    train_data["Cleaned_Text"],
    train_data["Label"],
    test_size=0.2,
    random_state=42,
)

# TF-IDF over word 1- to 3-grams, limited to 30000 features.
# NOTE(review): stop_words selects the English list, while the sample rows
# printed from REAL_DATA are Spanish - confirm this matches the training text.
tfidf_options = {
    "ngram_range": (1, 3),
    "stop_words": "english",
    "max_features": 30000,
}
vectorizer = TfidfVectorizer(**tfidf_options)

# The vocabulary is learned from the training split only; the held-out split
# is merely projected onto it
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Ask the garbage collector to reclaim unreachable objects before training
gc.collect()

# Hyperparameter grid for XGBoost. Every entry holds a single value, so the
# search below amounts to a 5-fold cross-validation of one configuration.
# tree_method='hist' together with device='cuda' requests GPU training.
param_grid = {
    'n_estimators': [300],
    'learning_rate': [0.1],
    'max_depth': [7],
    'tree_method': ['hist'],
    'device': ['cuda']
}

# Stratified, shuffled folds with a fixed seed so fold assignment is repeatable
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validate the grid, scoring each fold by accuracy.
# NOTE(review): n_jobs=-1 runs the fold fits in parallel while every fit
# targets the same CUDA device - confirm the GPU has room for that, otherwise
# lower n_jobs.
grid_search = GridSearchCV(XGBClassifier(), param_grid, cv=stratified_kfold, scoring="accuracy", n_jobs=-1, verbose=2)
grid_search.fit(X_train_tfidf, y_train)

# GridSearchCV refits the winning configuration on the whole training matrix
# by default (refit=True), so best_estimator_ is already trained. Calling
# fit() on it again would only repeat the same training run.
classifier = grid_search.best_estimator_

# Score the trained classifier on the held-out split
predictions = classifier.predict(X_test_tfidf)

# Overall accuracy first, then the per-class precision/recall/F1 table
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")
report = classification_report(y_test, predictions)
print(report)

# Function to classify new sentences
def classify_sentences(sentences):
    """Predict a label for each raw sentence in *sentences*.

    Each sentence goes through ``preprocess_text``, is projected with the
    module-level ``vectorizer`` and is then scored by the module-level
    ``classifier``; both must already be fitted when this is called.

    Args:
        sentences: An iterable of raw sentences.

    Returns:
        Whatever ``classifier.predict`` returns for the transformed
        sentences, one prediction per input sentence.
    """
    cleaned = list(map(preprocess_text, sentences))
    features = vectorizer.transform(cleaned)
    return classifier.predict(features)

# Label every sentence of the unlabelled set and preview the first rows
real_predictions = classify_sentences(real_data["Text"])
real_data["Predicted_Label"] = real_predictions
print(real_data.head())

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Accuracy: 0.5031825795644891
              precision    recall  f1-score   support

           0       0.53      0.12      0.20      1504
           1       0.50      0.89      0.64      1481

    accuracy                           0.50      2985
   macro avg       0.51      0.51      0.42      2985
weighted avg       0.52      0.50      0.42      2985

                                                Text  Predicted_Label
0  Yo no creo que a nadie le haya encantado un pe...                1
1  No va a resolver sus problemas de crédito o me...                1
2                                Te encantará este !                1
3  Yo estaba a volar a un aeropuerto varias horas...                1
4  ( Maid En Manhattan , The Wedding Planner , Je...                1
