In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# File paths (relative to the current working directory)
# Raw input handed to convert_txt_to_csv, which parses it as tab-separated
# "Label<TAB>Text" records.
training_txt = "./TRAINING_DATA.txt"
# Destination of the CSV copy written by convert_txt_to_csv.
training_csv = "./TRAINING_DATA.csv"

# Function to convert TXT to CSV
def convert_txt_to_csv(input_file, output_file):
    """Read a tab-separated ``Label<TAB>Text`` file and save it as a CSV.

    Args:
        input_file: Path of the headerless, tab-separated UTF-8 text file.
        output_file: Path of the CSV file to write (UTF-8, no index column).

    Returns:
        The parsed DataFrame with columns ``Label`` and ``Text``, or ``None``
        if reading or writing failed (the error is printed, not raised).
    """
    reader_options = {
        "sep": "\t",
        "header": None,
        "names": ["Label", "Text"],
        "encoding": "utf-8",
        # 3 is csv.QUOTE_NONE: quote characters are kept as literal text so
        # unbalanced quotes inside a sentence do not break parsing.
        "quoting": 3,
        # Malformed lines are dropped silently instead of aborting the load.
        "on_bad_lines": 'skip',
    }

    try:
        frame = pd.read_csv(input_file, **reader_options)
        frame.to_csv(output_file, index=False, encoding="utf-8")
        print(f"File successfully converted and saved to: {output_file}")
    except Exception as err:
        # Best-effort by design: callers test the result against None.
        print(f"Error processing {input_file}: {err}")
        return None

    return frame

# Convert dataset: parse the TXT file, write the CSV copy and keep the parsed
# DataFrame. convert_txt_to_csv returns None on failure, so the steps below
# check train_df before using it.
train_df = convert_txt_to_csv(training_txt, training_csv)

# Data Cleaning & Exploratory Data Analysis (EDA)
def explore_data(df, name):
    """Print a summary of ``df`` and plot its word-count histogram and word cloud.

    Args:
        df: DataFrame with a ``Text`` column. A ``word_count`` column is
            added to (or overwritten on) this frame in place.
        name: Dataset name used in the printed header and the plot titles.

    Returns:
        None. Output goes to stdout and to matplotlib figures.
    """
    print(f"\n{name} Dataset Overview")
    print("-" * 50)
    # DataFrame.info() writes the summary to stdout itself and returns None;
    # wrapping it in print() would append a stray "None" line.
    df.info()
    print("\nFirst 5 rows:")
    print(df.head())
    print("\nMissing Values:")
    print(df.isnull().sum())

    # Word count distribution. A missing text has zero words; str(NaN) would
    # otherwise be counted as the single word "nan".
    df['word_count'] = df['Text'].apply(
        lambda text: 0 if pd.isna(text) else len(str(text).split())
    )
    plt.figure(figsize=(6, 4))
    sns.histplot(df['word_count'], bins=30, kde=True)
    plt.title(f"Word Count Distribution in {name} Dataset")
    plt.xlabel("Number of Words")
    plt.ylabel("Frequency")
    plt.show()

    # Word cloud
    text_corpus = ' '.join(df['Text'].dropna().astype(str))
    try:
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text_corpus)
    except ValueError as exc:
        # generate() raises ValueError when the corpus yields no words to
        # draw (e.g. an empty dataset); skip the plot instead of crashing.
        print(f"Skipping word cloud for {name} Dataset: {exc}")
        return
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(f"Word Cloud for {name} Dataset")
    plt.show()

# Run EDA (skipped when the conversion failed and train_df is None)
if train_df is not None:
    explore_data(train_df, "Training")

# Text Classification Model with Enhanced Hyperparameter Tuning
def build_and_evaluate_model(df, *, stop_words=None):
    """Tune and evaluate a TF-IDF + Multinomial Naive Bayes text classifier.

    Rows with a missing ``Text`` or ``Label`` are dropped, the rest is split
    80/20 (stratified by label, ``random_state=42``), a 5-fold grid search is
    run on the training part, and the best pipeline is scored on the held-out
    part. Best parameters, accuracy and the classification report are
    printed.

    Args:
        df: DataFrame with ``Text`` (documents) and ``Label`` (targets)
            columns.
        stop_words: Forwarded to ``TfidfVectorizer``. Defaults to ``None``
            (no stop-word removal): the built-in ``'english'`` list does not
            fit a non-English corpus such as the Spanish training texts.
            Pass a list of words to filter a specific language.

    Returns:
        The best pipeline found by the grid search (``best_estimator_``).
    """
    # TfidfVectorizer rejects NaN documents, so drop incomplete rows up front
    # and make sure every remaining document is a string.
    data = df.dropna(subset=['Text', 'Label'])
    X = data['Text'].astype(str)
    y = data['Label']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words=stop_words)),
        ('clf', MultinomialNB())
    ])

    param_grid = {
        'tfidf__max_df': [0.5, 0.75, 1.0],
        'tfidf__min_df': [1, 3, 5],
        'tfidf__ngram_range': [(1,1), (1,2), (2,2)],
        'clf__alpha': [0.01, 0.1, 1.0, 5.0, 10.0]
    }

    # Only the training split is used for tuning; the test split stays unseen
    # until the final evaluation below.
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    print("Best Parameters:", grid_search.best_params_)

    best_model = grid_search.best_estimator_

    y_pred = best_model.predict(X_test)

    print("\nModel Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    # Hand the fitted pipeline back so callers can reuse it for predictions.
    return best_model

# Build and evaluate the classifier (skipped when the conversion failed and
# train_df is None)
if train_df is not None:
    build_and_evaluate_model(train_df)


# **Results**

RangeIndex: 17877 entries, 0 to 17876
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   17877 non-null  int64 
 1   Text    17877 non-null  object
dtypes: int64(1), object(1)
memory usage: 279.5+ KB
None

First 5 rows:
   Label                                               Text
0      1  Cuando conocí a Janice en 2013 , una familia n...
1      0  Hwang habló en Sur de este año por Southwest M...
2      1  Usted podría pensar Katy Perry y Robert Pattin...
3      1  Cualquiera que haya volado los cielos del crea...
4      1  Bueno , este cantante tendrá un LARGO tiempo p...

Missing Values:
Label    0
Text     0
dtype: int64


Fitting 5 folds for each of 135 candidates, totalling 675 fits
Best Parameters: {'clf__alpha': 10.0, 'tfidf__max_df': 0.5, 'tfidf__min_df': 5, 'tfidf__ngram_range': (1, 1)}

Model Accuracy: 0.44351230425055926

Classification Report:
               precision    recall  f1-score   support

           0       0.45      0.53      0.49      1788
           1       0.43      0.35      0.39      1788

    accuracy                           0.44      3576
   macro avg       0.44      0.44      0.44      3576
weighted avg       0.44      0.44      0.44      3576