<a href="https://colab.research.google.com/github/jokefun022/Google-Colab/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==============================#
#  Colab Setup & Dependencies   #
# ==============================#
!pip -q install -U scikit-learn imbalanced-learn emoji transformers datasets accelerate xgboost
!pip -q install -U tensorflow  # Colab usually has TF preinstalled; this ensures recent 2.x

import os, re, math, json, random
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from collections import Counter, defaultdict

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.calibration import CalibratedClassifierCV

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import RandomOverSampler

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import torch
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, DataCollatorWithPadding
)
from datasets import Dataset

# One shared seed for every random number generator this notebook uses
# (NumPy, the stdlib `random` module, TensorFlow and PyTorch).
RANDOM_SEED = 42
for _seed_fn in (np.random.seed, random.seed, tf.random.set_seed, torch.manual_seed):
    _seed_fn(RANDOM_SEED)

# Record the framework versions next to the results.
for _tag, _module in (("TF:", tf), ("Torch:", torch)):
    print(_tag, _module.__version__)

# ==============================#
#        Data Loading           #
# ==============================#

# Option A: direct path if already in the Colab VM (e.g., uploaded in left Files panel)
PREFERRED_PATHS = [
    '/content/Complete Data With Emoji.csv',         # typical Colab upload path
    '/content/drive/MyDrive/Complete Data With Emoji.csv',  # Google Drive common location
    '/content/Complete_Data_With_Emoji.csv',         # fallback naming
    '/content/dataset.csv'
]

# First candidate that exists on disk, or None when none of them do.
csv_path = next((p for p in PREFERRED_PATHS if os.path.exists(p)), None)

if csv_path is None:
    # Option B: interactive upload
    from google.colab import files
    try:
        up = files.upload()
    except TypeError as err:
        # NOTE(review): the captured run of this notebook ends with
        # "TypeError: 'NoneType' object is not subscriptable"; presumably it is
        # raised here when the upload widget returns no result -- confirm.
        raise RuntimeError(
            "Interactive upload failed. Upload the CSV through the Files panel "
            "(or mount Google Drive) so that it matches one of PREFERRED_PATHS, then rerun."
        ) from err
    if not up:
        # A cancelled dialog yields an empty mapping; fail with a clear message
        # instead of an IndexError on the first-key lookup.
        raise FileNotFoundError(
            "No file was uploaded and none of PREFERRED_PATHS exists."
        )
    csv_path = next(iter(up))  # first uploaded file

print("Using CSV:", csv_path)

# Read with robust encoding fallbacks
def read_csv_robust(path):
    """Load a CSV into a DataFrame, trying several text encodings in turn.

    Encodings are tried strictest first. ``latin-1`` assigns a character to
    every possible byte and therefore never fails to decode, so it must be the
    final catch-all; placing ``cp1252`` after it (as before) made that fallback
    unreachable.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame`` from the first encoding that works.

    Raises:
        UnicodeError or pandas.errors.ParserError: the last failure, if every
            encoding was rejected.
        Any other exception from ``pd.read_csv`` (e.g. a missing file)
        propagates immediately, since another encoding cannot fix it.
    """
    last_err = None
    for enc in ('utf-8', 'utf-8-sig', 'cp1252', 'latin-1'):
        try:
            return pd.read_csv(path, encoding=enc)
        except (UnicodeError, pd.errors.ParserError) as e:
            # Wrong encoding guess (or a parse failure caused by it): try the next one.
            last_err = e
    raise last_err

df = read_csv_robust(csv_path)
print("Shape:", df.shape)
# A bare `df.head(3)` is only rendered when it is the last expression of a
# cell; in the middle of this cell it must be displayed explicitly.
display(df.head(3))

# ==============================#
#   Column Auto-Detection       #
# ==============================#

TEXT_CANDIDATES  = ['text','tweet','comment','sentence','content','message','body','post','clean_text','raw_text']
LABEL_CANDIDATES = ['label','target','class','sentiment','category','y','labels','tag']

def autodetect_columns(frame):
    """Guess which columns of ``frame`` hold the text and the class label.

    Header names are first matched (case-insensitively, ignoring surrounding
    whitespace) against TEXT_CANDIDATES / LABEL_CANDIDATES, in list order.
    Anything still unresolved falls back to a heuristic:

    * label: the first integer column with few distinct values;
    * text: the string/object column with the longest average string length.

    The two fallbacks never select the column already chosen for the other
    role, so a column of long label names is not mistaken for the text.

    Args:
        frame: The loaded ``pandas.DataFrame``.

    Returns:
        ``(text_col, label_col)`` -- original column names; either may be
        ``None`` when nothing suitable is found.
    """
    # str() tolerates non-string headers (e.g. integer column names).
    cols_lower = {str(c).strip().lower(): c for c in frame.columns}
    text_col = next((cols_lower[c] for c in TEXT_CANDIDATES if c in cols_lower), None)
    label_col = next((cols_lower[c] for c in LABEL_CANDIDATES if c in cols_lower), None)

    # If label missing but a small integer-like column exists, guess it
    if label_col is None:
        max_classes = max(20, int(len(frame) * 0.05) + 2)
        for c in frame.columns:
            if c == text_col:
                continue
            if pd.api.types.is_integer_dtype(frame[c]) and frame[c].nunique() <= max_classes:
                label_col = c  # likely a label
                break

    # If text missing, pick the string column with the longest average length
    if text_col is None:
        obj_cols = [
            c for c in frame.columns
            if c != label_col
            and (frame[c].dtype == object or pd.api.types.is_string_dtype(frame[c]))
        ]
        if obj_cols:
            text_col = max(obj_cols, key=lambda c: frame[c].astype(str).str.len().mean())

    return text_col, label_col

TEXT_COL, LABEL_COL = autodetect_columns(df)
print("Detected TEXT_COL:", TEXT_COL, "| LABEL_COL:", LABEL_COL)

# Validate with explicit raises: `assert` statements are stripped when Python
# runs with -O, which would let a bad detection slip through silently.
if TEXT_COL is None:
    raise ValueError("Couldn't detect the text column. Please rename your text column to something like 'text' and rerun.")
if LABEL_COL is None:
    raise ValueError("Couldn't detect the label column. Please rename your label column to something like 'label' and rerun.")
if TEXT_COL == LABEL_COL:
    # Training on a column that is also the target would be meaningless.
    raise ValueError(
        f"Text and label columns both resolved to {TEXT_COL!r}. "
        "Please rename your columns to something like 'text' and 'label' and rerun."
    )

# Drop rows with missing
df = df.dropna(subset=[TEXT_COL, LABEL_COL]).reset_index(drop=True)

# ==============================#
#     Basic Preprocessing       #
# ==============================#

URL_RE   = re.compile(r'https?://\S+|www\.\S+')
USER_RE  = re.compile(r'@\w+')
HASH_RE  = re.compile(r'#(\w+)')
WS_RE    = re.compile(r'\s+')

def clean_text_keep_emojis(s: str) -> str:
    """Normalise one raw post while leaving emojis and punctuation in place.

    The value is coerced to ``str``; URLs and @mentions become a space;
    hashtags keep their word but lose the ``#``; the result is lower-cased and
    whitespace runs are collapsed to single spaces with the ends trimmed.
    """
    text = str(s)
    # Applied in this order, before lower-casing (the patterns are case-sensitive).
    for pattern, replacement in ((URL_RE, ' '), (USER_RE, ' '), (HASH_RE, r'\1')):
        text = pattern.sub(replacement, text)
    return WS_RE.sub(' ', text.lower()).strip()

df['clean'] = df[TEXT_COL].apply(clean_text_keep_emojis)

# Encode labels
le = LabelEncoder()
df['y'] = le.fit_transform(df[LABEL_COL])
# Keep class names as plain `str`. For a numeric label column `le.classes_`
# holds NumPy scalars, which are not valid report headings
# (classification_report's target_names are strings) and are not
# JSON-serialisable when the names are stored in a model config.
id2label = {i: str(lab) for i, lab in enumerate(le.classes_)}
label2id = {name: i for i, name in id2label.items()}
num_labels = len(le.classes_)
print("Classes:", le.classes_)

# Train/Test split (80/20, stratified on the encoded label)
train_df, test_df = train_test_split(
    df[['clean','y']], test_size=0.2, random_state=RANDOM_SEED, stratify=df['y']
)
train_df = train_df.reset_index(drop=True)
test_df  = test_df.reset_index(drop=True)

# ==============================#
#    Helper: Evaluation         #
# ==============================#

def evaluate_and_report(y_true, y_pred, model_name, labels_map):
    """Print evaluation results for one model and return its summary row.

    Prints accuracy, macro-F1, the per-class classification report and the
    confusion matrix.

    Args:
        y_true: Ground-truth class ids.
        y_pred: Predicted class ids, aligned with ``y_true``.
        model_name: Name used in the printed header and the returned row.
        labels_map: Mapping of class id -> display name.

    Returns:
        ``{"model": ..., "accuracy": ..., "macro_f1": ...}``.
    """
    # Pin the label set explicitly. Without `labels=`, scikit-learn infers it
    # from the ids present in y_true/y_pred, and a class missing from both
    # makes the number of target_names disagree with the inferred classes.
    label_ids = sorted(labels_map)
    target_names = [str(labels_map[i]) for i in label_ids]

    acc = accuracy_score(y_true, y_pred)
    f1m = f1_score(y_true, y_pred, average='macro')
    print(f"\n[{model_name}]  Accuracy: {acc:.4f}  |  Macro-F1: {f1m:.4f}")
    print(classification_report(
        y_true, y_pred, labels=label_ids, target_names=target_names, zero_division=0
    ))
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)
    print("Confusion Matrix:\n", cm)
    return {"model": model_name, "accuracy": acc, "macro_f1": f1m}

metrics_log = []  # one summary dict per evaluated model

# ==============================#
#   Classic ML (TF-IDF)        #
# ==============================#

# Cleaned text and encoded labels as arrays for the scikit-learn models.
X_train, y_train = train_df['clean'].values, train_df['y'].values
X_test, y_test = test_df['clean'].values, test_df['y'].values

# TF-IDF: mix word + char n-grams (works well for Roman Urdu + emojis)
tfidf = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3,5),
    min_df=2,
    max_features=200000,
)
tfidf_words = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1,2),
    min_df=2,
    max_features=200000,
)

# Both vectorizers are fitted on the training texts only; their outputs are
# placed side by side (a hand-rolled FeatureUnion).
from scipy.sparse import hstack
Xtr_char, Xte_char = tfidf.fit_transform(X_train), tfidf.transform(X_test)
Xtr_word, Xte_word = tfidf_words.fit_transform(X_train), tfidf_words.transform(X_test)
Xtr = hstack((Xtr_char, Xtr_word)).tocsr()
Xte = hstack((Xte_char, Xte_word)).tocsr()

# Optional: Address imbalance with RandomOverSampler on features
# (applied to the training matrix only; the test matrix is left untouched).
ros = RandomOverSampler(random_state=RANDOM_SEED)
Xtr_bal, ytr_bal = ros.fit_resample(Xtr, y_train)

# Every classic model below is fitted on the oversampled training matrix and
# scored on the untouched test matrix; each result row goes into metrics_log.

# 1) Multinomial Naive Bayes
nb = MultinomialNB(alpha=0.5).fit(Xtr_bal, ytr_bal)
pred_nb = nb.predict(Xte)
metrics_log.append(
    evaluate_and_report(y_test, pred_nb, "MultinomialNB (TF-IDF)", id2label)
)

# 2) Linear SVM (LinearSVC)
svm = LinearSVC().fit(Xtr_bal, ytr_bal)
pred_svm = svm.predict(Xte)
metrics_log.append(
    evaluate_and_report(y_test, pred_svm, "LinearSVC (TF-IDF)", id2label)
)

# 3) Logistic Regression (LBFGS, multinomial)
lr = LogisticRegression(max_iter=2000, n_jobs=-1).fit(Xtr_bal, ytr_bal)
pred_lr = lr.predict(Xte)
metrics_log.append(
    evaluate_and_report(y_test, pred_lr, "LogisticRegression (TF-IDF)", id2label)
)

# 4) Random Forest
rf = RandomForestClassifier(
    n_estimators=500, n_jobs=-1, random_state=RANDOM_SEED
).fit(Xtr_bal, ytr_bal)
pred_rf = rf.predict(Xte)
metrics_log.append(
    evaluate_and_report(y_test, pred_rf, "RandomForest (TF-IDF)", id2label)
)

# 5) (Optional) XGBoost on TF-IDF
# Best-effort step: the import, training and scoring all sit inside one `try`,
# so any failure only prints a message and the notebook carries on.
# `pred_xgb` is therefore defined only when this section succeeds; the
# prediction-saving code at the end of the notebook has to allow for that.
try:
    from xgboost import XGBClassifier
    xgb = XGBClassifier(
        n_estimators=600, max_depth=8, learning_rate=0.1, subsample=0.9, colsample_bytree=0.8,
        reg_lambda=1.0, objective='multi:softprob', num_class=num_labels, n_jobs=-1,
        random_state=RANDOM_SEED, tree_method='hist'
    )
    # Fitted on the same oversampled training matrix as the other classic models.
    xgb.fit(Xtr_bal, ytr_bal, verbose=False)
    pred_xgb = xgb.predict(Xte)
    metrics_log.append(evaluate_and_report(y_test, pred_xgb, "XGBoost (TF-IDF)", id2label))
except Exception as e:
    # Deliberately broad: XGBoost is optional, so no error here should abort the run.
    print("XGBoost skipped due to:", e)
# ==============================#
#   Deep Learning: BiLSTM      #
# ==============================#

# Tokenization for Keras (keep emojis)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_VOCAB = 60000
MAX_LEN   = 96

# filters='' switches off character stripping, so punctuation & emojis survive.
# The vocabulary is fitted on the training texts only.
tok = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>", filters='')
tok.fit_on_texts(train_df['clean'].tolist())

# Texts -> integer id sequences, for the train and test parts in turn.
Xtr_seq, Xte_seq = (
    tok.texts_to_sequences(part['clean'].tolist()) for part in (train_df, test_df)
)

# Pad / truncate at the end so every sequence has exactly MAX_LEN ids.
Xtr_pad, Xte_pad = (
    pad_sequences(seqs, maxlen=MAX_LEN, padding='post', truncating='post')
    for seqs in (Xtr_seq, Xte_seq)
)

# One-hot targets to match the categorical cross-entropy loss.
ytr, yte = (
    tf.keras.utils.to_categorical(part['y'], num_classes=num_labels)
    for part in (train_df, test_df)
)

# Class weights for imbalance: total / (num_labels * class_count).
class_counts = Counter(train_df['y'].tolist())
total = sum(class_counts.values())
class_weight = {}
for cls, cnt in class_counts.items():
    class_weight[cls] = total/(num_labels*cnt)
print("Class weights (BiLSTM):", class_weight)

def build_bilstm_model(vocab_size, num_labels, max_len=MAX_LEN, emb_dim=128, lstm_units=128, rate=0.3):
    """Build and compile the bidirectional-LSTM text classifier.

    Layer stack: Embedding -> SpatialDropout1D -> Bidirectional(LSTM returning
    the full sequence) -> GlobalMaxPooling1D -> Dropout -> softmax Dense.

    Args:
        vocab_size: Number of rows in the embedding table.
        num_labels: Number of output classes (softmax width).
        max_len: Length of each integer input sequence.
        emb_dim: Embedding vector size.
        lstm_units: Units per LSTM direction.
        rate: Dropout rate used by both dropout layers.

    Returns:
        A compiled ``keras.Model`` (Adam at 2e-3, categorical cross-entropy,
        accuracy metric).
    """
    inputs = keras.Input(shape=(max_len,), dtype='int32')
    # `input_length` is no longer passed: the sequence length is already fixed
    # by the Input layer, and the argument is deprecated in Keras 3.
    x = layers.Embedding(vocab_size, emb_dim, mask_zero=False)(inputs)
    x = layers.SpatialDropout1D(rate)(x)
    x = layers.Bidirectional(layers.LSTM(lstm_units, return_sequences=True))(x)
    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dropout(rate)(x)
    outputs = layers.Dense(num_labels, activation='softmax')(x)
    model = keras.Model(inputs, outputs)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=2e-3),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

# Embedding table size: capped at MAX_VOCAB, or the fitted vocabulary size + 1
# when that is smaller.
bilstm = build_bilstm_model(vocab_size=min(MAX_VOCAB, len(tok.word_index)+1), num_labels=num_labels)
bilstm.summary()

# Stop early and halve the learning rate when validation loss stalls.
cb = [
    keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2)
]

hist = bilstm.fit(
    Xtr_pad, ytr,
    validation_split=0.15,
    epochs=10,
    batch_size=128,
    class_weight=class_weight,
    callbacks=cb,  # previously built but never passed, so neither callback ran
    verbose=1,
)
pred_bilstm = bilstm.predict(Xte_pad, batch_size=256)
pred_bilstm_lbl = pred_bilstm.argmax(axis=1)  # class probabilities -> predicted class id
metrics_log.append(evaluate_and_report(test_df['y'].values, pred_bilstm_lbl, "BiLSTM (Keras)", id2label))

# ==============================#
#  Deep Learning: DistilBERT   #
# ==============================#

MODEL_NAME = "distilbert-base-uncased"  # English baseline; works reasonably with Roman Urdu + emojis
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Build HuggingFace Dataset
# Both splits get the same column renaming: clean -> text, y -> label.
_hf_column_names = {"clean":"text", "y":"label"}
hf_train, hf_test = (
    Dataset.from_pandas(part.rename(columns=_hf_column_names))
    for part in (train_df, test_df)
)

def tok_func(batch):
    """Tokenize the ``"text"`` field of a batch with the module-level tokenizer.

    Sequences are truncated to at most 128 tokens and left unpadded here.
    """
    encode_options = dict(truncation=True, padding=False, max_length=128)
    return tokenizer(batch["text"], **encode_options)

# Columns kept for training; every other column is dropped after tokenization.
_MODEL_INPUTS = ['input_ids','attention_mask','label']


def _tokenize_for_torch(ds):
    """Tokenize a dataset, keep only the model inputs and emit torch tensors."""
    ds = ds.map(tok_func, batched=True)
    extra_columns = [c for c in ds.column_names if c not in _MODEL_INPUTS]
    ds = ds.remove_columns(extra_columns)
    # Set format for PyTorch
    ds.set_format(type='torch')
    return ds


hf_train = _tokenize_for_torch(hf_train)
hf_test = _tokenize_for_torch(hf_test)

# id <-> name mappings stored in the model config.
id2label_hf = dict((i, id2label[i]) for i in range(num_labels))
label2id_hf = {name: idx for idx, name in id2label_hf.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label_hf,
    label2id=label2id_hf,
)

# Pads each batch at collation time, since tok_func leaves sequences unpadded.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into accuracy and macro-F1.

    The predicted class is the argmax of the logits along axis 1.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, predicted),
        "macro_f1": f1_score(labels, predicted, average='macro'),
    }

import inspect

# This notebook installs transformers with `-U`, so the installed version is
# not fixed. Newer releases renamed `evaluation_strategy` to `eval_strategy`
# (TrainingArguments) and `tokenizer` to `processing_class` (Trainer). Choose
# whichever keyword the installed version accepts instead of hard-coding one.
_ta_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_key = "eval_strategy" if "eval_strategy" in _ta_params else "evaluation_strategy"
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tok_key = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

training_args = TrainingArguments(
    output_dir="/content/bert_out",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    report_to="none",  # disable wandb by default
    **{_eval_key: "epoch"},  # evaluate once per epoch, matching save_strategy
)

# NOTE(review): the test split doubles as eval_dataset, so the "best"
# checkpoint is selected on the same data that is reported below. Consider
# carving a validation split out of hf_train for checkpoint selection.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_train,
    eval_dataset=hf_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tok_key: tokenizer},
)

trainer.train()
eval_res = trainer.evaluate()
print("DistilBERT Eval:", eval_res)

# Predictions & full report
bert_preds = np.argmax(trainer.predict(hf_test).predictions, axis=1)
metrics_log.append(evaluate_and_report(test_df['y'].values, bert_preds, "DistilBERT", id2label))

# ==============================#
#       Save All Outputs        #
# ==============================#

# Metrics table: one row per model, best macro-F1 first.
metrics_csv_path = "/content/model_metrics.csv"
metrics_df = pd.DataFrame(metrics_log)
metrics_df = metrics_df.sort_values(by="macro_f1", ascending=False)
metrics_df.to_csv(metrics_csv_path, index=False)
print("\nSaved metrics to:", metrics_csv_path)
display(metrics_df)

# Per-model predictions
out_dir = "/content/preds"
os.makedirs(out_dir, exist_ok=True)

def save_predictions(name, y_pred):
    """Write one model's test-set predictions to ``<out_dir>/<name>_preds.csv``.

    Spaces in ``name`` become underscores in the file name. The CSV holds the
    cleaned text, the true label name and the predicted label name.
    """
    file_stem = name.replace(' ','_')
    target = os.path.join(out_dir, f"{file_stem}_preds.csv")

    # assign() returns a new frame, so test_df itself is never modified.
    scored = test_df.assign(pred_id=y_pred)
    scored = scored.assign(
        pred_label=scored['pred_id'].map(id2label),
        true_label=scored['y'].map(id2label),
    )
    scored[[ 'clean','true_label','pred_label' ]].to_csv(target, index=False)
    print("Saved:", target)

save_predictions("MultinomialNB", pred_nb)
save_predictions("LinearSVC", pred_svm)
save_predictions("LogisticRegression", pred_lr)
save_predictions("RandomForest", pred_rf)
# XGBoost is optional: `pred_xgb` exists only if that section ran successfully.
# Check for it explicitly rather than using a bare `except: pass`, which would
# also hide genuine failures such as a file-write error.
if 'pred_xgb' in globals():
    save_predictions("XGBoost", pred_xgb)
else:
    print("XGBoost predictions not available; skipping.")
save_predictions("BiLSTM", pred_bilstm_lbl)
save_predictions("DistilBERT", bert_preds)

print("\nDone. Check /content/model_metrics.csv and /content/preds/*.csv")


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m58.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m620.7/620.7 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-decision-forests 1.12.0 requires tensorflow==2.19.0, but you have tensorflow 2.20.0 which

TypeError: 'NoneType' object is not subscriptable