<a href="https://colab.research.google.com/github/jokefun022/Google-Drive/blob/main/Copy_of_22_08_2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gensim



In [None]:
import argparse
# from Model.BiLSTM_Attention import run_bilstm # This import is not needed as run_bilstm is defined in a previous cell
# from Run_ML import run_all_ml # This import might also cause an error if Run_ML is not available

def parse_args(argv=None):
    """Build the experiment option parser and parse ``argv``.

    Args:
        argv: Optional sequence of command-line tokens. When omitted (or
            ``None``) an empty list is parsed, so every option takes its
            default and the notebook kernel's own ``sys.argv`` is never read.

    Returns:
        argparse.Namespace with ``model``, ``data_path``, ``text_col``,
        ``label_col``, the deep-learning options (``epochs``, ``batch_size``,
        ``max_len``, ``vocab_size``, ``embedding_dim``, ``lstm_units``,
        ``lower``) and the classical-ML option ``use_emoji``.
    """
    def str_to_bool(s):
        # "true"/"1"/"yes" (any case) are True; every other string is False.
        return s.lower() in ["true", "1", "yes"]

    p = argparse.ArgumentParser()
    # Not required: falls back to the classical-ML pipeline.
    p.add_argument("--model", choices=["bilstm", "ml"], required=False, default="ml")
    p.add_argument("--data_path", type=str, default="/content/Complete Data With Emoji.csv")
    p.add_argument("--text_col", type=str, default="Tweet_Text_With_Emoji")
    p.add_argument("--label_col", type=str, default="Label")
    # DL params
    p.add_argument("--epochs", type=int, default=5)
    p.add_argument("--batch_size", type=int, default=64)
    p.add_argument("--max_len", type=int, default=80)
    p.add_argument("--vocab_size", type=int, default=30000)
    p.add_argument("--embedding_dim", type=int, default=128)
    p.add_argument("--lstm_units", type=int, default=64)
    p.add_argument("--lower", type=str_to_bool, default=True)
    # ML params
    p.add_argument("--use_emoji", type=str_to_bool, default=True)
    # An explicit list keeps argparse away from the notebook kernel's argv.
    return p.parse_args([] if argv is None else list(argv))

# Entry point: parse the default options and dispatch to the selected pipeline.
# run_bilstm and run_all_ml are not defined in this cell; they must already
# exist in the notebook's global namespace when this runs.
if __name__ == "__main__":
    args = parse_args()
    if args.model == "bilstm":
        run_bilstm(args)
    else:
        run_all_ml(args)


===== LogisticRegression =====
Accuracy: 0.9630
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1211
           1       0.94      0.92      0.93       145
           2       0.96      0.94      0.95       628
           3       0.86      0.95      0.90        80
           4       0.94      0.93      0.94        71

    accuracy                           0.96      2135
   macro avg       0.94      0.94      0.94      2135
weighted avg       0.96      0.96      0.96      2135


===== SGDClassifier =====
Accuracy: 0.9803
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1211
           1       0.96      0.94      0.95       145
           2       0.98      0.97      0.97       628
           3       0.94      0.96      0.95        80
           4       0.94      0.96      0.95        71

    accuracy                           0.98      2135
   macro avg       0.96      0.97     

In [None]:
import argparse, pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from scipy.sparse import hstack
import re

# Emoji detection: a single character class spanning three Unicode ranges.
EMOJI_REGEX = re.compile(r'[\U0001F100-\U0001FAFF\U00002700-\U000027BF\U00002600-\U000026FF]')


def extract_emojis(s: str):
    """Return every character of ``s`` matched by EMOJI_REGEX, space-separated.

    Matches keep their original order; the result is an empty string when
    nothing matches.
    """
    matches = (hit.group(0) for hit in EMOJI_REGEX.finditer(s))
    return " ".join(matches)

def emoji_fit_transform(train_texts, test_texts):
    """Vectorise the emojis of both splits with a TF-IDF fitted on train only.

    Each text is first reduced to its emoji string via ``extract_emojis``.
    A character-level TF-IDF (1- to 3-grams, ``min_df=1``) is fitted on the
    training strings and then applied to the test strings.

    Returns:
        ``(train_matrix, test_matrix, fitted_vectorizer)``.
    """
    train_docs, test_docs = (
        [extract_emojis(text) for text in corpus]
        for corpus in (train_texts, test_texts)
    )
    vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(1, 3), min_df=1)
    train_matrix = vectorizer.fit_transform(train_docs)
    test_matrix = vectorizer.transform(test_docs)
    return train_matrix, test_matrix, vectorizer

def build_emoji_tfidf(train_texts, test_texts):
    """Return only the ``(train, test)`` emoji TF-IDF matrices.

    Delegates to ``emoji_fit_transform`` and discards the fitted vectorizer.
    """
    train_matrix, test_matrix = emoji_fit_transform(train_texts, test_texts)[:2]
    return train_matrix, test_matrix

def build_text_features():
    """Create an unfitted FeatureUnion of word-level and character-level TF-IDF.

    * ``"w"``: word 1- to 2-grams; the token pattern keeps word runs and also
      emits each non-word, non-space character as its own token.
    * ``"c"``: ``char_wb`` 2- to 5-grams.

    Both vectorizers use ``min_df=2``.
    """
    return FeatureUnion([
        (
            "w",
            TfidfVectorizer(
                analyzer="word",
                ngram_range=(1, 2),
                token_pattern=r"(?u)\b\w+\b|[^\w\s]",
                min_df=2,
            ),
        ),
        (
            "c",
            TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 5), min_df=2),
        ),
    ])

def run_all_ml(args):
    """Train and evaluate a suite of classical classifiers on TF-IDF features.

    Reads the CSV at ``args.data_path``, drops rows missing the text or label
    column, label-encodes the targets and makes a stratified 80/20 split.
    Word + character TF-IDF features are fitted on the training texts and,
    when ``args.use_emoji`` is truthy, concatenated with emoji TF-IDF
    features. Every model is then fitted and its accuracy and classification
    report are printed, followed by a summary sorted by accuracy.

    Args:
        args: Namespace-like object providing ``data_path``, ``text_col``,
            ``label_col`` and ``use_emoji``.

    Returns:
        None. All results are printed.
    """
    df = pd.read_csv(args.data_path, encoding="utf-8")
    df = df.dropna(subset=[args.text_col, args.label_col]).reset_index(drop=True)

    X_text = df[args.text_col].astype(str)
    y_raw = df[args.label_col].astype(str)

    le = LabelEncoder()
    y = le.fit_transform(y_raw)

    X_train_txt, X_test_txt, y_train, y_test = train_test_split(
        X_text, y, test_size=0.2, random_state=42, stratify=y
    )

    # Text TF-IDF (fitted on the training split only)
    text_fu = build_text_features()
    X_train_text = text_fu.fit_transform(X_train_txt)
    X_test_text = text_fu.transform(X_test_txt)

    # Emoji features (optional)
    if args.use_emoji:
        X_train_emoji, X_test_emoji = build_emoji_tfidf(X_train_txt, X_test_txt)
        # hstack yields COO; convert to CSR once instead of letting every
        # estimator below redo the conversion.
        X_train = hstack([X_train_text, X_train_emoji]).tocsr()
        X_test = hstack([X_test_text, X_test_emoji]).tocsr()
    else:
        X_train, X_test = X_train_text, X_test_text

    models = {
        "LogisticRegression": LogisticRegression(max_iter=1000, n_jobs=None),
        # Seeded like the other stochastic models so repeated runs report the
        # same scores.
        "SGDClassifier": SGDClassifier(max_iter=2000, tol=1e-3, random_state=42),
        "GaussianNB": GaussianNB(),  # requires dense
        "KNeighbors": KNeighborsClassifier(n_neighbors=5),
        "RandomForest": RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1),
        "AdaBoost": AdaBoostClassifier(n_estimators=300, random_state=42),
        "Bagging": BaggingClassifier(n_estimators=300, random_state=42, n_jobs=-1),
        "ExtraTrees": ExtraTreesClassifier(n_estimators=400, random_state=42, n_jobs=-1),
        "GradientBoosting": GradientBoostingClassifier(random_state=42)
    }

    results = {}
    for name, model in models.items():
        print(f"\n===== {name} =====")
        if name == "GaussianNB":
            # GaussianNB cannot consume sparse input, so densify for it alone.
            model.fit(X_train.toarray(), y_train)
            y_pred = model.predict(X_test.toarray())
        else:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        print(f"Accuracy: {acc:.4f}")
        print(classification_report(y_test, y_pred, target_names=le.classes_))
        results[name] = acc

    print("\n==== Summary Accuracies ====")
    for k, v in sorted(results.items(), key=lambda kv: kv[1], reverse=True):
        print(f"{k}: {v:.4f}")

# Build a parser carrying only the options run_all_ml reads, then run the
# classical-ML pipeline with every option at its default.
ap = argparse.ArgumentParser()
ap.add_argument("--data_path", type=str, default="/content/Complete Data With Emoji.csv")
ap.add_argument("--text_col", type=str, default="Tweet_Text_With_Emoji")
ap.add_argument("--label_col", type=str, default="Label")
# "true"/"1"/"yes" (any case) parse as True; any other string parses as False.
ap.add_argument("--use_emoji", type=lambda s: s.lower() in ["true","1","yes"], default=True)
args = ap.parse_args([]) # Pass an empty list to avoid argparse issues in Colab
run_all_ml(args)


===== LogisticRegression =====
Accuracy: 0.9630
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1211
           1       0.94      0.92      0.93       145
           2       0.96      0.94      0.95       628
           3       0.86      0.95      0.90        80
           4       0.94      0.93      0.94        71

    accuracy                           0.96      2135
   macro avg       0.94      0.94      0.94      2135
weighted avg       0.96      0.96      0.96      2135


===== SGDClassifier =====
Accuracy: 0.9799
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1211
           1       0.96      0.94      0.95       145
           2       0.98      0.97      0.97       628
           3       0.93      0.96      0.94        80
           4       0.94      0.94      0.94        71

    accuracy                           0.98      2135
   macro avg       0.96      0.96     

In [None]:
import re, pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Emoji detection: a single character class spanning three Unicode ranges.
EMOJI_REGEX = re.compile(r'[\U0001F100-\U0001FAFF\U00002700-\U000027BF\U00002600-\U000026FF]')


def extract_emojis(s: str):
    """Return every character of ``s`` matched by EMOJI_REGEX, space-separated.

    Matches keep their original order; the result is an empty string when
    nothing matches.
    """
    matches = (hit.group(0) for hit in EMOJI_REGEX.finditer(s))
    return " ".join(matches)

def fit_transform(train_texts, test_texts):
    """Vectorise the emojis of both splits with a TF-IDF fitted on train only.

    Each text is first reduced to its emoji string via ``extract_emojis``.
    A character-level TF-IDF (1- to 3-grams, ``min_df=1``) is fitted on the
    training strings and then applied to the test strings.

    Returns:
        ``(train_matrix, test_matrix, fitted_vectorizer)``.
    """
    train_docs, test_docs = (
        [extract_emojis(text) for text in corpus]
        for corpus in (train_texts, test_texts)
    )
    vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(1, 3), min_df=1)
    train_matrix = vectorizer.fit_transform(train_docs)
    test_matrix = vectorizer.transform(test_docs)
    return train_matrix, test_matrix, vectorizer


In [None]:
import re, pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from scipy.sparse import hstack

# Emoji detection: a single character class spanning three Unicode ranges.
EMOJI_REGEX = re.compile(r'[\U0001F100-\U0001FAFF\U00002700-\U000027BF\U00002600-\U000026FF]')


def extract_emojis(s: str):
    """Return every character of ``s`` matched by EMOJI_REGEX, space-separated.

    Matches keep their original order; the result is an empty string when
    nothing matches.
    """
    matches = (hit.group(0) for hit in EMOJI_REGEX.finditer(s))
    return " ".join(matches)

def fit_transform(train_texts, test_texts):
    """Vectorise the emojis of both splits with a TF-IDF fitted on train only.

    Each text is first reduced to its emoji string via ``extract_emojis``.
    A character-level TF-IDF (1- to 3-grams, ``min_df=1``) is fitted on the
    training strings and then applied to the test strings.

    Returns:
        ``(train_matrix, test_matrix, fitted_vectorizer)``.
    """
    train_docs, test_docs = (
        [extract_emojis(text) for text in corpus]
        for corpus in (train_texts, test_texts)
    )
    vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(1, 3), min_df=1)
    train_matrix = vectorizer.fit_transform(train_docs)
    test_matrix = vectorizer.transform(test_docs)
    return train_matrix, test_matrix, vectorizer

def build_text_union():
    """Create an unfitted FeatureUnion of word-level and character-level TF-IDF.

    * ``"w"``: word 1- to 2-grams; the token pattern keeps word runs and also
      emits each non-word, non-space character as its own token.
    * ``"c"``: ``char_wb`` 2- to 5-grams.

    Both vectorizers use ``min_df=2``.
    """
    return FeatureUnion([
        (
            "w",
            TfidfVectorizer(
                analyzer="word",
                ngram_range=(1, 2),
                token_pattern=r"(?u)\b\w+\b|[^\w\s]",
                min_df=2,
            ),
        ),
        (
            "c",
            TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 5), min_df=2),
        ),
    ])

def build_emoji_tfidf(train_texts, test_texts):
    """Return only the ``(train, test)`` emoji TF-IDF matrices.

    Delegates to ``fit_transform`` and discards the fitted vectorizer.
    """
    train_matrix, test_matrix = fit_transform(train_texts, test_texts)[:2]
    return train_matrix, test_matrix

def concat_features(X_text_train, X_text_test, X_emoji_train=None, X_emoji_test=None):
    """Column-stack text features with emoji features when both are supplied.

    Args:
        X_text_train: Text feature matrix for the training split.
        X_text_test: Text feature matrix for the test split.
        X_emoji_train: Optional emoji feature matrix for the training split.
        X_emoji_test: Optional emoji feature matrix for the test split.

    Returns:
        ``(train, test)``. Each is the horizontal stack of text and emoji
        features when both emoji matrices are given; otherwise the text
        matrices are returned unchanged.
    """
    if X_emoji_train is not None and X_emoji_test is not None:
        return hstack([X_text_train, X_emoji_train]), hstack([X_text_test, X_emoji_test])
    # Without a complete emoji pair, pass the text features straight through.
    return X_text_train, X_text_test

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

class Attention(layers.Layer):
    """Attention pooling that collapses the time axis of a sequence.

    A single-unit ``Dense`` layer scores each timestep, the ``tanh`` of those
    scores is softmax-normalised along the time axis, and the weighted sum of
    the inputs over time is returned. When Keras supplies a timestep mask
    (e.g. from ``Embedding(mask_zero=True)``), masked positions get zero
    weight and the remaining weights are renormalised.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Declare mask awareness so an incoming mask is consumed by call().
        self.supports_masking = True
        self.dense = layers.Dense(1)

    def call(self, inputs, mask=None):
        # inputs: (batch, time, feat); mask, when given: (batch, time)
        scores = tf.nn.tanh(self.dense(inputs))
        weights = tf.nn.softmax(scores, axis=1)
        if mask is not None:
            # Zero the weight of padded steps, then renormalise over the real
            # ones. divide_no_nan yields 0 for rows that are entirely masked.
            keep = tf.expand_dims(tf.cast(mask, weights.dtype), axis=-1)
            weights = weights * keep
            weights = tf.math.divide_no_nan(
                weights, tf.reduce_sum(weights, axis=1, keepdims=True)
            )
        context = tf.reduce_sum(weights * inputs, axis=1)
        return context

    def compute_mask(self, inputs, mask=None):
        # The time axis is reduced away, so there is no mask to pass on.
        return None

def run_bilstm(args):
    """Train a BiLSTM + attention text classifier and print its test metrics.

    Loads the CSV at ``args.data_path``, drops rows missing the text or label
    column, optionally lower-cases the text, label-encodes the targets and
    makes a stratified 80/20 split. A tokenizer fitted on the training texts
    produces post-padded/post-truncated id sequences of length
    ``args.max_len``. The network is Embedding (``mask_zero=True``) ->
    bidirectional LSTM -> ``Attention`` -> Dropout(0.3) -> output layer
    (one sigmoid unit for two classes, otherwise a softmax over all classes).

    Args:
        args: Namespace-like object providing ``data_path``, ``text_col``,
            ``label_col``, ``lower``, ``vocab_size``, ``max_len``,
            ``embedding_dim``, ``lstm_units``, ``epochs`` and ``batch_size``.

    Returns:
        None. The model summary, training log, accuracy and classification
        report are printed.
    """
    frame = pd.read_csv(args.data_path, encoding="utf-8")
    frame = frame.dropna(subset=[args.text_col, args.label_col]).reset_index(drop=True)

    texts = frame[args.text_col].astype(str)
    if args.lower:
        texts = texts.str.lower()

    encoder = LabelEncoder()
    labels = encoder.fit_transform(frame[args.label_col].astype(str))
    n_classes = len(encoder.classes_)
    is_binary = n_classes == 2

    train_texts, test_texts, train_labels, test_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42, stratify=labels
    )

    # Vocabulary comes from the training split only.
    tok = Tokenizer(num_words=args.vocab_size, lower=args.lower, filters="")
    tok.fit_on_texts(train_texts)

    def encode(batch):
        # Texts -> id sequences -> fixed-length, post-padded/truncated arrays.
        return pad_sequences(
            tok.texts_to_sequences(batch),
            maxlen=args.max_len,
            padding="post",
            truncating="post",
        )

    train_ids = encode(train_texts)
    test_ids = encode(test_texts)

    embedding_rows = min(args.vocab_size, len(tok.word_index) + 1)

    inputs = layers.Input(shape=(args.max_len,))
    hidden = layers.Embedding(embedding_rows, args.embedding_dim, mask_zero=True)(inputs)
    hidden = layers.Bidirectional(layers.LSTM(args.lstm_units, return_sequences=True))(hidden)
    hidden = Attention()(hidden)
    hidden = layers.Dropout(0.3)(hidden)
    if is_binary:
        head, loss_name = layers.Dense(1, activation="sigmoid"), "binary_crossentropy"
    else:
        head = layers.Dense(n_classes, activation="softmax")
        loss_name = "sparse_categorical_crossentropy"
    outputs = head(hidden)

    net = models.Model(inputs, outputs)
    net.compile(optimizer="adam", loss=loss_name, metrics=["accuracy"])
    net.summary()

    net.fit(
        train_ids,
        train_labels,
        validation_split=0.1,
        epochs=args.epochs,
        batch_size=args.batch_size,
        verbose=2,
    )

    probabilities = net.predict(test_ids, verbose=0)
    if is_binary:
        predicted = (probabilities.ravel() >= 0.5).astype(int)
    else:
        predicted = probabilities.argmax(axis=1)

    from sklearn.metrics import accuracy_score, classification_report
    score = accuracy_score(test_labels, predicted)
    print(f"[BiLSTM+Attention] Accuracy: {score:.4f}")
    print(classification_report(test_labels, predicted, target_names=encoder.classes_))


In [None]:
import re, pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from scipy.sparse import hstack

# Emoji detection: a single character class spanning three Unicode ranges.
EMOJI_REGEX = re.compile(r'[\U0001F100-\U0001FAFF\U00002700-\U000027BF\U00002600-\U000026FF]')


def extract_emojis(s: str):
    """Return every character of ``s`` matched by EMOJI_REGEX, space-separated.

    Matches keep their original order; the result is an empty string when
    nothing matches.
    """
    matches = (hit.group(0) for hit in EMOJI_REGEX.finditer(s))
    return " ".join(matches)

def fit_transform(train_texts, test_texts):
    """Vectorise the emojis of both splits with a TF-IDF fitted on train only.

    Each text is first reduced to its emoji string via ``extract_emojis``.
    A character-level TF-IDF (1- to 3-grams, ``min_df=1``) is fitted on the
    training strings and then applied to the test strings.

    Returns:
        ``(train_matrix, test_matrix, fitted_vectorizer)``.
    """
    train_docs, test_docs = (
        [extract_emojis(text) for text in corpus]
        for corpus in (train_texts, test_texts)
    )
    vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(1, 3), min_df=1)
    train_matrix = vectorizer.fit_transform(train_docs)
    test_matrix = vectorizer.transform(test_docs)
    return train_matrix, test_matrix, vectorizer

def build_text_union():
    """Create an unfitted FeatureUnion of word-level and character-level TF-IDF.

    * ``"w"``: word 1- to 2-grams; the token pattern keeps word runs and also
      emits each non-word, non-space character as its own token.
    * ``"c"``: ``char_wb`` 2- to 5-grams.

    Both vectorizers use ``min_df=2``.
    """
    return FeatureUnion([
        (
            "w",
            TfidfVectorizer(
                analyzer="word",
                ngram_range=(1, 2),
                token_pattern=r"(?u)\b\w+\b|[^\w\s]",
                min_df=2,
            ),
        ),
        (
            "c",
            TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 5), min_df=2),
        ),
    ])

def build_emoji_tfidf(train_texts, test_texts):
    """Return only the ``(train, test)`` emoji TF-IDF matrices.

    Delegates to ``fit_transform`` and discards the fitted vectorizer.
    """
    train_matrix, test_matrix = fit_transform(train_texts, test_texts)[:2]
    return train_matrix, test_matrix

def concat_features(X_text_train, X_text_test, X_emoji_train=None, X_emoji_test=None):
    """Column-stack text features with emoji features when both are supplied.

    Returns:
        ``(train, test)``. If either emoji matrix is ``None`` the text
        matrices are returned unchanged; otherwise each split is the
        horizontal stack of its text and emoji matrices.
    """
    if X_emoji_train is None or X_emoji_test is None:
        return X_text_train, X_text_test

    stacked_train = hstack([X_text_train, X_emoji_train])
    stacked_test = hstack([X_text_test, X_emoji_test])
    return stacked_train, stacked_test

In [None]:
import re, pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Emoji detection: a single character class spanning three Unicode ranges.
EMOJI_REGEX = re.compile(r'[\U0001F100-\U0001FAFF\U00002700-\U000027BF\U00002600-\U000026FF]')


def extract_emojis(s: str):
    """Return every character of ``s`` matched by EMOJI_REGEX, space-separated.

    Matches keep their original order; the result is an empty string when
    nothing matches.
    """
    matches = (hit.group(0) for hit in EMOJI_REGEX.finditer(s))
    return " ".join(matches)

def fit_transform(train_texts, test_texts):
    """Vectorise the emojis of both splits with a TF-IDF fitted on train only.

    Each text is first reduced to its emoji string via ``extract_emojis``.
    A character-level TF-IDF (1- to 3-grams, ``min_df=1``) is fitted on the
    training strings and then applied to the test strings.

    Returns:
        ``(train_matrix, test_matrix, fitted_vectorizer)``.
    """
    train_docs, test_docs = (
        [extract_emojis(text) for text in corpus]
        for corpus in (train_texts, test_texts)
    )
    vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(1, 3), min_df=1)
    train_matrix = vectorizer.fit_transform(train_docs)
    test_matrix = vectorizer.transform(test_docs)
    return train_matrix, test_matrix, vectorizer


In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from scipy.sparse import hstack

# Emoji detection: a single character class spanning three Unicode ranges.
EMOJI_REGEX = re.compile(r'[\U0001F100-\U0001FAFF\U00002700-\U000027BF\U00002600-\U000026FF]')


def extract_emojis(s: str):
    """Return every character of ``s`` matched by EMOJI_REGEX, space-separated.

    Matches keep their original order; the result is an empty string when
    nothing matches.
    """
    matches = (hit.group(0) for hit in EMOJI_REGEX.finditer(s))
    return " ".join(matches)

def fit_transform(train_texts, test_texts):
    """Vectorise the emojis of both splits with a TF-IDF fitted on train only.

    Each text is first reduced to its emoji string via ``extract_emojis``.
    A character-level TF-IDF (1- to 3-grams, ``min_df=1``) is fitted on the
    training strings and then applied to the test strings.

    Returns:
        ``(train_matrix, test_matrix, fitted_vectorizer)``.
    """
    train_docs, test_docs = (
        [extract_emojis(text) for text in corpus]
        for corpus in (train_texts, test_texts)
    )
    vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(1, 3), min_df=1)
    train_matrix = vectorizer.fit_transform(train_docs)
    test_matrix = vectorizer.transform(test_docs)
    return train_matrix, test_matrix, vectorizer

def build_text_union():
    """Create an unfitted FeatureUnion of word-level and character-level TF-IDF.

    * ``"w"``: word 1- to 2-grams; the token pattern keeps word runs and also
      emits each non-word, non-space character as its own token.
    * ``"c"``: ``char_wb`` 2- to 5-grams.

    Both vectorizers use ``min_df=2``.
    """
    return FeatureUnion([
        (
            "w",
            TfidfVectorizer(
                analyzer="word",
                ngram_range=(1, 2),
                token_pattern=r"(?u)\b\w+\b|[^\w\s]",
                min_df=2,
            ),
        ),
        (
            "c",
            TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 5), min_df=2),
        ),
    ])

def build_emoji_tfidf(train_texts, test_texts):
    """Return only the ``(train, test)`` emoji TF-IDF matrices.

    Delegates to ``fit_transform`` and discards the fitted vectorizer.
    """
    train_matrix, test_matrix = fit_transform(train_texts, test_texts)[:2]
    return train_matrix, test_matrix

def concat_features(X_text_train, X_text_test, X_emoji_train=None, X_emoji_test=None):
    """Column-stack text features with emoji features when both are supplied.

    Returns:
        ``(train, test)``. If either emoji matrix is ``None`` the text
        matrices are returned unchanged; otherwise each split is the
        horizontal stack of its text and emoji matrices.
    """
    if X_emoji_train is None or X_emoji_test is None:
        return X_text_train, X_text_test

    stacked_train = hstack([X_text_train, X_emoji_train])
    stacked_test = hstack([X_text_test, X_emoji_test])
    return stacked_train, stacked_test

def prepare_features(train_texts, test_texts, use_emoji=True):
    """Build the train/test feature matrices for the classical models.

    The word + character TF-IDF union is fitted on ``train_texts`` and applied
    to ``test_texts``. When ``use_emoji`` is truthy, emoji TF-IDF features are
    built as well and combined with the text features via ``concat_features``.

    Returns:
        ``(train_features, test_features)``.
    """
    union = build_text_union()
    text_train = union.fit_transform(train_texts)
    text_test = union.transform(test_texts)

    if not use_emoji:
        return text_train, text_test

    emoji_train, emoji_test = build_emoji_tfidf(train_texts, test_texts)
    return concat_features(text_train, text_test, emoji_train, emoji_test)

In [None]:
!pip install gensim
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

def train_word2vec(texts, vector_size=100, window=5, min_count=2):
    """Train a gensim Word2Vec model on whitespace-tokenised ``texts``.

    Args:
        texts: Iterable of strings; each is split on whitespace to form one
            training sentence.
        vector_size: Forwarded to ``Word2Vec`` as ``vector_size``.
        window: Forwarded to ``Word2Vec`` as ``window``.
        min_count: Forwarded to ``Word2Vec`` as ``min_count``.

    Returns:
        The trained ``Word2Vec`` model (trained with ``workers=4``).
    """
    tokenized = [document.split() for document in texts]
    return Word2Vec(
        sentences=tokenized,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=4,
    )

def build_embedding_matrix(tokenizer: Tokenizer, w2v_model, embedding_dim=100):
    """Build an embedding matrix indexed by the tokenizer's word ids.

    The matrix has ``len(tokenizer.word_index) + 1`` rows and
    ``embedding_dim`` columns. It starts as Gaussian noise (``scale=0.6``);
    each row whose word is present in ``w2v_model.wv`` is then overwritten
    with that word's vector. Other rows keep their random values.

    Returns:
        The NumPy embedding matrix.
    """
    n_rows = len(tokenizer.word_index) + 1
    matrix = np.random.normal(scale=0.6, size=(n_rows, embedding_dim))
    vectors = w2v_model.wv
    for token, row in tokenizer.word_index.items():
        if token not in vectors:
            continue
        matrix[row] = vectors[token]
    return matrix



In [None]:
import argparse
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from scipy.sparse import hstack
import re

# Definitions from cell Jq-vOSFdFRg1
# Emoji detection: a single character class spanning three Unicode ranges.
EMOJI_REGEX = re.compile(r'[\U0001F100-\U0001FAFF\U00002700-\U000027BF\U00002600-\U000026FF]')


def extract_emojis(s: str):
    """Return every character of ``s`` matched by EMOJI_REGEX, space-separated.

    Matches keep their original order; the result is an empty string when
    nothing matches.
    """
    matches = (hit.group(0) for hit in EMOJI_REGEX.finditer(s))
    return " ".join(matches)

def emoji_fit_transform(train_texts, test_texts):
    """Vectorise the emojis of both splits with a TF-IDF fitted on train only.

    Each text is first reduced to its emoji string via ``extract_emojis``.
    A character-level TF-IDF (1- to 3-grams, ``min_df=1``) is fitted on the
    training strings and then applied to the test strings.

    Returns:
        ``(train_matrix, test_matrix, fitted_vectorizer)``.
    """
    train_docs, test_docs = (
        [extract_emojis(text) for text in corpus]
        for corpus in (train_texts, test_texts)
    )
    vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(1, 3), min_df=1)
    train_matrix = vectorizer.fit_transform(train_docs)
    test_matrix = vectorizer.transform(test_docs)
    return train_matrix, test_matrix, vectorizer

def build_emoji_tfidf(train_texts, test_texts):
    """Return only the ``(train, test)`` emoji TF-IDF matrices.

    Delegates to ``emoji_fit_transform`` and discards the fitted vectorizer.
    """
    train_matrix, test_matrix = emoji_fit_transform(train_texts, test_texts)[:2]
    return train_matrix, test_matrix

def build_text_features():
    """Create an unfitted FeatureUnion of word-level and character-level TF-IDF.

    * ``"w"``: word 1- to 2-grams; the token pattern keeps word runs and also
      emits each non-word, non-space character as its own token.
    * ``"c"``: ``char_wb`` 2- to 5-grams.

    Both vectorizers use ``min_df=2``.
    """
    return FeatureUnion([
        (
            "w",
            TfidfVectorizer(
                analyzer="word",
                ngram_range=(1, 2),
                token_pattern=r"(?u)\b\w+\b|[^\w\s]",
                min_df=2,
            ),
        ),
        (
            "c",
            TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 5), min_df=2),
        ),
    ])

def run_all_ml(args):
    """Train and evaluate a suite of classical classifiers on TF-IDF features.

    Reads the CSV at ``args.data_path``, drops rows missing the text or label
    column, label-encodes the targets and makes a stratified 80/20 split.
    Word + character TF-IDF features are fitted on the training texts and,
    when ``args.use_emoji`` is truthy, concatenated with emoji TF-IDF
    features. Every model is then fitted and its accuracy and classification
    report are printed, followed by a summary sorted by accuracy.

    Args:
        args: Namespace-like object providing ``data_path``, ``text_col``,
            ``label_col`` and ``use_emoji``.

    Returns:
        None. All results are printed.
    """
    df = pd.read_csv(args.data_path, encoding="utf-8")
    df = df.dropna(subset=[args.text_col, args.label_col]).reset_index(drop=True)

    X_text = df[args.text_col].astype(str)
    y_raw = df[args.label_col].astype(str)

    le = LabelEncoder()
    y = le.fit_transform(y_raw)

    X_train_txt, X_test_txt, y_train, y_test = train_test_split(
        X_text, y, test_size=0.2, random_state=42, stratify=y
    )

    # Text TF-IDF (fitted on the training split only)
    text_fu = build_text_features()
    X_train_text = text_fu.fit_transform(X_train_txt)
    X_test_text = text_fu.transform(X_test_txt)

    # Emoji features (optional)
    if args.use_emoji:
        X_train_emoji, X_test_emoji = build_emoji_tfidf(X_train_txt, X_test_txt)
        # hstack yields COO; convert to CSR once instead of letting every
        # estimator below redo the conversion.
        X_train = hstack([X_train_text, X_train_emoji]).tocsr()
        X_test = hstack([X_test_text, X_test_emoji]).tocsr()
    else:
        X_train, X_test = X_train_text, X_test_text

    models = {
        "LogisticRegression": LogisticRegression(max_iter=1000, n_jobs=None),
        # Seeded like the other stochastic models so repeated runs report the
        # same scores.
        "SGDClassifier": SGDClassifier(max_iter=2000, tol=1e-3, random_state=42),
        "GaussianNB": GaussianNB(),  # requires dense
        "KNeighbors": KNeighborsClassifier(n_neighbors=5),
        "RandomForest": RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1),
        "AdaBoost": AdaBoostClassifier(n_estimators=300, random_state=42),
        "Bagging": BaggingClassifier(n_estimators=300, random_state=42, n_jobs=-1),
        "ExtraTrees": ExtraTreesClassifier(n_estimators=400, random_state=42, n_jobs=-1),
        "GradientBoosting": GradientBoostingClassifier(random_state=42)
    }

    results = {}
    for name, model in models.items():
        print(f"\n===== {name} =====")
        if name == "GaussianNB":
            # GaussianNB cannot consume sparse input, so densify for it alone.
            model.fit(X_train.toarray(), y_train)
            y_pred = model.predict(X_test.toarray())
        else:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        print(f"Accuracy: {acc:.4f}")
        print(classification_report(y_test, y_pred, target_names=le.classes_))
        results[name] = acc

    print("\n==== Summary Accuracies ====")
    for k, v in sorted(results.items(), key=lambda kv: kv[1], reverse=True):
        print(f"{k}: {v:.4f}")

# Definitions from cell O0TrAYoBEqjC
class Attention(layers.Layer):
    """Attention pooling that collapses the time axis of a sequence.

    A single-unit ``Dense`` layer scores each timestep, the ``tanh`` of those
    scores is softmax-normalised along the time axis, and the weighted sum of
    the inputs over time is returned. When Keras supplies a timestep mask
    (e.g. from ``Embedding(mask_zero=True)``), masked positions get zero
    weight and the remaining weights are renormalised.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Declare mask awareness so an incoming mask is consumed by call().
        self.supports_masking = True
        self.dense = layers.Dense(1)

    def call(self, inputs, mask=None):
        # inputs: (batch, time, feat); mask, when given: (batch, time)
        scores = tf.nn.tanh(self.dense(inputs))
        weights = tf.nn.softmax(scores, axis=1)
        if mask is not None:
            # Zero the weight of padded steps, then renormalise over the real
            # ones. divide_no_nan yields 0 for rows that are entirely masked.
            keep = tf.expand_dims(tf.cast(mask, weights.dtype), axis=-1)
            weights = weights * keep
            weights = tf.math.divide_no_nan(
                weights, tf.reduce_sum(weights, axis=1, keepdims=True)
            )
        context = tf.reduce_sum(weights * inputs, axis=1)
        return context

    def compute_mask(self, inputs, mask=None):
        # The time axis is reduced away, so there is no mask to pass on.
        return None

def run_bilstm(args):
    """Train a BiLSTM + attention text classifier and print its test metrics.

    Loads the CSV at ``args.data_path``, drops rows missing the text or label
    column, optionally lower-cases the text, label-encodes the targets and
    makes a stratified 80/20 split. A tokenizer fitted on the training texts
    produces post-padded/post-truncated id sequences of length
    ``args.max_len``. The network is Embedding (``mask_zero=True``) ->
    bidirectional LSTM -> ``Attention`` -> Dropout(0.3) -> output layer
    (one sigmoid unit for two classes, otherwise a softmax over all classes).

    Args:
        args: Namespace-like object providing ``data_path``, ``text_col``,
            ``label_col``, ``lower``, ``vocab_size``, ``max_len``,
            ``embedding_dim``, ``lstm_units``, ``epochs`` and ``batch_size``.

    Returns:
        None. The model summary, training log, accuracy and classification
        report are printed.
    """
    frame = pd.read_csv(args.data_path, encoding="utf-8")
    frame = frame.dropna(subset=[args.text_col, args.label_col]).reset_index(drop=True)

    texts = frame[args.text_col].astype(str)
    if args.lower:
        texts = texts.str.lower()

    encoder = LabelEncoder()
    labels = encoder.fit_transform(frame[args.label_col].astype(str))
    n_classes = len(encoder.classes_)
    is_binary = n_classes == 2

    train_texts, test_texts, train_labels, test_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42, stratify=labels
    )

    # Vocabulary comes from the training split only.
    tok = Tokenizer(num_words=args.vocab_size, lower=args.lower, filters="")
    tok.fit_on_texts(train_texts)

    def encode(batch):
        # Texts -> id sequences -> fixed-length, post-padded/truncated arrays.
        return pad_sequences(
            tok.texts_to_sequences(batch),
            maxlen=args.max_len,
            padding="post",
            truncating="post",
        )

    train_ids = encode(train_texts)
    test_ids = encode(test_texts)

    embedding_rows = min(args.vocab_size, len(tok.word_index) + 1)

    inputs = layers.Input(shape=(args.max_len,))
    hidden = layers.Embedding(embedding_rows, args.embedding_dim, mask_zero=True)(inputs)
    hidden = layers.Bidirectional(layers.LSTM(args.lstm_units, return_sequences=True))(hidden)
    hidden = Attention()(hidden)
    hidden = layers.Dropout(0.3)(hidden)
    if is_binary:
        head, loss_name = layers.Dense(1, activation="sigmoid"), "binary_crossentropy"
    else:
        head = layers.Dense(n_classes, activation="softmax")
        loss_name = "sparse_categorical_crossentropy"
    outputs = head(hidden)

    net = models.Model(inputs, outputs)
    net.compile(optimizer="adam", loss=loss_name, metrics=["accuracy"])
    net.summary()

    net.fit(
        train_ids,
        train_labels,
        validation_split=0.1,
        epochs=args.epochs,
        batch_size=args.batch_size,
        verbose=2,
    )

    probabilities = net.predict(test_ids, verbose=0)
    if is_binary:
        predicted = (probabilities.ravel() >= 0.5).astype(int)
    else:
        predicted = probabilities.argmax(axis=1)

    from sklearn.metrics import accuracy_score, classification_report
    score = accuracy_score(test_labels, predicted)
    print(f"[BiLSTM+Attention] Accuracy: {score:.4f}")
    print(classification_report(test_labels, predicted, target_names=encoder.classes_))


def parse_args(argv=None):
    """Build the experiment option parser and parse ``argv``.

    Args:
        argv: Optional sequence of command-line tokens. When omitted (or
            ``None``) an empty list is parsed, so every option takes its
            default and the notebook kernel's own ``sys.argv`` is never read.

    Returns:
        argparse.Namespace with ``model``, ``data_path``, ``text_col``,
        ``label_col``, the deep-learning options (``epochs``, ``batch_size``,
        ``max_len``, ``vocab_size``, ``embedding_dim``, ``lstm_units``,
        ``lower``) and the classical-ML option ``use_emoji``.
    """
    def str_to_bool(s):
        # "true"/"1"/"yes" (any case) are True; every other string is False.
        return s.lower() in ["true", "1", "yes"]

    p = argparse.ArgumentParser()
    # Not required: falls back to the classical-ML pipeline.
    p.add_argument("--model", choices=["bilstm", "ml"], required=False, default="ml")
    p.add_argument("--data_path", type=str, default="/content/Complete Data With Emoji.csv")
    p.add_argument("--text_col", type=str, default="Tweet_Text_With_Emoji")
    p.add_argument("--label_col", type=str, default="Label")
    # DL params
    p.add_argument("--epochs", type=int, default=5)
    p.add_argument("--batch_size", type=int, default=64)
    p.add_argument("--max_len", type=int, default=80)
    p.add_argument("--vocab_size", type=int, default=30000)
    p.add_argument("--embedding_dim", type=int, default=128)
    p.add_argument("--lstm_units", type=int, default=64)
    p.add_argument("--lower", type=str_to_bool, default=True)
    # ML params
    p.add_argument("--use_emoji", type=str_to_bool, default=True)
    # An explicit list keeps argparse away from the notebook kernel's argv.
    return p.parse_args([] if argv is None else list(argv))

# Parse the default options and run the selected pipeline: the BiLSTM model
# when --model is "bilstm", otherwise the classical-ML suite.
args = parse_args()
if args.model == "bilstm":
    run_bilstm(args)
else:
    run_all_ml(args)


===== LogisticRegression =====
Accuracy: 0.9630
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1211
           1       0.94      0.92      0.93       145
           2       0.96      0.94      0.95       628
           3       0.86      0.95      0.90        80
           4       0.94      0.93      0.94        71

    accuracy                           0.96      2135
   macro avg       0.94      0.94      0.94      2135
weighted avg       0.96      0.96      0.96      2135


===== SGDClassifier =====
Accuracy: 0.9803
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1211
           1       0.96      0.95      0.96       145
           2       0.98      0.97      0.97       628
           3       0.92      0.97      0.95        80
           4       0.96      0.94      0.95        71

    accuracy                           0.98      2135
   macro avg       0.96      0.97     