In [None]:
# Standard library
import json
import re

# Data handling libraries
import numpy as np
import pandas as pd
from pandas import json_normalize

# Natural Language Processing (NLP) libraries
import stanza
from nltk.corpus import stopwords

# Scikit-learn modeling libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier  # lives in sklearn.linear_model, not sklearn.svm
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

In [6]:

# Read the JSON-lines training file and flatten nested tweet objects into
# dotted column names (e.g. "extended_tweet.full_text").
path = '../data/Kaggle2025/train.jsonl'
df = json_normalize(pd.read_json(path, lines=True).to_dict(orient='records'))

In [None]:
# Lemmatizing in French Language
# One shared Stanza pipeline (tokenizer, multi-word-token expansion, POS tagger,
# lemmatizer); lemmatize_text() calls it once per tweet.
nlp = stanza.Pipeline(
    processors='tokenize,mwt,pos,lemma',
    lang='fr',
)


def extract_text(row):
    """Return the most complete tweet text available in a flattened tweet record.

    Fields are tried in this order and the first non-missing value wins:
    ``extended_tweet.full_text``, ``quoted_status.extended_tweet.full_text``,
    ``quoted_status.text``, then the tweet's own ``text``.

    Args:
        row: Record supporting ``.get`` (a ``pandas.Series`` row when used
            with ``DataFrame.apply(axis=1)``, or a plain dict).

    Returns:
        The first non-missing candidate, or ``""`` when every candidate is
        absent or NaN/None, so the string cleaning that follows never
        receives a missing value.
    """
    # NOTE(review): quoted-status text outranks the tweet's own ``text`` for
    # non-extended tweets -- confirm this priority is intended.
    candidate_fields = (
        "extended_tweet.full_text",
        "quoted_status.extended_tweet.full_text",
        "quoted_status.text",
        "text",
    )
    for field in candidate_fields:
        value = row.get(field)
        if pd.notna(value):
            return value
    return ""

# Row-wise pass: pick the best available raw text for every tweet.
df["clean_text"] = df.apply(extract_text, axis="columns")


def clean_french_tweet(text):
    """Normalise a raw French tweet into lowercase, space-separated tokens.

    Steps, in order: lowercase; replace URLs and @mentions with a space; keep
    the word of each hashtag without the ``#``; replace every remaining
    character that is neither a word character, whitespace, nor in the
    Latin-1 accented range U+00C0-U+00FF with a space; collapse whitespace.

    Args:
        text: Raw tweet text. Non-string input (e.g. NaN/None for a missing
            tweet) is treated as empty instead of raising.

    Returns:
        The cleaned string; ``""`` when nothing remains.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+", " ", text)       # URLs
    text = re.sub(r"@\w+", " ", text)          # @mentions
    text = re.sub(r"#(\w+)", r" \1 ", text)    # hashtags: keep the word only
    # The accented range is written with \u escapes (understood by ``re``) so
    # the pattern cannot be corrupted by a file-encoding mix-up again.
    text = re.sub(r"[^\w\s\u00C0-\u00FF]", " ", text)
    return re.sub(r"\s+", " ", text).strip()
# Element-wise normalisation of the extracted text (overwrites the column in place).
df["clean_text"] = df['clean_text'].map(clean_french_tweet)


def lemmatize_text(text):
    """Lemmatize a cleaned tweet with the module-level Stanza pipeline ``nlp``.

    Args:
        text: Cleaned tweet text.

    Returns:
        The lemmas of all words in all sentences joined by single spaces
        (words whose lemma is ``None`` are skipped). Empty, whitespace-only
        or non-string input returns ``""`` without invoking the pipeline;
        cleaning can legitimately produce empty strings (e.g. a tweet that
        was only a URL or a mention).
    """
    if not isinstance(text, str) or not text.strip():
        return ""
    doc = nlp(text)
    return " ".join(
        word.lemma
        for sent in doc.sentences
        for word in sent.words
        if word.lemma is not None
    )
# Lemmatize every cleaned tweet (one pipeline call per row), then preview the result.
df['lemmatized_text'] = df['clean_text'].map(lemmatize_text)
df['lemmatized_text'].head()

2025-11-17 23:29:55 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 6.35MB/s]                    
2025-11-17 23:29:55 INFO: Downloaded file to C:\Users\Asus\stanza_resources\resources.json
2025-11-17 23:29:56 INFO: Loading these models for language: fr (French):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |

2025-11-17 23:29:56 INFO: Using device: cpu
2025-11-17 23:29:56 INFO: Loading: tokenize
2025-11-17 23:29:56 INFO: Loading: mwt
2025-11-17 23:29:56 INFO: Loading: pos
2025-11-17 23:29:58 INFO: Loading: lemma
2025-11-17 23:29:58 INFO: Done loading processors!


0    direct jean castex et olivier véran annoncer d...
1    direct jean castex et olivier véran annoncer d...
2    on être de accord pour le conséquence économiq...
3    renforcer le capacité de dépistage et le actio...
4    on moi dire dans le oreillette que le patient ...
Name: lemmatized_text, dtype: object

In [41]:
# TF-IDF over unigrams and bigrams of the lemmatized tweets, keeping at most
# 50,000 terms that occur in at least 3 documents; NLTK French stop words are dropped.
# NOTE(review): the vectorizer is fitted on the full frame before the
# train/validation split, so validation text contributes to the vocabulary/IDF.
french_stopwords = stopwords.words('french')
Vectorize = TfidfVectorizer(
    stop_words=french_stopwords,
    ngram_range=(1, 2),
    max_features=50000,
    min_df=3,
)
X = Vectorize.fit_transform(df['lemmatized_text'])
y = df['label'].values
print(X.shape)

(154914, 50000)


In [42]:
# Train-test splitting

# Sanity check: exactly one label per feature row.
print(X.shape[0] == len(y))

# Stratified 80/20 hold-out with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=40,
)

print("Train size:", X_train.shape[0], "| Val size:", X_val.shape[0])

True
Train size: 123931 | Val size: 30983


In [43]:
# Model design: class-weighted linear SVM.
svm = LinearSVC(max_iter=5000, class_weight="balanced")

# Candidate values for C (inverse of regularization strength).
param_grid = dict(C=[0.01, 0.1, 1, 10, 100])

# 3-fold grid search scored on accuracy, fitted on the training split.
grid = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
).fit(X_train, y_train)

# Score the search's predictions on the validation split.
y_pred = grid.predict(X_val)

print("Best C:", grid.best_params_["C"])
print("\nValidation Accuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:", classification_report(y_val, y_pred), sep="\n")

Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best C: 0.1

Validation Accuracy: 0.6328631830358584

Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.64      0.65     16535
           1       0.60      0.62      0.61     14448

    accuracy                           0.63     30983
   macro avg       0.63      0.63      0.63     30983
weighted avg       0.63      0.63      0.63     30983



In [44]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV


# Define model
# SGD training is stochastic; a fixed random_state makes the grid search and
# the reported scores reproducible from run to run.
model = SGDClassifier(class_weight='balanced', max_iter=1000, random_state=42)

# Hyperparameter grid
param_grid = {
    'loss': ['hinge', 'squared_hinge'],
    # alpha multiplies the regularization term: larger alpha means stronger
    # regularization, i.e. it moves in the opposite direction to LinearSVC's C.
    'alpha': [1e-3, 1e-4, 1e-5]
}

# 3-fold cross-validated search scored on accuracy, using all cores.
grid = GridSearchCV(
    model,
    param_grid,
    scoring='accuracy',
    cv=3,
    verbose=2,
    n_jobs=-1
)

# Fit to training set
grid.fit(X_train, y_train)

print("Best Params:", grid.best_params_)

# Evaluate
y_pred = grid.predict(X_val)
print("\nValidation Accuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_pred))

Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best Params: {'alpha': 0.0001, 'loss': 'hinge'}

Validation Accuracy: 0.6268276151437885

Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.63      0.64     16535
           1       0.60      0.62      0.61     14448

    accuracy                           0.63     30983
   macro avg       0.63      0.63      0.63     30983
weighted avg       0.63      0.63      0.63     30983



In [None]:
# FastText implementation on the non-lemmatized text (lemmatization reduced accuracy)

In [65]:
from gensim.models.fasttext import load_facebook_vectors
from gensim.utils import simple_preprocess
import gensim.downloader as api
from gensim.models import KeyedVectors
import urllib.request
import gzip
import shutil
import os

fasttext_url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.vec.gz"
local_path = "cc.fr.300.vec.gz"
vec_file = "cc.fr.300.vec"

# Download and unzip only when the extracted vectors are not cached locally.
if not os.path.exists(vec_file):
    print("‚è≥ Downloading FastText French vectors...")
    urllib.request.urlretrieve(fasttext_url, local_path)
    print("‚úÖ Downloaded. Unzipping...")
    # Decompress to a temporary name and rename only on success: an interrupted
    # run must not leave a truncated file that the existence check above would
    # accept as a valid cache on the next run.
    partial_vec_file = vec_file + ".part"
    with gzip.open(local_path, 'rb') as f_in, open(partial_vec_file, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    os.replace(partial_vec_file, vec_file)
    print("‚úÖ Unzipped.")


print("üîÅ Loading word vectors into memory...")
ft_model = KeyedVectors.load_word2vec_format(vec_file)


def text_to_vector(text, model, dim=300, deacc=True):
    """Embed a text as the mean of its in-vocabulary word vectors.

    Args:
        text: Text to embed; tokenized with gensim's ``simple_preprocess``.
            Non-string input (e.g. NaN) is treated as empty.
        model: Word-vector lookup supporting ``word in model`` and
            ``model[word]``.
        dim: Length of the zero vector returned when no token is found; only
            used if ``model`` has no ``vector_size`` attribute.
        deacc: Forwarded to ``simple_preprocess``. The default ``True`` keeps
            the original behaviour of stripping accents before the lookup.
            NOTE(review): the vectors are French, so ``deacc=False`` may
            match the vocabulary better -- evaluate before changing the default.

    Returns:
        A 1-D ``float32`` array. The zero fallback uses the same dtype as the
        averaged vectors so that stacking many results does not silently
        upcast the whole matrix to float64.
    """
    if not isinstance(text, str):
        text = ""
    tokens = simple_preprocess(text, deacc=deacc)
    vectors = [model[word] for word in tokens if word in model]
    if not vectors:
        return np.zeros(getattr(model, "vector_size", dim), dtype=np.float32)
    return np.mean(vectors, axis=0).astype(np.float32, copy=False)


print("üîÑ Converting tweets to vectors...")
# One mean word-vector per cleaned (non-lemmatized) tweet, stacked into a 2-D matrix.
X = np.vstack([text_to_vector(tweet, ft_model) for tweet in df["clean_text"]])
y = df["label"].values

print("‚úÖ Vector shape:", X.shape)

# Stratified 80/20 hold-out with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Class-weighted linear SVM on the dense embeddings.
model = LinearSVC(max_iter=5000, class_weight="balanced").fit(X_train, y_train)
y_pred = model.predict(X_val)

print("\n‚úÖ Validation Accuracy:", accuracy_score(y_val, y_pred))
print("\nüßæ Classification Report:\n")
print(classification_report(y_val, y_pred))


üîÅ Loading word vectors into memory...
üîÑ Converting tweets to vectors...
‚úÖ Vector shape: (154914, 300)

‚úÖ Validation Accuracy: 0.610076493560985

üßæ Classification Report:

              precision    recall  f1-score   support

           0       0.64      0.62      0.63     16535
           1       0.58      0.60      0.59     14448

    accuracy                           0.61     30983
   macro avg       0.61      0.61      0.61     30983
weighted avg       0.61      0.61      0.61     30983



In [47]:
# Feature Selection

import pandas as pd
import numpy as np
from datetime import datetime

def extract_metadata(df):
    """Build the account-level feature table from a flattened tweet frame.

    Each feature is read from the ``user.*`` column and, row by row, falls
    back to the matching ``quoted_status.user.*`` column where the primary
    value is missing. Either column may be absent from ``df``; if both are
    absent the feature is all-NaN.

    Args:
        df: Flattened tweet DataFrame with dotted column names.

    Returns:
        DataFrame indexed like ``df`` with columns ``followers_count``,
        ``friends_count``, ``statuses_count``, ``favourites_count``,
        ``listed_count``, ``verified`` (as float) and
        ``followers_to_friends_ratio``. Missing values are NOT imputed here.
    """
    def _coalesce(primary, fallback):
        # Row-wise "first non-missing" of two optional columns.
        has_primary = primary in df.columns
        has_fallback = fallback in df.columns
        if has_primary and has_fallback:
            first = df[primary]
            return first.where(first.notna(), df[fallback])
        if has_primary:
            return df[primary]
        if has_fallback:
            return df[fallback]
        return pd.Series(np.nan, index=df.index)

    meta = pd.DataFrame(index=df.index)

    count_fields = (
        "followers_count",
        "friends_count",
        "statuses_count",
        "favourites_count",
        "listed_count",
    )
    for field in count_fields:
        meta[field] = _coalesce(f"user.{field}", f"quoted_status.user.{field}")
    meta["verified"] = _coalesce("user.verified", "quoted_status.user.verified").astype(float)

    # Followers-to-friends ratio (+1 in the denominator avoids division by zero).
    meta["followers_to_friends_ratio"] = meta["followers_count"] / (meta["friends_count"] + 1)

    return meta
from sklearn.feature_selection import SelectKBest, f_classif

# Account-level features; gaps are imputed with each column's median.
X_meta = extract_metadata(df).pipe(lambda meta: meta.fillna(meta.median()))
y = df["label"]

print("X_meta shape:", X_meta.shape)
print(X_meta.head())

# Univariate F-test of every metadata feature against the label
# (k='all' keeps every feature; only the scores are of interest here).
selector = SelectKBest(k='all', score_func=f_classif).fit(X_meta, y)

print("\nFeature Scores (F-test):")
for name, score in zip(X_meta.columns, selector.scores_):
    print(f"{name}: {score:.2f}")


X_meta shape: (154914, 7)
   followers_count  friends_count  statuses_count  favourites_count  \
0        1338833.0          747.0             333             14154   
1        1338833.0          747.0            3028              8582   
2          89020.0          579.0            4238              1229   
3          89020.0          579.0            1152                19   
4          89020.0          579.0            1252              1375   

   listed_count  verified  followers_to_friends_ratio  
0             5       1.0                 1789.883690  
1             1       1.0                 1789.883690  
2            27       0.0                  179.035714  
3            92       0.0                  179.035714  
4             1       0.0                  179.035714  

Feature Scores (F-test):
followers_count: 0.06
friends_count: 27.95
statuses_count: 13285.75
favourites_count: 3395.44
listed_count: 962.60
verified: 31.19
followers_to_friends_ratio: 4.03


In [None]:

# TF IDF vectorization with metadata for SVM
from scipy.sparse import csr_matrix, hstack

# Metadata columns kept for modelling (followers_count is left out).
X_meta_selected = X_meta.loc[:, [
    "statuses_count",
    "favourites_count",
    "listed_count",
    "verified",
    "friends_count",
    "followers_to_friends_ratio",
]]

# Sparse TF-IDF text features, re-fitted on the lemmatized tweets.
X = Vectorize.fit_transform(df['lemmatized_text'])

# Standardize the metadata columns before joining them to the text features.
scaler = StandardScaler()
X_meta_scaled = scaler.fit_transform(X_meta_selected)

# Append the scaled metadata as extra sparse columns to the right of the TF-IDF block.
X_meta_sparse = csr_matrix(X_meta_scaled)
X_combined = hstack((X, X_meta_sparse))


In [54]:

# Stratified 80/20 hold-out of the TF-IDF + metadata matrix.
X_train, X_val, y_train, y_val = train_test_split(
    X_combined,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Class-weighted linear SVM, scored on the held-out split.
model = LinearSVC(max_iter=50000, class_weight='balanced').fit(X_train, y_train)
y_pred = model.predict(X_val)

print("Accuracy:", accuracy_score(y_val, y_pred))
print("Classification Report:", classification_report(y_val, y_pred), sep="\n")


Accuracy: 0.7388245166704321
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.78      0.76     16535
           1       0.73      0.69      0.71     14448

    accuracy                           0.74     30983
   macro avg       0.74      0.74      0.74     30983
weighted avg       0.74      0.74      0.74     30983



In [38]:
# Model design: class-weighted linear SVM on the combined features.
svm = LinearSVC(max_iter=5000, class_weight="balanced")

# Candidate values for C (inverse of regularization strength).
param_grid = dict(C=[0.01, 0.1, 1, 10, 100])

# 3-fold grid search scored on accuracy, fitted on the training split.
grid = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
).fit(X_train, y_train)

# Score the search's predictions on the validation split.
y_pred = grid.predict(X_val)

print("Best C:", grid.best_params_["C"])
print("\nValidation Accuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:", classification_report(y_val, y_pred), sep="\n")

Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best C: 0.1

Validation Accuracy: 0.7759416454184552

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.91      0.81     16535
           1       0.85      0.63      0.72     14448

    accuracy                           0.78     30983
   macro avg       0.79      0.77      0.77     30983
weighted avg       0.79      0.78      0.77     30983



In [51]:
# Define model
# SGD training is stochastic; a fixed random_state makes the grid search and
# the reported scores reproducible from run to run.
model = SGDClassifier(class_weight='balanced', max_iter=1000, random_state=42)

# Hyperparameter grid
param_grid = {
    'loss': ['hinge', 'squared_hinge'],
    # alpha multiplies the regularization term: larger alpha means stronger
    # regularization, i.e. it moves in the opposite direction to LinearSVC's C.
    'alpha': [1e-3, 1e-4, 1e-5]
}

# 3-fold cross-validated search scored on accuracy, using all cores.
grid = GridSearchCV(
    model,
    param_grid,
    scoring='accuracy',
    cv=3,
    verbose=2,
    n_jobs=-1
)

# Fit to training set
grid.fit(X_train, y_train)

print("Best Params:", grid.best_params_)

# Evaluate
y_pred = grid.predict(X_val)
print("\nValidation Accuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_pred))

Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best Params: {'alpha': 1e-05, 'loss': 'hinge'}

Validation Accuracy: 0.7779427427944356

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.91      0.81     16535
           1       0.86      0.63      0.72     14448

    accuracy                           0.78     30983
   macro avg       0.80      0.77      0.77     30983
weighted avg       0.79      0.78      0.77     30983



In [55]:
def text_to_vector(text, model, dim=300, deacc=True):
    """Embed a text as the mean of its in-vocabulary word vectors.

    Args:
        text: Text to embed; tokenized with gensim's ``simple_preprocess``.
            Non-string input (e.g. NaN) is treated as empty.
        model: Word-vector lookup supporting ``word in model`` and
            ``model[word]``.
        dim: Length of the zero vector returned when no token is found; only
            used if ``model`` has no ``vector_size`` attribute.
        deacc: Forwarded to ``simple_preprocess``. The default ``True`` keeps
            the original behaviour of stripping accents before the lookup.
            NOTE(review): the vectors are French, so ``deacc=False`` may
            match the vocabulary better -- evaluate before changing the default.

    Returns:
        A 1-D ``float32`` array. The zero fallback uses the same dtype as the
        averaged vectors so that stacking many results does not silently
        upcast the whole matrix to float64.
    """
    if not isinstance(text, str):
        text = ""
    tokens = simple_preprocess(text, deacc=deacc)  # tokenize & clean
    vectors = [model[word] for word in tokens if word in model]
    if not vectors:
        return np.zeros(getattr(model, "vector_size", dim), dtype=np.float32)
    return np.mean(vectors, axis=0).astype(np.float32, copy=False)

# Embed every lemmatized tweet and stack the vectors into one dense matrix.
X_fasttext = np.vstack([text_to_vector(tweet, ft_model) for tweet in df["lemmatized_text"]])
print("FastText shape:", X_fasttext.shape)

# Re-fit the scaler on the selected metadata columns.
scaler = StandardScaler()
X_meta_scaled = scaler.fit_transform(X_meta_selected)

# Both blocks are dense, so a plain column-wise stack is enough.
X_combined = np.hstack((X_fasttext, X_meta_scaled))

# Stratified 80/20 hold-out with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X_combined,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Class-weighted linear SVM, scored on the held-out split.
model = LinearSVC(max_iter=5000, class_weight="balanced").fit(X_train, y_train)
y_pred = model.predict(X_val)

print("‚úÖ Final FastText + Metadata Accuracy:", accuracy_score(y_val, y_pred))
print("üìä Final Classification Report:\n")
print(classification_report(y_val, y_pred))


FastText shape: (154914, 300)
‚úÖ Final FastText + Metadata Accuracy: 0.7757802665978117
üìä Final Classification Report:

              precision    recall  f1-score   support

           0       0.74      0.90      0.81     16535
           1       0.85      0.63      0.72     14448

    accuracy                           0.78     30983
   macro avg       0.79      0.77      0.77     30983
weighted avg       0.79      0.78      0.77     30983



In [64]:
# Final run: LinearSVC on FastText (lemmatized text) + scaled metadata features.
# NOTE(review): the grid-searched SGDClassifier on TF-IDF + metadata
# (Best Params: {'alpha': 1e-05, 'loss': 'hinge'}) scored marginally higher on
# validation (0.7779 vs 0.7758); this cell nevertheless submits the
# FastText + metadata LinearSVC -- confirm which model should be submitted.

# ---------- Load Test Set ----------
test_df = pd.read_json("../data/Kaggle2025/kaggle_test.jsonl", lines=True)
test_df = pd.json_normalize(test_df.to_dict(orient='records'))
print("Test shape:", test_df.shape)

# ---------- Clean + Lemmatize Text ----------
# extract_text, clean_french_tweet, lemmatize_text, text_to_vector and
# extract_metadata are reused from the cells above instead of being redefined
# here, so train and test always go through exactly the same code.
test_df["clean_text"] = test_df.apply(extract_text, axis=1).apply(clean_french_tweet)
# The training matrix X_fasttext was built from df["lemmatized_text"], so the
# test tweets must be lemmatized as well before embedding; otherwise the model
# is trained and applied on differently distributed features.
# (One Stanza call per tweet: this step is slow on CPU.)
test_df["lemmatized_text"] = test_df["clean_text"].apply(lemmatize_text)

# ---------- FastText Vectorization ----------
X_fasttext_test = np.vstack(test_df["lemmatized_text"].apply(lambda x: text_to_vector(x, ft_model)))
print("FastText vector shape:", X_fasttext_test.shape)

# ---------- Metadata Extraction ----------
# Impute with the TRAINING medians (X_meta) rather than medians computed on the
# test set, so both sets are filled with the same values.
X_meta_test = extract_metadata(test_df).fillna(X_meta.median())

# ---------- Select Same Features as Training ----------
# Taking the column list from X_meta_selected guarantees identical columns and order.
X_meta_test_selected = X_meta_test[list(X_meta_selected.columns)]

# ---------- Scale and Combine ----------
X_meta_scaled_test = scaler.transform(X_meta_test_selected)
X_test_combined = np.hstack([X_fasttext_test, X_meta_scaled_test])

# ---------- Train Final Model on ALL Data ----------
X_all = np.hstack([X_fasttext, scaler.transform(X_meta_selected)])
assert X_all.shape[1] == X_test_combined.shape[1], "train/test feature width mismatch"
model_final = LinearSVC(class_weight="balanced", max_iter=5000)
model_final.fit(X_all, y)

# ---------- Predict and Export ----------
y_kaggle_pred = model_final.predict(X_test_combined)

kaggle_id = test_df['challenge_id']
submission = pd.DataFrame({
    "ID": kaggle_id.values,
    "Prediction": y_kaggle_pred
})
assert len(submission) == len(test_df), "one prediction per test row expected"

submission.to_csv("submission.csv", index=False)
print("‚úÖ Submission file saved as submission.csv")


Test shape: (103380, 191)
FastText vector shape: (103380, 300)
‚úÖ Submission file saved as submission.csv
