In [None]:
# Data handling libraries

import re
import json
import numpy as np
import pandas as pd
import stanza
from pandas import json_normalize

# Natural Language Processing (NLP) libraries
from nltk.corpus import stopwords

# Scikit-learn modeling libraries

from sklearn.svm import LinearSVC, SDGClassifier
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score 
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline 

In [6]:

path = '../data/Kaggle2025/train.jsonl'
df = pd.read_json(path, lines=True)
df = json_normalize(df.to_dict(orient='records'))

In [7]:
# !pip install stanza

# stanza.download("fr")

In [10]:
nlp = stanza.Pipeline(lang='fr', processors='tokenize,mwt,pos,lemma')


def extract_text(row):
    if pd.notna(row.get("extended_tweet.full_text")):
        return row["extended_tweet.full_text"]
    elif pd.notna(row.get("quoted_status.extended_tweet.full_text")):
        return row["quoted_status.extended_tweet.full_text"]
    elif pd.notna(row.get("quoted_status.text")):
        return row["quoted_status.text"]
    else:
        return row.get("text", "")

df["clean_text"] = df.apply(extract_text, axis=1)


def clean_french_tweet(text):
    text = text.lower()
    text = re.sub(r"http\S+", " ", text)               
    text = re.sub(r"@\w+", " ", text)                  
    text = re.sub(r"#(\w+)", r" \1 ", text)            
    text = re.sub(r"[^\w\s√Ä-√ø]", " ", text)            
    text = re.sub(r"\s+", " ", text).strip()           
    return text
df["clean_text"] = df['clean_text'].apply(clean_french_tweet)


def lemmatize_text(text):
    doc = nlp(text)
    lemmas = [word.lemma for sent in doc.sentences for word in sent.words if word.lemma is not None]
    return " ".join(lemmas)
df['lemmatized_text'] = df['clean_text'].apply(lemmatize_text)

# print("cleaned French Text Samples: ")
# print(df["clean_text"].head())

df['lemmatized_text'].head()

2025-11-17 23:29:55 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 6.35MB/s]                    
2025-11-17 23:29:55 INFO: Downloaded file to C:\Users\Asus\stanza_resources\resources.json
2025-11-17 23:29:56 INFO: Loading these models for language: fr (French):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |

2025-11-17 23:29:56 INFO: Using device: cpu
2025-11-17 23:29:56 INFO: Loading: tokenize
2025-11-17 23:29:56 INFO: Loading: mwt
2025-11-17 23:29:56 INFO: Loading: pos
2025-11-17 23:29:58 INFO: Loading: lemma
2025-11-17 23:29:58 INFO: Done loading processors!


0    direct jean castex et olivier v√©ran annoncer d...
1    direct jean castex et olivier v√©ran annoncer d...
2    on √™tre de accord pour le cons√©quence √©conomiq...
3    renforcer le capacit√© de d√©pistage et le actio...
4    on moi dire dans le oreillette que le patient ...
Name: lemmatized_text, dtype: object

In [11]:
french_stopwords = stopwords.words('french')
Vectorize = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=3,
    max_features=50000,
    stop_words=french_stopwords
)
X = Vectorize.fit_transform(df['clean_text'])
print(X.shape)
y = df['label'].values

(154914, 50000)


In [12]:
# Train-test splitting

print(len(y) == X.shape[0])

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify = y, random_state = 40
)

print("Train size:", X_train.shape[0], "| Val size:", X_val.shape[0])

True
Train size: 123931 | Val size: 30983


In [20]:
# Model design
svm = LinearSVC(class_weight="balanced", max_iter=5000)

# Grid of C values (inverse of regularization strength)
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100]
}

# Grid search
grid = GridSearchCV(
    svm,
    param_grid,
    scoring='accuracy',
    cv=3,
    verbose=2,
    n_jobs=-1
)

# Fit on train data
grid.fit(X_train, y_train)

# Predict and evaluate
y_pred = grid.predict(X_val)

print("Best C:", grid.best_params_["C"])
print("\nValidation Accuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_pred))

Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best C: 10

Validation Accuracy: 0.6107865603718168

Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.62      0.63     16535
           1       0.58      0.60      0.59     14448

    accuracy                           0.61     30983
   macro avg       0.61      0.61      0.61     30983
weighted avg       0.61      0.61      0.61     30983



In [19]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV


# Define model
model = SGDClassifier(class_weight='balanced', max_iter=1000)

# Hyperparameter grid
param_grid = {
    'loss': ['hinge', 'squared_hinge'],
    'alpha': [1e-3, 1e-4, 1e-5]  # Note: SGDClassifier uses 'alpha' = 1/C
}

grid = GridSearchCV(
    model,
    param_grid,
    scoring='accuracy',
    cv=3,
    verbose=2,
    n_jobs=-1
)

# Fit to training set
grid.fit(X_train, y_train)

print("Best Params:", grid.best_params_)

# Evaluate
y_pred = grid.predict(X_val)
print("\nValidation Accuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_pred))

Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best Params: {'alpha': 1e-05, 'loss': 'hinge'}

Validation Accuracy: 0.606106574573153

Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.60      0.62     16535
           1       0.57      0.62      0.59     14448

    accuracy                           0.61     30983
   macro avg       0.61      0.61      0.61     30983
weighted avg       0.61      0.61      0.61     30983



In [15]:
# FAST TEXT Implementation

In [16]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp311-cp311-win_amd64.whl.metadata (8.6 kB)
Downloading gensim-4.4.0-cp311-cp311-win_amd64.whl (24.4 MB)
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---- ----------------------------------- 2.9/24.4 MB 14.0 MB/s eta 0:00:02
   --------- ------------------------------ 6.0/24.4 MB 14.8 MB/s eta 0:00:02
   --------------- ------------------------ 9.4/24.4 MB 15.1 MB/s eta 0:00:01
   -------------------- ------------------- 12.6/24.4 MB 15.2 MB/s eta 0:00:01
   --------------------------- ------------ 16.5/24.4 MB 15.5 MB/s eta 0:00:01
   --------------------------------- ------ 20.4/24.4 MB 16.1 MB/s eta 0:00:01
   ---------------------------------------  24.1/24.4 MB 16.3 MB/s eta 0:00:01
   ---------------------------------------- 24.4/24.4 MB 15.2 MB/s eta 0:00:00
Installing collected packages: gensim
Successfully installed gensim-4.4.0



[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: C:\Users\Asus\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [None]:

from gensim.models.fasttext import load_facebook_vectors
from gensim.utils import simple_preprocess
import gensim.downloader as api


# -----------------------------------------------
# Step 2: Download Pretrained FastText French Embeddings
# -----------------------------------------------
# Download from fasttext.cc (https://fasttext.cc/docs/en/crawl-vectors.html)
# French: cc.fr.300.vec.gz

import urllib.request
import gzip
import shutil
import os

fasttext_url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.vec.gz"
local_path = "cc.fr.300.vec.gz"
vec_file = "cc.fr.300.vec"

if not os.path.exists(vec_file):
    print("‚è≥ Downloading FastText French vectors...")
    urllib.request.urlretrieve(fasttext_url, local_path)
    print("‚úÖ Downloaded. Unzipping...")
    with gzip.open(local_path, 'rb') as f_in:
        with open(vec_file, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    print("‚úÖ Unzipped.")

# Load as KeyedVectors
from gensim.models import KeyedVectors
print("üîÅ Loading word vectors into memory...")
ft_model = KeyedVectors.load_word2vec_format(vec_file)

# -----------------------------------------------
# Step 3: Convert Each Tweet to Mean Word Vector
# -----------------------------------------------

def text_to_vector(text, model, dim=300):
    tokens = simple_preprocess(text, deacc=True)  # tokenize and remove punct
    vectors = [model[word] for word in tokens if word in model]
    if len(vectors) == 0:
        return np.zeros(dim)
    return np.mean(vectors, axis=0)

# Apply to all tweets
print("üîÑ Converting tweets to vectors...")
X = np.vstack(df["clean_text"].apply(lambda x: text_to_vector(x, ft_model)))
y = df["label"].values

print("‚úÖ Vector shape:", X.shape)

# -----------------------------------------------
# Step 4: Train/Validation Split
# -----------------------------------------------
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# -----------------------------------------------
# Step 5: Train SVM
# -----------------------------------------------
model = LinearSVC(class_weight="balanced", max_iter=5000)
model.fit(X_train, y_train)

# -----------------------------------------------
# Step 6: Evaluate
# -----------------------------------------------
y_pred = model.predict(X_val)

print("\n‚úÖ Validation Accuracy:", accuracy_score(y_val, y_pred))
print("\nüßæ Classification Report:\n")
print(classification_report(y_val, y_pred))


‚è≥ Downloading FastText French vectors...
‚úÖ Downloaded. Unzipping...
‚úÖ Unzipped.
üîÅ Loading word vectors into memory...
üîÑ Converting tweets to vectors...
‚úÖ Vector shape: (154914, 300)

‚úÖ Validation Accuracy: 0.610076493560985

üßæ Classification Report:

              precision    recall  f1-score   support

           0       0.64      0.62      0.63     16535
           1       0.58      0.60      0.59     14448

    accuracy                           0.61     30983
   macro avg       0.61      0.61      0.61     30983
weighted avg       0.61      0.61      0.61     30983

