In [None]:
import pandas as pd
import re
import numpy as np
import emoji
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
def clean_tweet(text):
    """Normalise a raw tweet before vectorisation.

    Removes ``@mentions`` and ``http(s)://`` URLs, lower-cases the text and
    trims surrounding whitespace.

    Parameters
    ----------
    text : object
        Raw tweet. Non-string values are converted with ``str``; missing
        values (``None``, ``NaN``, ``pd.NA``) yield an empty string instead of
        the literal text ``"nan"`` / ``"none"``.

    Returns
    -------
    str
        The cleaned tweet.
    """
    if not isinstance(text, str):
        # Empty CSV cells arrive as NaN; str(NaN) would inject a bogus "nan" token.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ''
        text = str(text)
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    text = re.sub(r'https?://\S+', '', text)
    return text.lower().strip()

def custom_tokenizer(text):
    """Split a tweet into tokens, turning emoji into textual names first.

    ``emoji.demojize`` is called with a space on each side as delimiters, so
    every emoji is replaced by its name surrounded by spaces. The result is
    then scanned for runs of two or more word characters, or for
    ``:name:``-style sequences already present in the text.

    Returns the list of matched tokens, in order of appearance.
    """
    demojized = emoji.demojize(text, delimiters=(" ", " "))
    return re.findall(r'\w{2,}|\:\w+\:', demojized)


In [None]:
# Load the three splits (train / test / validation) from their CSV files.
train_df, test_df, val_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/train.csv',
        '/content/test.csv',
        '/content/validation.csv',
    )
)


In [None]:
# Prepare every split in place: integer labels plus a cleaned copy of the tweet text.
for df in (train_df, test_df, val_df):
    # Cast the label column to int so all splits share the same label dtype.
    df['label'] = df['label'].astype(int)
    # Keep the raw 'tweet' column untouched; the cleaned text goes in a new column.
    df['clean_tweet'] = df['tweet'].map(clean_tweet)


In [None]:
# Features (cleaned tweet text) and targets (integer label) for each split.
X_train_text, y_train = train_df['clean_tweet'], train_df['label']
X_test_text, y_test = test_df['clean_tweet'], test_df['label']
X_val_text, y_val = val_df['clean_tweet'], val_df['label']


In [None]:
# 'balanced' class weights computed from the training labels, then keyed by
# label so they can be passed to the classifier as a dict.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train,
)
weights_dict = {
    label: weight
    for label, weight in zip(np.unique(y_train), class_weights)
}


In [None]:
# Model: TF-IDF features -> truncated SVD -> RBF-kernel SVM with class weights.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        tokenizer=custom_tokenizer,
        # token_pattern is ignored when a tokenizer is supplied; setting it to
        # None avoids the "token_pattern will not be used" warning on every fit.
        token_pattern=None,
        lowercase=True,
        max_features=5000,
        stop_words='english',
    )),
    ('svd', TruncatedSVD(n_iter=7, random_state=42)),
    ('svc', SVC(class_weight=weights_dict, kernel='rbf', random_state=42))
])

# Hyper-parameters explored by the grid search (3 x 3 x 3 = 27 combinations).
param_grid = {
    'svd__n_components': [100, 200, 300],
    'svc__C': [0.1, 1, 10],
    'svc__gamma': ['scale', 0.01, 0.001]
}

# 3-fold cross-validated search on the training split, selecting on accuracy.
grid = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)
grid.fit(X_train_text, y_train)


In [None]:
# Score the refitted best pipeline on all three splits.
best_model = grid.best_estimator_
y_train_pred, y_test_pred, y_val_pred = (
    best_model.predict(split_text)
    for split_text in (X_train_text, X_test_text, X_val_text)
)

# Report the selected hyper-parameters and the accuracy per split.
print("✅ Meilleurs paramètres :", grid.best_params_)
print(f"✅ Accuracy (Train) : {accuracy_score(y_train, y_train_pred):.4f}")
print(f"✅ Accuracy (Test)  : {accuracy_score(y_test, y_test_pred):.4f}")
print(f"✅ Accuracy (Val)   : {accuracy_score(y_val, y_val_pred):.4f}")


In [None]:
# Display names for labels 0, 1 and 2, in that order.
# NOTE(review): assumes 0=positive, 1=neutral, 2=negative — confirm against the dataset's label encoding.
target_names = ['positive', 'neutral', 'negative']
print("\n📊 Rapport (Test) :")
# Passing labels explicitly pins each name to its label id (same order as the
# confusion matrix) and keeps the report working even if a class is missing
# from the test labels or predictions.
print(classification_report(y_test, y_test_pred, labels=[0, 1, 2], target_names=target_names))


In [None]:
# Confusion matrix on the test split: rows are true labels, columns are predictions.
cm = confusion_matrix(y_test, y_test_pred, labels=[0, 1, 2])

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=target_names,
    yticklabels=target_names,
    ax=ax,
)
ax.set_title('Matrice de confusion - Test set')
ax.set_xlabel('Prédit')
ax.set_ylabel('Réel')
plt.show()


In [None]:
# Show a few randomly chosen test tweets with their true and predicted labels.
inv_label = {0: 'positive', 1: 'neutral', 2: 'negative'}
# Seeded generator so the same examples are shown on every run; the sample size
# is capped so a test set with fewer than 5 rows does not raise.
sample_rng = np.random.default_rng(42)
n_examples = min(5, len(test_df))
for idx in sample_rng.choice(len(test_df), size=n_examples, replace=False):
    # idx is positional, so it addresses the same row in test_df, y_test and y_test_pred.
    print("Tweet :", test_df['tweet'].iloc[idx])
    print("Vrai label :", inv_label[y_test.iloc[idx]], "| Prédit :", inv_label[y_test_pred[idx]])
    print("-" * 50)


In [None]:
# 1. Inputs for the emoji-free run
def clean_no_emoji(text):
    """Clean a raw tweet for the ablation run.

    Applies exactly the same cleaning as ``clean_tweet`` (mentions and URLs
    removed, lower-cased, trimmed). Emoji are not stripped here; they drop out
    later because the ablation vectorizer uses the default ``token_pattern``,
    which only keeps runs of two or more word characters.
    """
    return clean_tweet(text)

X_train_no_emoji = train_df['tweet'].apply(clean_no_emoji)
X_test_no_emoji = test_df['tweet'].apply(clean_no_emoji)
X_val_no_emoji = val_df['tweet'].apply(clean_no_emoji)

# 2. Ablation pipeline
ablation_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(lowercase=True, max_features=5000, stop_words='english')),  # no custom tokenizer
    ('svd', TruncatedSVD(n_iter=7, random_state=42)),
    ('svc', SVC(kernel='rbf', random_state=42))  # no class_weight
])
# Reuse the hyper-parameters selected by the grid search so that the only
# differences from the main model are the tokenizer and the class weights.
# Without this, TruncatedSVD would fall back to its default n_components=2
# and SVC to its default C/gamma, which confounds the comparison.
ablation_pipeline.set_params(**grid.best_params_)

ablation_pipeline.fit(X_train_no_emoji, y_train)
y_pred_ablation = ablation_pipeline.predict(X_test_no_emoji)

print("\n🔬 Rapport d’ablation (sans emoji + sans pondération) :")
# labels pins each display name to its label id, matching the confusion matrix.
print(classification_report(y_test, y_pred_ablation, labels=[0, 1, 2], target_names=target_names))
print(f"Accuracy : {accuracy_score(y_test, y_pred_ablation):.4f}")
