In [None]:
!pip install emoji


In [None]:
import re

import emoji
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.regularizers import l2


In [None]:
def clean_tweet(text):
    """Normalise a raw tweet before it is tokenised.

    Processing order:
      1. remove @mentions (letters, digits and underscores after the ``@``),
      2. remove ``http://`` / ``https://`` URLs up to the next whitespace,
      3. lowercase and trim surrounding whitespace,
      4. replace every emoji by its textual name, delimited by spaces.

    Args:
        text: The raw tweet. Non-string values are coerced with ``str()``.
            Missing values (``None`` or a float NaN) yield an empty string.

    Returns:
        The cleaned tweet as a ``str``.
    """
    # A missing tweet would otherwise be stringified to "None"/"nan" and end
    # up in the vocabulary as if it were a real word. `x != x` is the
    # dependency-free NaN check.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    without_mentions = re.sub(r'@[A-Za-z0-9_]+', '', str(text))
    without_urls = re.sub(r'https?://\S+', '', without_mentions)
    normalised = without_urls.lower().strip()
    # Space delimiters keep the emoji name separated from adjacent words.
    return emoji.demojize(normalised, delimiters=(" ", " "))


In [None]:
# Directory that holds the three CSV splits. Kept in a single constant so the
# notebook can be pointed at another location by editing one line.
# NOTE(review): '/content' is presumably the Colab working directory — confirm.
DATA_DIR = '/content'

# One DataFrame per split; later cells read their `tweet` and `label` columns.
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
val_df = pd.read_csv(f'{DATA_DIR}/validation.csv')


In [None]:
# Apply the same preparation to every split, modifying each frame in place:
#   * `label` is cast to plain integers,
#   * `clean_tweet` is added as the cleaned version of the `tweet` column.
for split_df in (train_df, test_df, val_df):
    split_df['label'] = split_df['label'].astype(int)
    split_df['clean_tweet'] = split_df['tweet'].map(clean_tweet)


In [None]:
# Pull the model input (cleaned text) and the target (integer label) out of
# each split as a pair of pandas Series.
X_train_text, y_train = train_df['clean_tweet'], train_df['label']
X_test_text, y_test = test_df['clean_tweet'], test_df['label']
X_val_text, y_val = val_df['clean_tweet'], val_df['label']


In [None]:
max_words = 5000  # vocabulary cap handed to the Tokenizer (`num_words`)
max_len = 50      # every sequence is padded/truncated to this many tokens

# The vocabulary is learned from the training split only; the validation and
# test splits are encoded with that same vocabulary.
tokenizer = Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(X_train_text)


def _encode(texts):
    """Convert cleaned tweets into a fixed-width matrix of integer token ids."""
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=max_len)


X_train_pad = _encode(X_train_text)
X_test_pad = _encode(X_test_text)
X_val_pad = _encode(X_val_text)


In [None]:
# 'balanced' weights computed from the training labels, so that the classes
# contribute more evenly to the training loss.
label_ids = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=label_ids, y=y_train)

# Keras expects a {class id: weight} mapping.
weights_dict = {label: weight for label, weight in zip(label_ids, class_weights)}


In [None]:
# Size of each learned word vector.
embedding_dim = 100

# Baseline classifier: embedding -> LSTM -> dense head, regularised with both
# L2 weight penalties and dropout, ending in a 3-way softmax.
model = Sequential([
    # Explicit input spec: `max_len` integer token ids per sample. This
    # replaces Embedding's `input_length` argument, which is deprecated and
    # triggers a warning on recent Keras versions.
    Input(shape=(max_len,), dtype='int32'),
    Embedding(max_words, embedding_dim),
    # Only the final LSTM state is passed on (return_sequences=False).
    LSTM(128, return_sequences=False, kernel_regularizer=l2(0.01)),
    Dropout(0.5),
    Dense(64, activation='relu', kernel_regularizer=l2(0.01)),
    Dropout(0.5),
    Dense(3, activation='softmax')
])


In [None]:
# Integer labels + softmax output -> sparse categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Both callbacks watch the validation loss:
#   * halve the learning rate after 3 epochs without improvement (floor 1e-6),
#   * stop after 5 epochs without improvement and restore the best weights.
lr_scheduler = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    min_lr=1e-6,
)
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)


In [None]:
# Training configuration for the baseline run: up to 20 epochs (early stopping
# may end it sooner), class-weighted loss, validation on the held-out split.
fit_options = dict(
    epochs=20,
    batch_size=32,
    validation_data=(X_val_pad, y_val),
    class_weight=weights_dict,
    callbacks=[lr_scheduler, early_stopping],
    verbose=2,
)

# `history` keeps the per-epoch metrics that are plotted further down.
history = model.fit(X_train_pad, y_train, **fit_options)


In [None]:
# Class probabilities for every split, in the order train -> test -> val.
train_probs = model.predict(X_train_pad)
test_probs = model.predict(X_test_pad)
val_probs = model.predict(X_val_pad)

# The predicted class is the index of the highest softmax probability.
y_train_pred = np.argmax(train_probs, axis=1)
y_test_pred = np.argmax(test_probs, axis=1)
y_val_pred = np.argmax(val_probs, axis=1)

# Overall accuracy per split.
print(f"Train: {accuracy_score(y_train, y_train_pred):.4f}")
print(f"Test : {accuracy_score(y_test, y_test_pred):.4f}")
print(f"Val  : {accuracy_score(y_val, y_val_pred):.4f}")


In [None]:
# Display names for class ids 0, 1 and 2 (in that order).
# NOTE(review): the id -> name mapping is taken as given here; confirm it
# against the dataset's label encoding.
target_names = ['positive', 'neutral', 'negative']

# Passing `labels` explicitly pins the rows of the report to the three class
# ids. Without it, scikit-learn infers the classes from y_test/y_test_pred and
# raises if one of them never occurs, because `target_names` would then have
# the wrong length. It also matches the `labels=[0, 1, 2]` used for the
# confusion matrix.
print(classification_report(
    y_test,
    y_test_pred,
    labels=[0, 1, 2],
    target_names=target_names,
))


In [None]:
# Confusion matrix on the test split: rows are true classes, columns are
# predicted classes, both in the fixed order 0, 1, 2.
cm = confusion_matrix(y_test, y_test_pred, labels=[0, 1, 2])

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=target_names,
    yticklabels=target_names,
    ax=ax,
)
ax.set_title('Matrice de confusion - Test set')
ax.set_xlabel('Prédit')
ax.set_ylabel('Réel')
plt.show()


In [None]:
# Learning curves of the baseline run: accuracy on the left, loss on the
# right, each with the training and validation series.
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 4))

acc_ax.plot(history.history['accuracy'], label='Train Accuracy')
acc_ax.plot(history.history['val_accuracy'], label='Val Accuracy')
acc_ax.set_title('Accuracy over Epochs')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()

loss_ax.plot(history.history['loss'], label='Train Loss')
loss_ax.plot(history.history['val_loss'], label='Val Loss')
loss_ax.set_title('Loss over Epochs')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

fig.tight_layout()
plt.show()


In [None]:
# Class id -> display name, used to print readable labels.
# NOTE(review): mapping is taken as given; confirm against the dataset.
inv_label = {0: 'positive', 1: 'neutral', 2: 'negative'}

# Show a few random test tweets with their true and predicted label.
# A seeded generator makes the printed sample identical on every run, and
# capping the sample size avoids an error when the test split has fewer than
# five rows (sampling without replacement cannot exceed the population size).
sample_rng = np.random.default_rng(42)
n_examples = min(5, len(test_df))
for idx in sample_rng.choice(len(test_df), size=n_examples, replace=False):
    print("Tweet :", test_df['tweet'].iloc[idx])
    print("Vrai label :", inv_label[y_test.iloc[idx]], "| Prédit :", inv_label[y_test_pred[idx]])
    print("-" * 50)


In [None]:
# Ablation 1: same architecture as the baseline but without the two Dropout
# layers; L2 regularisation and class weights are kept.
model_no_dropout = Sequential([
    # Explicit input spec instead of Embedding's deprecated `input_length`.
    Input(shape=(max_len,), dtype='int32'),
    Embedding(max_words, embedding_dim),
    LSTM(128, return_sequences=False, kernel_regularizer=l2(0.01)),
    Dense(64, activation='relu', kernel_regularizer=l2(0.01)),
    Dense(3, activation='softmax')
])

model_no_dropout.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Same training settings as the baseline run; the callback objects defined
# for the baseline are reused here.
history_no_dropout = model_no_dropout.fit(
    X_train_pad, y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_val_pad, y_val),
    class_weight=weights_dict,
    callbacks=[lr_scheduler, early_stopping],
    verbose=2
)

# Evaluate on the test set
y_pred_no_dropout = np.argmax(model_no_dropout.predict(X_test_pad), axis=1)
print("Test accuracy sans Dropout :", accuracy_score(y_test, y_pred_no_dropout))


In [None]:
# Ablation 2: same architecture as the baseline but without the L2 kernel
# regularisers; dropout and class weights are kept.
model_no_l2 = Sequential([
    # Explicit input spec instead of Embedding's deprecated `input_length`.
    Input(shape=(max_len,), dtype='int32'),
    Embedding(max_words, embedding_dim),
    LSTM(128, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax')
])

model_no_l2.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Same training settings as the baseline run; the callback objects defined
# for the baseline are reused here.
history_no_l2 = model_no_l2.fit(
    X_train_pad, y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_val_pad, y_val),
    class_weight=weights_dict,
    callbacks=[lr_scheduler, early_stopping],
    verbose=2
)

# Evaluate on the test set
y_pred_no_l2 = np.argmax(model_no_l2.predict(X_test_pad), axis=1)
print("Test accuracy sans L2 :", accuracy_score(y_test, y_pred_no_l2))


In [None]:
# Ablation 3: identical architecture to the baseline (dropout + L2), but
# trained without class weights.
model_no_weight = Sequential([
    # Explicit input spec instead of Embedding's deprecated `input_length`.
    Input(shape=(max_len,), dtype='int32'),
    Embedding(max_words, embedding_dim),
    LSTM(128, return_sequences=False, kernel_regularizer=l2(0.01)),
    Dropout(0.5),
    Dense(64, activation='relu', kernel_regularizer=l2(0.01)),
    Dropout(0.5),
    Dense(3, activation='softmax')
])

model_no_weight.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Same training settings as the baseline run except that `class_weight` is
# omitted; the callback objects defined for the baseline are reused here.
history_no_weight = model_no_weight.fit(
    X_train_pad, y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_val_pad, y_val),
    callbacks=[lr_scheduler, early_stopping],  # no class_weight
    verbose=2
)

# Evaluate on the test set
y_pred_no_weight = np.argmax(model_no_weight.predict(X_test_pad), axis=1)
print("Test accuracy sans class_weight :", accuracy_score(y_test, y_pred_no_weight))
