# Preprocessing

In [1]:
import re
import string
import emoji
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import os
import re
import gensim.downloader as api
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from textblob import TextBlob
from tqdm import tqdm
tqdm.pandas(desc='Preprocessing')
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification

In [None]:
# Fetch the NLTK corpora used by the preprocessing step (stopword list and WordNet)
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Load the 200-dimensional GloVe Twitter vectors through Gensim's downloader API
embeddings_model = api.load("glove-twitter-200")

# Function to compute the average word vector for a tweet
def get_avg_embedding(tweet, model, vector_size=200):
    """Return the mean embedding of the whitespace-separated tokens of ``tweet``.

    Args:
        tweet: Text to embed; it is split on whitespace.
        model: Mapping-like embedding lookup supporting ``token in model`` and
            ``model[token]``.
        vector_size: Length of the zero vector returned when no token is known.

    Returns:
        The element-wise mean of the vectors of all tokens found in ``model``,
        or ``np.zeros(vector_size)`` when none of the tokens is in the vocabulary.
    """
    known_vectors = []
    for token in tweet.split():
        if token in model:
            known_vectors.append(model[token])

    # No in-vocabulary token: fall back to a zero vector
    if len(known_vectors) == 0:
        return np.zeros(vector_size)
    return np.mean(known_vectors, axis=0)


# Lazily-built NLTK resources shared by every preprocess_tweet call
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the (stop word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_tweet(tweet):
    """Normalize a raw tweet into a space-separated string of lemmatized words.

    Steps: lowercase, strip URLs and @mentions, keep hashtag text without the
    '#', drop every character that is not a-z or whitespace, remove English
    stopwords, lemmatize each remaining word.

    Args:
        tweet: Raw tweet text. ``None``/NaN yield an empty string; any other
            non-string value is converted with ``str()``.

    Returns:
        The cleaned tweet as a single string (possibly empty).
    """
    if not isinstance(tweet, str):
        # Missing values (None / NaN, e.g. from an empty CSV cell) become an empty tweet
        if tweet is None or tweet is pd.NA or (isinstance(tweet, float) and tweet != tweet):
            return ''
        tweet = str(tweet)

    # The stopword list and lemmatizer are built once, not once per tweet
    stop_words, lemmatizer = _get_preprocess_resources()

    # Lowercasing
    tweet = tweet.lower()

    # Remove URLs
    tweet = re.sub(r'http\S+|www\.\S+', '', tweet)

    # Remove mentions (@username)
    tweet = re.sub(r'@\w+', '', tweet)

    # Remove hashtags (keep the text after the #)
    tweet = re.sub(r'#(\w+)', r'\1', tweet)

    # Remove special characters, punctuation, and numbers.
    # NOTE: this also removes every emoji, which is why no emoji.demojize call
    # follows: after this filter there is nothing left for it to convert. To keep
    # emoji names as words, demojize would have to run BEFORE this filter.
    tweet = re.sub(r'[^a-z\s]', '', tweet)

    # Tokenization, stopword removal and lemmatization
    words = [lemmatizer.lemmatize(word) for word in tweet.split() if word not in stop_words]

    # Final cleanup: remove redundant spaces
    return re.sub(r'\s+', ' ', ' '.join(words)).strip()

In [None]:
# Read all training files and concatenate them into one dataframe.
# Only *.csv entries are read (skips stray files such as .DS_Store or
# .ipynb_checkpoints), in sorted order so the row order is reproducible.
li = [
    pd.read_csv(os.path.join("train_tweets", filename))
    for filename in sorted(os.listdir("train_tweets"))
    if filename.lower().endswith(".csv")
]
df = pd.concat(li, ignore_index=True)

# Train on a small sample (uncomment for quick experiments)
# df = df.sample(n=1000000, random_state=42)

# Apply preprocessing to each tweet
df['Tweet'] = df['Tweet'].progress_apply(preprocess_tweet)

# Add a feature for sentiment using TextBlob
df['sentiment'] = df['Tweet'].progress_apply(lambda x: TextBlob(x).sentiment.polarity)

In [None]:
# Persist the preprocessed tweets. index=False keeps the RangeIndex out of the
# file; otherwise it comes back as an extra "Unnamed: 0" column on read_csv.
df.to_csv('data_preprocess.csv', index=False)

# Concaténation

In [16]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

def get_avg_embedding(tweet, model, vector_size=200):
    """Average the embeddings of the in-vocabulary words of ``tweet``.

    ``tweet`` is tokenized on whitespace and words missing from ``model`` are
    skipped. If no word is found in ``model``, a zero vector of length
    ``vector_size`` is returned.
    """
    in_vocab = [token for token in tweet.split() if token in model]
    if in_vocab:
        return np.mean([model[token] for token in in_vocab], axis=0)
    # None of the words is known to the model
    return np.zeros(vector_size)

In [21]:
# Function to subdivide data into 20 intervals and concatenate embeddings
def _concatenate_period_embeddings(tweets, num_subdivisions, embeddings_model, vector_size):
    """Return the per-subdivision mean embeddings of one period, concatenated.

    The result always has ``num_subdivisions * vector_size`` entries: a
    subdivision without any usable tweet contributes a zero vector.
    """
    tweets = list(tweets)
    # At least one tweet per subdivision so short periods are not discarded
    subdivision_size = max(len(tweets) // num_subdivisions, 1)

    parts = []
    for i in range(num_subdivisions):
        # Slicing clamps at the end of the list, so late subdivisions may be empty
        chunk = tweets[i * subdivision_size:(i + 1) * subdivision_size]

        embeddings = []
        for tweet in chunk:
            try:
                embeddings.append(get_avg_embedding(tweet, embeddings_model, vector_size))
            except Exception as e:
                # Best effort: report the tweet and keep processing the others
                print(f"Error processing tweet: {tweet} | Error: {e}")

        if embeddings:
            parts.append(np.mean(np.vstack(embeddings), axis=0))
        else:
            # Pad so every period yields a vector of identical length
            parts.append(np.zeros(vector_size))

    return np.concatenate(parts)


def create_subdivisions_with_concatenation(df, num_subdivisions=20, embeddings_model=None):
    """Build one fixed-length embedding row per (MatchID, PeriodID) group.

    The tweets of each period are cut, in row order, into ``num_subdivisions``
    consecutive chunks of ``len(group) // num_subdivisions`` tweets (tweets
    beyond ``num_subdivisions`` full chunks are not used). The mean embedding of
    each chunk is computed and the chunk means are concatenated.

    Periods with fewer tweets than ``num_subdivisions`` are kept: each tweet
    forms its own chunk and the missing chunks are zero vectors, so every row
    has the same feature length and can be stacked with ``np.vstack``.

    Args:
        df: Frame with columns ``MatchID``, ``PeriodID``, ``ID``, ``Tweet`` and
            ``EventType``. It is not modified.
        num_subdivisions: Number of chunks per period (must be >= 1).
        embeddings_model: Word-vector lookup passed to ``get_avg_embedding``.

    Returns:
        DataFrame with columns ``MatchID``, ``PeriodID``, ``ID`` (first ID of the
        period), ``ConcatenatedEmbeddings`` and ``EventType`` (first value of
        the period).

    Raises:
        ValueError: If ``embeddings_model`` is None or ``num_subdivisions`` < 1.
    """
    if embeddings_model is None:
        raise ValueError("embeddings_model is required to compute tweet embeddings")
    if num_subdivisions < 1:
        raise ValueError("num_subdivisions must be a positive integer")

    # Use the model's own dimensionality when it exposes one; 200 matches the
    # default of get_avg_embedding.
    vector_size = getattr(embeddings_model, 'vector_size', 200)

    # Missing tweets (NaN after a CSV round-trip) must become '' and not the
    # literal word "nan". Work on a copy so the caller's frame is left untouched.
    working_df = df.assign(Tweet=df['Tweet'].fillna('').astype(str))

    subdivided_data = []
    for (match_id, period_id), group in working_df.groupby(['MatchID', 'PeriodID']):
        subdivided_data.append({
            'MatchID': match_id,
            'PeriodID': period_id,
            'ID': group['ID'].iloc[0],  # Keep the first ID
            'ConcatenatedEmbeddings': _concatenate_period_embeddings(
                group['Tweet'], num_subdivisions, embeddings_model, vector_size
            ),
            'EventType': group['EventType'].iloc[0]  # Assuming same EventType for the whole period
        })

    return pd.DataFrame(subdivided_data)

# Reload the preprocessed tweets written to 'data_preprocess.csv' by the preprocessing section
df = pd.read_csv('data_preprocess.csv')

# Build one feature row per (MatchID, PeriodID); relies on `embeddings_model`
# having been loaded earlier in this notebook
df_subdivided = create_subdivisions_with_concatenation(df, num_subdivisions=20, embeddings_model=embeddings_model)

In [19]:
# Store each embedding as a full Python list. Writing the NumPy arrays directly
# would save their summarized repr (elided with "..." for large arrays), which
# cannot be parsed back. To reload:
#   df = pd.read_csv('data_subdivisee.csv')
#   df['ConcatenatedEmbeddings'] = df['ConcatenatedEmbeddings'].map(lambda s: np.array(json.loads(s)))
df_subdivided.assign(
    ConcatenatedEmbeddings=df_subdivided['ConcatenatedEmbeddings'].map(
        lambda vec: np.asarray(vec).tolist()
    )
).to_csv('data_subdivisee.csv', index=False)

# Random Forest

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

def train_random_forest_classifier_with_grid_search(df):
    """Grid-search a RandomForestClassifier on the concatenated period embeddings.

    Args:
        df: Frame with columns ``ConcatenatedEmbeddings`` (one equal-length
            vector per row), ``PeriodID`` and ``EventType``.

    Returns:
        Tuple ``(best_clf, label_encoder, best_params)``: the best estimator
        found by the grid search, the LabelEncoder fitted on ``EventType`` and
        the winning parameter dict.
    """
    # Feature matrix: stacked embeddings with PeriodID appended as the last column
    # (4001 columns when the embeddings are 20 x 200 values)
    feature_matrix = np.hstack([
        np.vstack(df['ConcatenatedEmbeddings'].values),
        df['PeriodID'].values.reshape(-1, 1),
    ])

    # Integer-encode the EventType labels
    label_encoder = LabelEncoder()
    targets = label_encoder.fit_transform(df['EventType'])

    # Hold out 20% of the rows for the final evaluation
    X_train, X_test, y_train, y_test = train_test_split(
        feature_matrix, targets, test_size=0.2, random_state=42
    )

    # Exhaustive search over a deliberately small grid, scored by 5-fold CV accuracy
    grid_search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid={
            'n_estimators': [100, 150],
            'max_depth': [20, None],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [3, 5],
            'max_features': ['sqrt'],
        },
        scoring='accuracy',
        cv=5,
        verbose=2,   # per-fit progress output
        n_jobs=-1,   # use all available processors
    )
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    best_clf = grid_search.best_estimator_
    print(f"Best Parameters: {best_params}")

    # Report hold-out performance of the selected model
    y_pred = best_clf.predict(X_test)
    print(f"Optimized Random Forest Classifier Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    return best_clf, label_encoder, best_params

# Step 2: Train the Random Forest Classifier with Grid Search
# (Optional reload. Note that a CSV round-trip returns ConcatenatedEmbeddings as
# strings, which would have to be parsed back into arrays before training.)
# df_subdivided = pd.read_csv('data_subdivisee.csv')

rf, label_encoder, best_params = train_random_forest_classifier_with_grid_search(df_subdivided)

In [None]:
# Load the preprocessed evaluation tweets
eval_df = pd.read_csv('eval_preprocess.csv')


def _eval_period_feature_vector(tweets, num_subdivisions, embeddings_model, vector_size):
    """Concatenate the mean embedding of each subdivision of one evaluation period.

    Always returns ``num_subdivisions * vector_size`` values; subdivisions with
    no usable tweet are represented by a zero vector.
    """
    tweets = list(tweets)
    # Never let the chunk size fall to 0, otherwise short periods would vanish
    chunk_size = max(len(tweets) // num_subdivisions, 1)

    chunk_means = []
    for start in range(0, num_subdivisions * chunk_size, chunk_size):
        vectors = []
        for tweet in tweets[start:start + chunk_size]:  # slice is clamped at the end
            try:
                vectors.append(get_avg_embedding(tweet, embeddings_model, vector_size))
            except Exception as e:
                # Best effort: report the tweet and continue with the rest
                print(f"Error processing tweet: {tweet} | Error: {e}")

        chunk_means.append(
            np.mean(np.vstack(vectors), axis=0) if vectors else np.zeros(vector_size)
        )

    return np.concatenate(chunk_means)


def prepare_eval_data_with_concatenation(eval_df, num_subdivisions=20, embeddings_model=None):
    """Subdivide each evaluation period and concatenate the subdivision embeddings.

    Each (MatchID, PeriodID) group is cut, in row order, into
    ``num_subdivisions`` consecutive chunks of
    ``len(group) // num_subdivisions`` tweets; the chunk mean embeddings are
    concatenated into one vector.

    Every period produces exactly one row. Periods with fewer tweets than
    ``num_subdivisions`` are zero-padded instead of being dropped, so no ID
    is missing from the prediction file and all vectors share one length.

    Args:
        eval_df: Frame with columns ``MatchID``, ``PeriodID``, ``ID`` and
            ``Tweet``. It is not modified.
        num_subdivisions: Number of chunks per period (must be >= 1).
        embeddings_model: Word-vector lookup passed to ``get_avg_embedding``.

    Returns:
        DataFrame with columns ``ID`` (first ID of the period),
        ``ConcatenatedEmbeddings`` and ``PeriodID``.

    Raises:
        ValueError: If ``embeddings_model`` is None or ``num_subdivisions`` < 1.
    """
    if embeddings_model is None:
        raise ValueError("embeddings_model is required to compute tweet embeddings")
    if num_subdivisions < 1:
        raise ValueError("num_subdivisions must be a positive integer")

    # Use the model's own dimensionality when it exposes one; 200 matches the
    # default of get_avg_embedding.
    vector_size = getattr(embeddings_model, 'vector_size', 200)

    # Missing tweets (NaN from the CSV) become '' rather than the literal word
    # "nan". A copy is used so the caller's frame is not mutated.
    working_df = eval_df.assign(Tweet=eval_df['Tweet'].fillna('').astype(str))

    prepared_data = []
    for (match_id, period_id), group in working_df.groupby(['MatchID', 'PeriodID']):
        prepared_data.append({
            'ID': group['ID'].iloc[0],  # Keep the first ID
            'ConcatenatedEmbeddings': _eval_period_feature_vector(
                group['Tweet'], num_subdivisions, embeddings_model, vector_size
            ),
            'PeriodID': period_id
        })

    return pd.DataFrame(prepared_data)

eval_prepared = prepare_eval_data_with_concatenation(eval_df, num_subdivisions=20, embeddings_model=embeddings_model)

# Prepare features: stack the embeddings and append PeriodID as the last column,
# mirroring the feature layout used for training
embeddings = np.vstack(eval_prepared['ConcatenatedEmbeddings'].values)
period_ids = eval_prepared['PeriodID'].values.reshape(-1, 1)  # Reshape to match dimensions for concatenation
X_eval = np.hstack([embeddings, period_ids])

# The forest was trained on LabelEncoder-encoded targets, so decode its
# predictions back to the original EventType values (as the MLP section does)
predicted_labels_rf = label_encoder.inverse_transform(rf.predict(X_eval))

# Add the predictions to the DataFrame
eval_prepared['EventType'] = predicted_labels_rf

# Keep only the ID and EventType columns for the export
result_df = eval_prepared[['ID', 'EventType']]

# Export the result as CSV
result_df.to_csv('eval_predictions_rf.csv', index=False)

# The message now names the file that is actually written
print("Les prédictions ont été enregistrées dans 'eval_predictions_rf.csv'.")

# MLP

In [39]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import pandas as pd

def train_mlp_classifier_with_grid_search(df):
    """Grid-search an MLPClassifier on standardized concatenated embeddings.

    The scaler is fitted on the training split only and then applied to the
    hold-out split, so the reported accuracy is not influenced by statistics of
    the hold-out rows.

    Args:
        df: Frame with columns ``ConcatenatedEmbeddings`` (one equal-length
            vector per row), ``PeriodID`` and ``EventType``.

    Returns:
        Tuple ``(best_clf, label_encoder, scaler, best_params)``. ``best_clf``
        expects inputs transformed with ``scaler``; its predictions are encoded
        labels to be decoded with ``label_encoder``.
    """
    # Local import: this section's import cell does not bring in GridSearchCV,
    # so the function would otherwise depend on the Random Forest cell having run.
    from sklearn.model_selection import GridSearchCV

    # Prepare features: stacked embeddings plus PeriodID as the last column
    # (4001 columns when the embeddings are 20 x 200 values)
    embeddings = np.vstack(df['ConcatenatedEmbeddings'].values)
    period_ids = df['PeriodID'].values.reshape(-1, 1)
    X = np.hstack([embeddings, period_ids])

    # Encode labels
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(df['EventType'])

    # Split BEFORE scaling so the hold-out rows do not leak into the scaler
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

    # Standardize the features (important for MLPs): fit on train, apply to test
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Define the MLP Classifier
    clf = MLPClassifier(random_state=42, max_iter=500)

    # Define the parameter grid for GridSearchCV
    param_grid = {
        'hidden_layer_sizes': [(2048, 512, 128, 64), (4000, 1000, 500, 200)],  # Different layer configurations
        'activation': ['relu'],                   # Activation functions
        'solver': ['adam'],                        # Solvers
        # 'learning_rate_init': [0.001, 0.01],              # Initial learning rates
        # 'alpha': [0.0001, 0.001],                         # L2 regularization parameter
    }

    # Set up the GridSearchCV
    grid_search = GridSearchCV(
        estimator=clf,
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,  # 5-fold cross-validation
        verbose=2,  # Increase output verbosity for debugging
        n_jobs=-1   # Use all available processors
    )

    # Perform the grid search
    grid_search.fit(X_train, y_train)

    # Get the best estimator
    best_clf = grid_search.best_estimator_
    print(f"Best Parameters: {grid_search.best_params_}")

    # Evaluate the classifier with the best parameters
    y_pred = best_clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Optimized MLP Classifier Accuracy: {accuracy:.2f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    return best_clf, label_encoder, scaler, grid_search.best_params_

# Example usage:
# (Optional reload. A CSV round-trip returns ConcatenatedEmbeddings as strings
# that must be parsed back into arrays first.)
# df_subdivided = pd.read_csv('data_subdivisee.csv')
# NOTE: this rebinds `label_encoder` and `best_params`, which the Random Forest
# section also assigns.
mlp, label_encoder, scaler, best_params = train_mlp_classifier_with_grid_search(df_subdivided)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best Parameters: {'activation': 'relu', 'hidden_layer_sizes': (4000, 1000, 500, 200), 'solver': 'adam'}
Optimized MLP Classifier Accuracy: 1.00
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1



In [41]:
# Prepare features for evaluation
embeddings = np.vstack(eval_prepared['ConcatenatedEmbeddings'].values)
period_ids = eval_prepared['PeriodID'].values.reshape(-1, 1)  # Reshape to match dimensions for concatenation

# Concatenate PeriodID as an additional feature
X_eval = np.hstack([embeddings, period_ids])

# Standardize the evaluation data with the scaler returned by training: the MLP
# was fitted on standardized features, so raw inputs would not be on the scale
# the network learned from
X_eval = scaler.transform(X_eval)

# Predict labels using the trained MLP model
predicted_labels_encoded = mlp.predict(X_eval)

# Decode the predicted labels back to original class names
predicted_labels_mlp = label_encoder.inverse_transform(predicted_labels_encoded)

# Add the predictions to the DataFrame
eval_prepared['EventType'] = predicted_labels_mlp

# Keep only the ID and EventType columns for the export
result_df = eval_prepared[['ID', 'EventType']]

# Export the result as CSV
result_df.to_csv('eval_predictions_mlp.csv', index=False)

print("Les prédictions ont été enregistrées dans 'eval_predictions_mlp.csv'.")


Les prédictions ont été enregistrées dans 'eval_predictions_mlp.csv'.
