# Preprocessing

In [1]:
import re
import string
import emoji
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import os
import re
import gensim.downloader as api
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from textblob import TextBlob
from tqdm import tqdm
tqdm.pandas(desc='Preprocessing')
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification

In [None]:
# Fetch the NLTK corpora used by preprocess_tweet (stop-word list and WordNet)
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Load the 200-dimensional Twitter GloVe vectors through Gensim's downloader API
embeddings_model = api.load("glove-twitter-200")

# Function to compute the average word vector for a tweet
def get_avg_embedding(tweet, model, vector_size=200):
    """Return the mean embedding of the whitespace-separated words in ``tweet``.

    Args:
        tweet: Text to embed. Anything that is not a ``str`` (for example the
            NaN that pandas produces when an empty tweet is read back from
            CSV) is treated as a tweet with no words.
        model: Mapping-like object supporting ``word in model`` and
            ``model[word]``.
        vector_size: Length of the zero vector returned when no word of the
            tweet is found in ``model``.

    Returns:
        A 1-D NumPy array: the element-wise mean of the vectors of all known
        words, or ``np.zeros(vector_size)`` when there are none.
    """
    # Non-string input has no .split(); return the "no known words" result
    # instead of raising AttributeError.
    if not isinstance(tweet, str):
        return np.zeros(vector_size)

    word_vectors = [model[word] for word in tweet.split() if word in model]
    if not word_vectors:  # no in-vocabulary word at all
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)


# Resources shared by every preprocess_tweet call. They are filled lazily on
# first use so the NLTK stop-word list and the lemmatizer are built once
# instead of once per tweet.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_tweet(tweet):
    """Normalise a raw tweet into a space-separated string of lemmatised words.

    Steps, in order: emojis are replaced by their English names, the text is
    lower-cased, URLs and @mentions are removed, the ``#`` of hashtags is
    dropped (the tag text is kept), every character other than ``a-z`` and
    whitespace is removed, English stop words are filtered out and the
    remaining words are lemmatised.

    Args:
        tweet: Raw tweet text. Non-string values (e.g. NaN for a missing
            tweet) yield an empty string.

    Returns:
        The cleaned tweet, words separated by single spaces; may be ``''``.
    """
    if not isinstance(tweet, str):
        return ''

    stop_words, lemmatizer = _preprocess_resources()

    # Emoji -> words. This has to happen BEFORE the a-z filter below: that
    # filter deletes every non-letter character, so demojizing afterwards
    # never had an emoji left to convert.
    tweet = emoji.demojize(tweet, delimiters=(' :', ': '))
    # Turn each ":some_emoji-name:" token into plain words ("some emoji name").
    tweet = re.sub(r':([^:\s]+):', lambda m: re.sub(r'[_-]', ' ', m.group(1)), tweet)

    # Lowercasing
    tweet = tweet.lower()

    # Remove URLs
    tweet = re.sub(r'http\S+|www\.\S+', '', tweet)

    # Remove mentions (@username)
    tweet = re.sub(r'@\w+', '', tweet)

    # Remove the hashtag marker (keep the text after the #)
    tweet = re.sub(r'#(\w+)', r'\1', tweet)

    # Remove special characters, punctuation, and numbers
    tweet = re.sub(r'[^a-z\s]', '', tweet)

    # Tokenize on whitespace, drop stop words, lemmatize what is left
    words = [lemmatizer.lemmatize(word) for word in tweet.split() if word not in stop_words]

    # split() + join() already collapses repeated whitespace and trims the ends
    return ' '.join(words)

In [None]:
# Read all training files and concatenate them into one dataframe.
# - Files are visited in sorted order: os.listdir returns entries in arbitrary
#   order, and the later position-based subdivisions (group.iloc[...]) depend
#   on row order, so sorting keeps runs comparable across machines.
# - Only regular *.csv files are read, so stray entries such as checkpoint
#   folders or hidden files do not crash pd.read_csv.
train_frames = []
for filename in sorted(os.listdir("train_tweets")):
    filepath = os.path.join("train_tweets", filename)
    if os.path.isfile(filepath) and filename.lower().endswith(".csv"):
        train_frames.append(pd.read_csv(filepath))
df = pd.concat(train_frames, ignore_index=True)

# Train on a small sample (uncomment for quick experiments)
# df = df.sample(n=1000000, random_state=42)

# Apply preprocessing to each tweet
df['Tweet'] = df['Tweet'].progress_apply(preprocess_tweet)

# Add a feature for sentiment using TextBlob
df['sentiment'] = df['Tweet'].progress_apply(lambda x: TextBlob(x).sentiment.polarity)

In [None]:
# Cache the preprocessed tweets. index=False keeps the RangeIndex out of the
# file; otherwise it comes back as a spurious "Unnamed: 0" column on reload.
df.to_csv('data_preprocess.csv', index=False)

# Subdivision and training

In [3]:
# Reload the cached preprocessing output.
df = pd.read_csv('data_preprocess.csv')
# Tweets that became empty during preprocessing are written as empty CSV
# fields and read back as NaN (a float); restore them to '' so that later
# tweet.split() calls keep working.
df['Tweet'] = df['Tweet'].fillna('')

In [None]:
def create_subdivisions(df, num_subdivisions=30, model=None):
    """Split each (MatchID, PeriodID) group into chunks and embed every chunk.

    Each non-empty chunk becomes one output row holding the mean tweet
    embedding and the mean sentiment of its tweets.

    Args:
        df: DataFrame with at least the columns ``MatchID``, ``PeriodID``,
            ``Tweet`` and ``sentiment``. ``EventType`` and ``ID`` are copied
            to the output when present, so unlabeled data can be processed
            with the same function.
        num_subdivisions: Maximum number of chunks per period.
        model: Word-embedding lookup passed to ``get_avg_embedding``. Defaults
            to the notebook-level ``embeddings_model``.

    Returns:
        DataFrame with one row per chunk and the columns ``MatchID``,
        ``PeriodID``, ``sentiment``, ``tweet_vectors`` plus any of ``ID`` /
        ``EventType`` that exist in ``df``.
    """
    if model is None:
        model = embeddings_model  # notebook-level GloVe model

    optional_columns = [col for col in ('ID', 'EventType') if col in df.columns]
    subdivided_data = []

    # Iterate through each period
    for (match_id, period_id), group in df.groupby(['MatchID', 'PeriodID']):
        # np.array_split assigns EVERY row to a chunk (sizes differ by at most
        # one). The previous floor-division slicing dropped the trailing
        # len(group) % num_subdivisions tweets and produced no rows at all for
        # periods with fewer tweets than num_subdivisions.
        for positions in np.array_split(np.arange(len(group)), num_subdivisions):
            if positions.size == 0:  # fewer tweets than chunks
                continue

            subdivision_tweets = group.iloc[positions]

            # Missing tweets (NaN after a CSV round-trip) count as empty text
            # rather than raising inside the embedding helper.
            tweets = ['' if pd.isna(tweet) else str(tweet) for tweet in subdivision_tweets['Tweet']]
            embeddings = np.vstack([get_avg_embedding(tweet, model) for tweet in tweets])

            row = {
                'MatchID': match_id,
                'PeriodID': period_id,
                'sentiment': subdivision_tweets['sentiment'].mean(),  # Average sentiment
                'tweet_vectors': embeddings.mean(axis=0),  # Average embedding of the chunk
            }
            # Period-level values: taken from the first tweet of the period
            for col in optional_columns:
                row[col] = group[col].iloc[0]
            subdivided_data.append(row)

    # Convert the list of subdivided data into a DataFrame
    return pd.DataFrame(subdivided_data)

# Turn the tweet-level frame into one row per subdivision
df_subdivided = create_subdivisions(df)

# Report how many training samples that produced
print(f"Number of rows in the subdivided dataset: {len(df_subdivided)}")

# Embedding block: one row per subdivision, one column per embedding dimension
tweet_vectors = np.vstack(df_subdivided['tweet_vectors'].to_numpy())

# Two scalar features, each shaped as a single column so they can be appended
period_id_feature = df_subdivided[['PeriodID']].to_numpy()
sentiment_feature = df_subdivided[['sentiment']].to_numpy()

# Final design matrix: [embedding | PeriodID | sentiment]
X = np.concatenate((tweet_vectors, period_id_feature, sentiment_feature), axis=1)

# Show the resulting matrix dimensions
print(f"Shape of X: {X.shape}")

# Target vector
y = df_subdivided['EventType'].to_numpy()

# Neural network

In [None]:
# Hold out 1% of the subdivision rows for a quick sanity check.
# NOTE(review): create_subdivisions emits several rows per (MatchID, PeriodID),
# and this random split can place rows of one period on both sides, so the
# test accuracy is likely optimistic. Confirm whether a grouped split is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.01, random_state=42
)

# Learn mean/variance on the training rows only, then apply to both parts
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# PyTorch views of the data: float32 features, int64 class indices
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Define the PyTorch MLP model with LeakyReLU
class MLP(nn.Module):
    """Fully connected classifier: input -> 200 -> 100 -> 50 -> 25 -> classes.

    ``forward`` returns raw, unnormalised class scores (logits). That is what
    ``nn.CrossEntropyLoss`` expects, because the loss applies log-softmax
    itself; feeding it softmax outputs squashes the gradients. ``argmax``
    over logits selects the same class as ``argmax`` over probabilities, and
    ``predict_proba`` is available when probabilities are needed.

    Args:
        input_dim: Number of input features (default 202: a 200-d embedding
            plus PeriodID and sentiment, as built in this notebook).
        num_classes: Number of output classes (default 2).
    """

    def __init__(self, input_dim=202, num_classes=2):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 200)
        self.fc2 = nn.Linear(200, 100)
        self.fc3 = nn.Linear(100, 50)
        self.fc4 = nn.Linear(50, 25)
        self.fc5 = nn.Linear(25, num_classes)  # one score per class
        self.leaky_relu = nn.LeakyReLU(negative_slope=0.01)  # LeakyReLU activation
        self.softmax = nn.Softmax(dim=1)  # used by predict_proba only

    def forward(self, x):
        """Return logits of shape ``(batch, num_classes)`` for input ``x``."""
        x = self.leaky_relu(self.fc1(x))
        x = self.leaky_relu(self.fc2(x))
        x = self.leaky_relu(self.fc3(x))
        x = self.leaky_relu(self.fc4(x))
        return self.fc5(x)

    def predict_proba(self, x):
        """Return class probabilities (softmax over the logits) for ``x``."""
        return self.softmax(self.forward(x))

# Seed PyTorch so weight initialisation, and therefore the whole training run,
# is repeatable from a fresh kernel.
torch.manual_seed(42)

# Initialize the model, loss function, and optimizer
model = MLP()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Add a scheduler to adjust the learning rate (x0.1 every 10 epochs)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

# Training loop with scheduler
epochs = 40
batch_size = 32
for epoch in range(epochs):
    model.train()
    running_loss = 0.0  # sum of per-sample losses over the epoch
    for i in range(0, len(X_train_tensor), batch_size):
        # Batch processing
        X_batch = X_train_tensor[i:i+batch_size]
        y_batch = y_train_tensor[i:i+batch_size]

        # Forward pass
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so the
        # (possibly shorter) last batch is accounted for correctly
        running_loss += loss.item() * len(X_batch)

    # Step the scheduler
    scheduler.step()

    # Print the mean training loss of the epoch every 10 epochs (previously
    # only the loss of the very last mini-batch was shown)
    if (epoch + 1) % 10 == 0:
        current_lr = optimizer.param_groups[0]['lr']  # Get current learning rate
        epoch_loss = running_loss / max(len(X_train_tensor), 1)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}, Learning Rate: {current_lr:.6f}")

# Evaluate the model
model.eval()
with torch.no_grad():
    y_pred_prob = model(X_test_tensor)
    # argmax picks the same class whether the model emits logits or probabilities
    y_pred = torch.argmax(y_pred_prob, dim=1).numpy()

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Test set accuracy:", accuracy)

In [None]:
################################## KAGGLE ###########################################

# Read the CSV files in eval_tweets and concatenate them into one DataFrame.
# - sorted(): os.listdir returns entries in arbitrary order; a fixed order
#   keeps the row order (and anything derived from it) reproducible.
# - only regular *.csv files are read.
# - each file goes straight into the list, so the notebook-level `df` (the
#   training frame) is no longer overwritten by the last evaluation file.
eval_data = []
for filename in sorted(os.listdir("eval_tweets")):
    filepath = os.path.join("eval_tweets", filename)
    if os.path.isfile(filepath) and filename.lower().endswith(".csv"):
        eval_data.append(pd.read_csv(filepath))

eval_df = pd.concat(eval_data, ignore_index=True)

# Apply the same preprocessing as for training and add the sentiment feature
eval_df['Tweet'] = eval_df['Tweet'].progress_apply(preprocess_tweet)
eval_df['sentiment'] = eval_df['Tweet'].progress_apply(lambda x: TextBlob(x).sentiment.polarity)

In [25]:
# Cache the preprocessed evaluation tweets. index=False keeps the RangeIndex
# out of the file so it does not come back as an "Unnamed: 0" column.
eval_df.to_csv('eval_preprocess.csv', index=False)

In [None]:
import torch
import torch.nn.functional as F
from collections import Counter

# Step 1: Create subdivisions for the test set
# NOTE(review): `create_subdivisions_test` is not defined anywhere in the
# visible notebook, so this cell raises NameError on a fresh kernel. It is
# presumably a variant of `create_subdivisions` that keeps `ID` and does not
# need `EventType` - restore or re-create that definition before running.
df_subdivided_test = create_subdivisions_test(eval_df)

# Step 2: Prepare the feature tensor for prediction
tweet_vectors_test = np.vstack(df_subdivided_test['tweet_vectors'].values)
period_id_feature_test = df_subdivided_test['PeriodID'].values.reshape(-1, 1)
sentiment_feature_test = df_subdivided_test['sentiment'].values.reshape(-1, 1)

# Combine features (same column order as the training matrix)
X_test = np.hstack([tweet_vectors_test, period_id_feature_test, sentiment_feature_test])

# The network was trained on StandardScaler-transformed features, so the
# evaluation features must go through the same fitted scaler; feeding raw
# values puts them on a different scale than the one the weights were fit to.
# Assumes `scaler` is still the one fitted in the neural-network training cell.
X_test = scaler.transform(X_test)

# Convert to PyTorch tensors
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

# Step 3: Predict labels for each subdivision using the neural network
# Assuming `model` is the trained PyTorch neural network
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    outputs = model(X_test_tensor)
    # argmax over the class dimension; an extra softmax is unnecessary because
    # it never changes which class has the highest score
    predicted_labels = torch.argmax(outputs, dim=1)

# Add predictions to the DataFrame
df_subdivided_test['PredictedEventType'] = predicted_labels.numpy()

# Step 4: Aggregate predictions to get one label per minute
def aggregate_predictions_nn(df):
    """Collapse per-subdivision predictions into one label per period.

    Args:
        df: DataFrame with the columns ``MatchID``, ``PeriodID``, ``ID`` and
            ``PredictedEventType`` (one row per subdivision).

    Returns:
        DataFrame with one row per ``(MatchID, PeriodID)`` group: ``ID`` is
        the group's first ID and ``PredictedEventType`` the most frequent
        predicted label (on a tie, the label that appears first wins).
    """
    per_minute = df.groupby(['MatchID', 'PeriodID'])

    # One record per minute: first ID of the group + majority-vote label
    records = [
        {
            'ID': minute_rows['ID'].iloc[0],
            'PredictedEventType': Counter(minute_rows['PredictedEventType']).most_common(1)[0][0],
        }
        for _, minute_rows in per_minute
    ]

    return pd.DataFrame(records)

# Apply aggregation: majority vote over the subdivisions of each (MatchID, PeriodID)
aggregated_predictions_nn = aggregate_predictions_nn(df_subdivided_test)

# Step 5: Output results - number of aggregated rows and a preview of the first ones
print(f"Number of aggregated predictions: {len(aggregated_predictions_nn)}")
print(aggregated_predictions_nn.head())

In [27]:
# Write the submission file. index=False matches the other prediction exports
# in this notebook and keeps the DataFrame index from being written as an
# extra, unnamed first column.
aggregated_predictions_nn.to_csv('submission.csv', index=False)

# Concaténation

In [16]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

def get_avg_embedding(tweet, model, vector_size=200):
    """Return the mean embedding of the whitespace-separated words in ``tweet``.

    Note: this re-defines the function of the same name from the
    preprocessing section so this part of the notebook can run on its own.

    Args:
        tweet: Text to embed. Anything that is not a ``str`` (for example the
            NaN that pandas produces when an empty tweet is read back from
            CSV) is treated as a tweet with no words.
        model: Mapping-like object supporting ``word in model`` and
            ``model[word]``.
        vector_size: Length of the zero vector returned when no word of the
            tweet is found in ``model``.

    Returns:
        A 1-D NumPy array: the element-wise mean of the vectors of all known
        words, or ``np.zeros(vector_size)`` when there are none.
    """
    # Non-string input has no .split(); return the "no known words" result
    # instead of raising AttributeError.
    if not isinstance(tweet, str):
        return np.zeros(vector_size)

    word_vectors = [model[word] for word in tweet.split() if word in model]
    if not word_vectors:  # no in-vocabulary word at all
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)

In [21]:
# Function to subdivide data into 20 intervals and concatenate embeddings
def create_subdivisions_with_concatenation(df, num_subdivisions=20, embeddings_model=None):
    """Build one fixed-length feature vector per (MatchID, PeriodID) group.

    The tweets of a period are cut into ``num_subdivisions`` consecutive
    chunks of ``len(group) // num_subdivisions`` tweets (at least one tweet
    per chunk). Each chunk is reduced to the mean of its tweet embeddings and
    the chunk vectors are concatenated. Chunks left without tweets (periods
    with fewer tweets than ``num_subdivisions``) contribute a zero vector, so
    every period yields a vector of the same length instead of being dropped.

    Args:
        df: DataFrame with the columns ``MatchID``, ``PeriodID``, ``ID``,
            ``Tweet`` and ``EventType``. It is not modified.
        num_subdivisions: Number of chunks per period.
        embeddings_model: Word-embedding lookup passed to
            ``get_avg_embedding``. Required.

    Returns:
        DataFrame with one row per period and the columns ``MatchID``,
        ``PeriodID``, ``ID``, ``ConcatenatedEmbeddings`` and ``EventType``.

    Raises:
        ValueError: If ``embeddings_model`` is None.
    """
    if embeddings_model is None:
        raise ValueError("embeddings_model must be provided")

    # Embedding width: read from the model when it exposes `vector_size`,
    # otherwise fall back to get_avg_embedding's default of 200.
    vector_size = getattr(embeddings_model, 'vector_size', 200)

    subdivided_data = []

    # Group by MatchID and PeriodID
    for (match_id, period_id), group in df.groupby(['MatchID', 'PeriodID']):
        # Work on a local list instead of rewriting df['Tweet'] in the
        # caller's frame. Missing tweets become '' (not the literal word
        # 'nan' that astype(str) would produce).
        tweets = ['' if pd.isna(tweet) else str(tweet) for tweet in group['Tweet']]
        subdivision_size = max(1, len(tweets) // num_subdivisions)

        chunk_embeddings = []
        for i in range(num_subdivisions):
            chunk = tweets[i * subdivision_size:(i + 1) * subdivision_size]
            if chunk:
                vectors = [get_avg_embedding(tweet, embeddings_model, vector_size) for tweet in chunk]
                chunk_embeddings.append(np.mean(np.vstack(vectors), axis=0))
            else:
                # Not enough tweets to fill this chunk: pad with zeros
                chunk_embeddings.append(np.zeros(vector_size))

        subdivided_data.append({
            'MatchID': match_id,
            'PeriodID': period_id,
            'ID': group['ID'].iloc[0],  # Keep the first ID
            'ConcatenatedEmbeddings': np.concatenate(chunk_embeddings),
            'EventType': group['EventType'].iloc[0]  # Assuming same EventType for the whole period
        })

    return pd.DataFrame(subdivided_data)

# Reload the cached preprocessed training tweets
df = pd.read_csv('data_preprocess.csv')

# One row per (MatchID, PeriodID) with the concatenated chunk embeddings.
# Note: this rebinds `df_subdivided`, which earlier in the notebook held the
# per-subdivision frame returned by create_subdivisions.
df_subdivided = create_subdivisions_with_concatenation(df, num_subdivisions=20, embeddings_model=embeddings_model)

In [19]:
# Persist the per-period feature frame.
# NOTE(review): the `ConcatenatedEmbeddings` column holds NumPy arrays, which
# CSV stores as their text representation; reading this file back will
# presumably not give usable arrays. Verify before relying on it as a cache.
df_subdivided.to_csv('data_subdivisee.csv')

# Random Forest

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

def train_random_forest_classifier_with_grid_search(df):
    """Grid-search a RandomForestClassifier on the concatenated period embeddings.

    Args:
        df: DataFrame with the columns ``ConcatenatedEmbeddings`` (one 1-D
            array per row), ``PeriodID`` and ``EventType``.

    Returns:
        Tuple ``(best_clf, label_encoder, best_params)``: the refitted best
        estimator, the ``LabelEncoder`` fitted on ``EventType`` and the
        winning hyper-parameter dict.
    """
    # Feature matrix: the stacked embeddings with PeriodID appended as the
    # last column (20 x 200 + 1 = 4001 columns with the notebook's settings)
    feature_matrix = np.column_stack([
        np.vstack(df['ConcatenatedEmbeddings'].values),
        df['PeriodID'].values,
    ])

    # Integer-encode the target
    label_encoder = LabelEncoder()
    targets = label_encoder.fit_transform(df['EventType'])

    # 80/20 hold-out split
    X_train, X_test, y_train, y_test = train_test_split(
        feature_matrix, targets, test_size=0.2, random_state=42
    )

    # Exhaustive search over a small grid, scored by 5-fold CV accuracy
    search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid={
            'n_estimators': [100, 150],  # deliberately small set of values
            'max_depth': [20, None],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [3, 5],
            'max_features': ['sqrt'],
        },
        scoring='accuracy',
        cv=5,        # 5-fold cross-validation
        verbose=2,   # per-fit progress output
        n_jobs=-1,   # use all available processors
    )
    search.fit(X_train, y_train)

    best_clf = search.best_estimator_
    best_params = search.best_params_
    print(f"Best Parameters: {best_params}")

    # Score the winning model on the hold-out rows
    y_pred = best_clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Optimized Random Forest Classifier Accuracy: {accuracy:.2f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    return best_clf, label_encoder, best_params

# Step 2: Train the Random Forest Classifier with Grid Search
# (Disabled alternative: reload the features from disk instead of using the
# in-memory frame.)
# df_subdivided = pd.read_csv('data_subdivisee.csv')

# Returns the best estimator, the LabelEncoder fitted on EventType and the best parameters
rf, label_encoder, best_params = train_random_forest_classifier_with_grid_search(df_subdivided)

In [None]:
# Load the cached, preprocessed evaluation tweets; the function defined below
# subdivides them and concatenates the chunk embeddings.
eval_df = pd.read_csv('eval_preprocess.csv')
def prepare_eval_data_with_concatenation(eval_df, num_subdivisions=20, embeddings_model=None):
    """Build one fixed-length feature vector per (MatchID, PeriodID) of the evaluation set.

    The tweets of a period are cut into ``num_subdivisions`` consecutive
    chunks of ``len(group) // num_subdivisions`` tweets (at least one tweet
    per chunk). Each chunk is reduced to the mean of its tweet embeddings and
    the chunk vectors are concatenated. Chunks left without tweets (periods
    with fewer tweets than ``num_subdivisions``) contribute a zero vector, so
    every period gets a row - and therefore a prediction - instead of being
    silently dropped.

    Args:
        eval_df: DataFrame with the columns ``MatchID``, ``PeriodID``, ``ID``
            and ``Tweet``. It is not modified.
        num_subdivisions: Number of chunks per period.
        embeddings_model: Word-embedding lookup passed to
            ``get_avg_embedding``. Required.

    Returns:
        DataFrame with one row per period and the columns ``ID``,
        ``ConcatenatedEmbeddings`` and ``PeriodID``.

    Raises:
        ValueError: If ``embeddings_model`` is None.
    """
    if embeddings_model is None:
        raise ValueError("embeddings_model must be provided")

    # Embedding width: read from the model when it exposes `vector_size`,
    # otherwise fall back to get_avg_embedding's default of 200.
    vector_size = getattr(embeddings_model, 'vector_size', 200)

    prepared_data = []

    for (match_id, period_id), group in eval_df.groupby(['MatchID', 'PeriodID']):
        # Work on a local list instead of rewriting eval_df['Tweet'] in the
        # caller's frame. Missing tweets become '' (not the literal word
        # 'nan' that astype(str) would produce).
        tweets = ['' if pd.isna(tweet) else str(tweet) for tweet in group['Tweet']]
        subdivision_size = max(1, len(tweets) // num_subdivisions)

        chunk_embeddings = []
        for i in range(num_subdivisions):
            chunk = tweets[i * subdivision_size:(i + 1) * subdivision_size]
            if chunk:
                vectors = [get_avg_embedding(tweet, embeddings_model, vector_size) for tweet in chunk]
                chunk_embeddings.append(np.mean(np.vstack(vectors), axis=0))
            else:
                # Not enough tweets to fill this chunk: pad with zeros
                chunk_embeddings.append(np.zeros(vector_size))

        prepared_data.append({
            'ID': group['ID'].iloc[0],  # Keep the first ID
            'ConcatenatedEmbeddings': np.concatenate(chunk_embeddings),
            'PeriodID': period_id
        })

    return pd.DataFrame(prepared_data)

eval_prepared = prepare_eval_data_with_concatenation(eval_df, num_subdivisions=20, embeddings_model=embeddings_model)

# Prepare features
# Stack embeddings and add PeriodID as an additional feature
embeddings = np.vstack(eval_prepared['ConcatenatedEmbeddings'].values)
period_ids = eval_prepared['PeriodID'].values.reshape(-1, 1)  # Reshape to match dimensions for concatenation

# Concatenate PeriodID as an additional feature
X_eval = np.hstack([embeddings, period_ids])

# The forest was trained on LabelEncoder-encoded targets, so its predictions
# are encoded class indices; map them back to the original EventType values
# (the MLP export in this notebook does the same).
# Assumes `label_encoder` is still the one returned by the random-forest training call.
predicted_labels_rf = label_encoder.inverse_transform(rf.predict(X_eval))

# Add the predictions to the DataFrame
eval_prepared['EventType'] = predicted_labels_rf

# Keep only the ID and EventType columns for the export
result_df = eval_prepared[['ID', 'EventType']]

# Export the result as CSV
result_df.to_csv('eval_predictions_rf.csv', index=False)

# The message now names the file that is actually written
print("Les prédictions ont été enregistrées dans 'eval_predictions_rf.csv'.")

# MLP

In [39]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import pandas as pd

def train_mlp_classifier_with_grid_search(df):
    """Grid-search an MLPClassifier on the concatenated period embeddings.

    The feature scaler is fitted on the training split only and then applied
    to the hold-out split, so the reported hold-out accuracy is not
    influenced by statistics of the rows it is measured on.

    Args:
        df: DataFrame with the columns ``ConcatenatedEmbeddings`` (one 1-D
            array per row), ``PeriodID`` and ``EventType``.

    Returns:
        Tuple ``(best_clf, label_encoder, scaler, best_params)``: the
        refitted best estimator, the ``LabelEncoder`` fitted on
        ``EventType``, the ``StandardScaler`` fitted on the training split
        (apply it to any data passed to ``best_clf``) and the winning
        hyper-parameter dict.
    """
    # Prepare features
    embeddings = np.vstack(df['ConcatenatedEmbeddings'].values)
    period_ids = df['PeriodID'].values.reshape(-1, 1)  # Reshape to match dimensions for concatenation

    # Concatenate PeriodID as an additional feature
    # (20 x 200 + 1 = 4001 columns with the notebook's settings)
    X = np.hstack([embeddings, period_ids])

    # Encode labels
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(df['EventType'])

    # Split BEFORE scaling so no hold-out statistics reach the scaler
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

    # Standardize the features (important for MLPs): fit on train, apply to both
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Define the MLP Classifier
    clf = MLPClassifier(random_state=42, max_iter=500)

    # Define the parameter grid for GridSearchCV
    param_grid = {
        'hidden_layer_sizes': [(2048, 512, 128, 64), (4000, 1000, 500, 200)],  # Different layer configurations
        'activation': ['relu'],                   # Activation functions
        'solver': ['adam'],                        # Solvers
        # 'learning_rate_init': [0.001, 0.01],              # Initial learning rates
        # 'alpha': [0.0001, 0.001],                         # L2 regularization parameter
    }

    # Set up the GridSearchCV
    grid_search = GridSearchCV(
        estimator=clf,
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,  # 5-fold cross-validation
        verbose=2,  # Increase output verbosity for debugging
        n_jobs=-1   # Use all available processors
    )

    # Perform the grid search
    grid_search.fit(X_train, y_train)

    # Get the best estimator
    best_clf = grid_search.best_estimator_
    print(f"Best Parameters: {grid_search.best_params_}")

    # Evaluate the classifier with the best parameters
    y_pred = best_clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Optimized MLP Classifier Accuracy: {accuracy:.2f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    return best_clf, label_encoder, scaler, grid_search.best_params_

# Example usage:
# (Disabled alternative: reload the features from disk instead of using the
# in-memory frame.)
# df_subdivided = pd.read_csv('data_subdivisee.csv')
# Note: this rebinds the notebook-level `label_encoder`, `scaler` and
# `best_params` that earlier cells also assign.
mlp, label_encoder, scaler, best_params = train_mlp_classifier_with_grid_search(df_subdivided)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best Parameters: {'activation': 'relu', 'hidden_layer_sizes': (4000, 1000, 500, 200), 'solver': 'adam'}
Optimized MLP Classifier Accuracy: 1.00
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1



In [41]:
# Prepare features for evaluation
embeddings = np.vstack(eval_prepared['ConcatenatedEmbeddings'].values)
period_ids = eval_prepared['PeriodID'].values.reshape(-1, 1)  # Reshape to match dimensions for concatenation

# Concatenate PeriodID as an additional feature
X_eval = np.hstack([embeddings, period_ids])

# The MLP was fitted on standardized features, so the evaluation matrix must
# go through the same fitted scaler; predicting on raw values feeds the
# network inputs on a scale it never saw.
# Assumes `scaler` is the one returned by train_mlp_classifier_with_grid_search.
X_eval = scaler.transform(X_eval)

# Predict labels using the trained MLP model
predicted_labels_encoded = mlp.predict(X_eval)

# Decode the predicted labels back to original class names
predicted_labels_mlp = label_encoder.inverse_transform(predicted_labels_encoded)

# Add the predictions to the DataFrame
eval_prepared['EventType'] = predicted_labels_mlp

# Keep only the ID and EventType columns for the export
result_df = eval_prepared[['ID', 'EventType']]

# Export the result as CSV
result_df.to_csv('eval_predictions_mlp.csv', index=False)

print("Les prédictions ont été enregistrées dans 'eval_predictions_mlp.csv'.")


Les prédictions ont été enregistrées dans 'eval_predictions_mlp.csv'.
