# IMPORT :

In [None]:
import sys
import os
import re
import gensim.downloader as api
import nltk
import numpy as np
import pandas as pd
import h5py
import torch
import time
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from symspellpy import SymSpell, Verbosity
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

# UTILS FUNCTIONS :

In [None]:
def get_embeddings_from_dataframe(df, column_name='Tweet', model_name='bert-base-uncased', batch_size=1024):
    """
    Compute one transformer embedding per row of a DataFrame text column.

    Each text is tokenized (truncated/padded to at most 128 tokens) and the
    hidden state of the first token of the last layer (the [CLS] position for
    BERT-style models) is used as the embedding.

    Args:
        df (pd.DataFrame): DataFrame holding the tweets.
        column_name (str): Name of the column that contains the tweet text.
        model_name (str): Hugging Face model identifier to load.
        batch_size (int): Number of tweets sent through the model at once.

    Returns:
        np.ndarray: 2D array with one row per input row, in input order.
        An empty input yields an array with zero rows.
    """
    # Load the tokenizer and the model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.eval()  # Inference mode (disables dropout)

    # Use the GPU when one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # The tokenizer only accepts strings. Tweets that preprocessing reduced to
    # '' are read back from CSV as NaN, so normalise missing values first.
    tweets = df[column_name].fillna('').astype(str).tolist()

    # np.vstack raises on an empty list, so handle the empty input explicitly.
    if not tweets:
        hidden_size = getattr(model.config, "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    embeddings = []

    # Process the tweets batch by batch
    for start in tqdm(range(0, len(tweets), batch_size), desc="Processing tweets"):
        batch_tweets = tweets[start:start + batch_size]

        # Tokenize and move the tensors to the model's device
        inputs = tokenizer(batch_tweets, return_tensors="pt", truncation=True, padding=True, max_length=128)
        inputs = {key: value.to(device) for key, value in inputs.items()}

        # Forward pass without building the autograd graph
        with torch.no_grad():
            outputs = model(**inputs)
            # Keep only the first-token vector of every sequence
            cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()

        embeddings.append(cls_embeddings)

    # Stack the per-batch arrays into a single (n_tweets, hidden) array
    return np.vstack(embeddings)


In [None]:
# Cache of (tokenizer, model) pairs keyed by model name, so that repeated
# calls do not reload the weights for every single tweet.
_TRANSFORMER_CACHE = {}


def _load_transformer(model_name):
    """Return the cached (tokenizer, model) pair for ``model_name``, loading it on first use."""
    if model_name not in _TRANSFORMER_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name)
        model.eval()  # Inference mode (disables dropout)
        _TRANSFORMER_CACHE[model_name] = (tokenizer, model)
    return _TRANSFORMER_CACHE[model_name]


def get_transformer_embedding(tweet, model_name='bert-base-uncased'):
    """
    Compute the transformer embedding of a single tweet on the CPU.

    Args:
        tweet (str): Text to embed (truncated to at most 128 tokens).
        model_name (str): Hugging Face model identifier to load.

    Returns:
        np.ndarray: Hidden state of the first token of the last layer, with
        size-1 dimensions squeezed out.
    """
    tokenizer, model = _load_transformer(model_name)

    # Tokenize and run the model; no gradients are needed for inference
    inputs = tokenizer(tweet, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)

    # Use the first-token vector as the representation of the whole tweet
    cls_embedding = outputs.last_hidden_state[:, 0, :].numpy()
    return cls_embedding.squeeze()

# Function to compute the average word vector for a tweet
# def get_avg_embedding(tweet, model, vector_size=200):
#     words = tweet.split()  # Tokenize by whitespace
#     word_vectors = [model[word] for word in words if word in model]
#     if not word_vectors:  # If no words in the tweet are in the vocabulary, return a zero vector
#         return np.zeros(vector_size)
#     return np.mean(word_vectors, axis=0)


# Spell-checker shared by every call to correct_text; its dictionary is read
# from "en-80k.txt" (term in column 0, frequency count in column 1).
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
sym_spell.load_dictionary("en-80k.txt", term_index=0, count_index=1)


def correct_text(text):
    """
    Spell-correct a single word.

    Looks ``text`` up with ``Verbosity.CLOSEST`` and a maximum edit distance
    of 2, and returns the term of the first suggestion. When the lookup yields
    no suggestion, ``text`` is returned unchanged.
    """
    suggestions = sym_spell.lookup(text, Verbosity.CLOSEST, max_edit_distance=2)
    return suggestions[0].term if suggestions else text


# Resources shared by every call to preprocess_text. They are built lazily on
# first use so that the NLTK corpora only need to be available once
# preprocessing actually starts, and so that the stop-word list is not re-read
# for every single tweet.
_STOP_WORDS = None
_LEMMATIZER = None


# Basic preprocessing function
def preprocess_text(text):
    """
    Normalise a raw tweet into a space-separated string of cleaned tokens.

    Steps: lowercase, strip punctuation, strip digits, split on whitespace,
    drop English stop words, spell-correct each remaining word with
    ``correct_text`` and lemmatize it.

    Args:
        text: Raw tweet. Non-string values (e.g. NaN or None coming from a
            CSV with missing tweets) are treated as an empty tweet.

    Returns:
        str: The cleaned tweet; '' when nothing is left after cleaning.
    """
    global _STOP_WORDS, _LEMMATIZER

    # Missing values would otherwise crash on .lower()
    if not isinstance(text, str):
        return ''

    # Lowercasing
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenization
    words = text.split()
    if not words:
        return ''

    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()

    # Remove stopwords, then spell-correct what is left
    words = [correct_text(word) for word in words if word not in _STOP_WORDS]
    # Lemmatization
    words = [_LEMMATIZER.lemmatize(word) for word in words]
    return ' '.join(words)


# Sentiment analyzer instance reused by every call to get_sentiment_rate.
analyzer = SentimentIntensityAnalyzer()


def get_sentiment_rate(text):
    """
    Return the sentiment intensity of ``text``.

    The value is the absolute value of the 'compound' entry of the analyzer's
    polarity scores, so the sign of the sentiment is discarded.
    """
    compound = analyzer.polarity_scores(text)['compound']
    return np.abs(compound)


# Football-related terms searched for in each tweet.
football_words = ["full time", "goal", "half time", "kick off", "owngoal", "penalty", "match", "red card", "yellow card"]


def count_football_words(text):
    """
    Count how many entries of ``football_words`` occur in ``text``.

    Matching is by plain substring, and every entry counts at most once, so
    this is the number of distinct terms present rather than the number of
    occurrences. For example "owngoal" matches both "goal" and "owngoal".

    Args:
        text: Tweet text. Non-string values (e.g. NaN for a missing tweet)
            count as containing no football words.

    Returns:
        int: Number of entries of ``football_words`` found in ``text``.
    """
    # `word in text` raises TypeError on NaN/None, which missing tweets produce
    if not isinstance(text, str):
        return 0
    return sum(word in text for word in football_words)

# PREPROCESS PART 0 :

In [None]:
# Environment setup: create the cache directory and fetch the NLTK corpora
# used by preprocess_text.
print("PREPROCESS PART 0...", flush=True)

# Directory for the intermediate files written by the later steps
os.makedirs("tmp/", exist_ok=True)

# Download some NLP models for processing, optional
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Load GloVe model with Gensim's API
# embeddings_model = api.load("glove-twitter-200")  # 200-dimensional GloVe embeddings

print("PREPROCESS PART 0 : OK", flush=True)

# PREPROCESS PART 1 :

In [None]:
# Step 1: load every training file, clean the tweets and cache the result in
# tmp/processing1.csv.
print("PREPROCESS PART 1...")
sys.stdout.flush()


# Set to False to reuse tmp/processing1.csv when it already exists.
go = True


if go or not os.path.isfile("tmp/processing1.csv"):
    # Read all training files and concatenate them into one dataframe.
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order reproducible, which matters because later steps cache embeddings
    # by row position. Non-CSV entries (e.g. notebook checkpoint folders) are
    # skipped.
    frames = [
        pd.read_csv("train_tweets/" + filename)
        for filename in sorted(os.listdir("train_tweets"))
        if filename.endswith(".csv")
    ]
    df = pd.concat(frames, ignore_index=True)

    # Apply preprocessing to each tweet
    df['Tweet'] = df['Tweet'].apply(preprocess_text)

    df.to_csv("tmp/processing1.csv", index=False, encoding="utf-8")
else:
    df = pd.read_csv("tmp/processing1.csv")
    # Tweets that preprocessing reduced to '' are read back as NaN; restore
    # them to empty strings so the text-based steps keep receiving strings.
    df['Tweet'] = df['Tweet'].fillna('')


print("PREPROCESS PART 1 : OK")
sys.stdout.flush()

# PREPROCESS PART 2 :

In [None]:
# Step 2: embed every tweet, add hand-crafted features, average everything per
# period and cache the resulting training matrix X and labels y.
print("PREPROCESS PART 2...")
sys.stdout.flush()


# Set to False to reuse the cached files in tmp/ when they already exist.
go = True


if go or not os.path.isfile("tmp/X.npy") or not os.path.isfile("tmp/y.npy"):

    # The embeddings are computed in two halves (split by row position) so
    # that each half can be cached separately.
    limit = int(len(df) / 2)

    if go or not os.path.isfile("tmp/df1.npy"):
        # Compute the first half of the embeddings
        tweet_vectors1 = get_embeddings_from_dataframe(df[:limit], column_name='Tweet', batch_size=4096)
        np.save("tmp/df1.npy", tweet_vectors1)
    else:
        tweet_vectors1 = np.load("tmp/df1.npy")

    print("df1 : OK")
    sys.stdout.flush()

    # The cache check must look at df2.npy here (it used to test df1.npy)
    if go or not os.path.isfile("tmp/df2.npy"):
        # Compute the second half of the embeddings
        tweet_vectors2 = get_embeddings_from_dataframe(df[limit:], column_name='Tweet', batch_size=4096)
        np.save("tmp/df2.npy", tweet_vectors2)
    else:
        tweet_vectors2 = np.load("tmp/df2.npy")

    print("df2 : OK")
    sys.stdout.flush()

    # The two halves hold different rows, so they are stacked vertically.
    embeddings_df = pd.DataFrame(np.vstack([tweet_vectors1, tweet_vectors2]))
    if len(embeddings_df) != len(df):
        raise ValueError(
            "Cached embeddings do not match the tweets dataframe: "
            f"{len(embeddings_df)} embedding rows for {len(df)} tweets"
        )

    # Attach the embeddings to the tweet metadata (MatchID, PeriodID, ID,
    # EventType, Timestamp, Tweet), which the steps below rely on.
    period_features = pd.concat([df.reset_index(drop=True), embeddings_df], axis=1)

    print("period_features : OK")
    sys.stdout.flush()


    # Number of tweets in the tweet's period, scaled by the largest period
    period_features['TweetCount'] = period_features.groupby(['MatchID', 'PeriodID', 'ID'])['Tweet'].transform('size').fillna(0)
    period_features['TweetCount'] = period_features['TweetCount'] / period_features['TweetCount'].max()

    # Number of football-related terms found in each tweet, scaled by the maximum
    period_features['FootballWordCount'] = period_features['Tweet'].apply(count_football_words).fillna(0)
    period_features['FootballWordCount'] = period_features['FootballWordCount'] / period_features['FootballWordCount'].max()

    # Sentiment intensity of each tweet
    period_features['Sentiment'] = period_features['Tweet'].apply(get_sentiment_rate).fillna(0)

    print("add colonnes : OK")
    sys.stdout.flush()


    # Drop the columns that are not useful anymore
    period_features = period_features.drop(columns=['Timestamp', 'Tweet'])

    # Group the tweets into their corresponding periods. This way we generate an average embedding vector for each period
    period_features = period_features.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()

    # We drop the non-feature columns; what remains is the embedding columns
    # followed by TweetCount, FootballWordCount and Sentiment.
    X = period_features.drop(columns=['EventType', 'MatchID', 'PeriodID', 'ID']).values
    # We extract the labels of our training samples
    y = period_features['EventType'].values

    np.save("tmp/X.npy", X)
    np.save("tmp/y.npy", y)

    print("X-y : OK")
    sys.stdout.flush()
else:
    X = np.load("tmp/X.npy")
    y = np.load("tmp/y.npy")



print("PREPROCESS PART 2 : OK")
sys.stdout.flush()

# For Kaggle submission :

In [None]:
# Kaggle submission: random forest on embeddings + hand-crafted features.
print("KAGGLE...")
sys.stdout.flush()


# This time we train our classifier on the full dataset that is available to us.

clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X, y)
predictions = []


# We read each file separately, we preprocess the tweets and then use the classifier to predict the labels.
# Finally, we collect all predictions into a list that is concatenated and exported
# to be submitted on Kaggle.
for fname in sorted(os.listdir("eval_tweets")):
    # Skip anything that is not a data file (e.g. notebook checkpoint folders)
    if not fname.endswith(".csv"):
        continue

    val_df = pd.read_csv("eval_tweets/" + fname)
    if val_df.empty:
        continue
    val_df['Tweet'] = val_df['Tweet'].apply(preprocess_text)

    # One embedding row per tweet, joined column-wise to the tweet metadata
    # (MatchID, PeriodID, ID, Timestamp, Tweet) that the steps below need.
    tweet_vectors = get_embeddings_from_dataframe(val_df, column_name='Tweet', batch_size=4096)
    period_features = pd.concat([val_df.reset_index(drop=True), pd.DataFrame(tweet_vectors)], axis=1)

    # Same hand-crafted features as the training matrix.
    # NOTE(review): the maxima used for scaling are per evaluation file here,
    # while training scales by the maxima over all training tweets - confirm
    # this is intended.
    period_features['TweetCount'] = period_features.groupby(['MatchID', 'PeriodID', 'ID'])['Tweet'].transform('size').fillna(0)
    period_features['TweetCount'] = period_features['TweetCount'] / period_features['TweetCount'].max()

    period_features['FootballWordCount'] = period_features['Tweet'].apply(count_football_words).fillna(0)
    period_features['FootballWordCount'] = period_features['FootballWordCount'] / period_features['FootballWordCount'].max()

    period_features['Sentiment'] = period_features['Tweet'].apply(get_sentiment_rate).fillna(0)

    # Average every feature per period
    period_features = period_features.drop(columns=['Timestamp', 'Tweet'])
    period_features = period_features.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()
    X_pred = period_features.drop(columns=['MatchID', 'PeriodID', 'ID']).values

    preds = clf.predict(X_pred)
    period_features['EventType'] = preds
    predictions.append(period_features[['ID', 'EventType']])

pred_df = pd.concat(predictions)
pred_df.to_csv('predictions_rf_all.csv', index=False)


print("KAGGLE : OK")
sys.stdout.flush()

In [None]:
# Kaggle submission: random forest on the averaged embeddings only.
print("KAGGLE...")
sys.stdout.flush()


# This time we train our classifier on the full dataset that is available to us.

# X ends with the three hand-crafted columns (TweetCount, FootballWordCount,
# Sentiment). This variant predicts from embeddings only, so the classifier
# must be trained without them or predict() fails on the feature count.
# NOTE(review): assumes X was produced by PREPROCESS PART 2 with those three
# columns appended last - confirm before changing the feature set.
n_extra_features = 3
X_simple = X[:, :-n_extra_features]

clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X_simple, y)
predictions = []


# We read each file separately, we preprocess the tweets and then use the classifier to predict the labels.
# Finally, we collect all predictions into a list that is concatenated and exported
# to be submitted on Kaggle.
for fname in sorted(os.listdir("eval_tweets")):
    # Skip anything that is not a data file (e.g. notebook checkpoint folders)
    if not fname.endswith(".csv"):
        continue

    val_df = pd.read_csv("eval_tweets/" + fname)
    if val_df.empty:
        continue
    val_df['Tweet'] = val_df['Tweet'].apply(preprocess_text)

    # One embedding row per tweet, joined column-wise to the tweet metadata
    # (MatchID, PeriodID, ID, Timestamp, Tweet) that the steps below need.
    tweet_vectors = get_embeddings_from_dataframe(val_df, column_name='Tweet', batch_size=4096)
    period_features = pd.concat([val_df.reset_index(drop=True), pd.DataFrame(tweet_vectors)], axis=1)

    # Average the embeddings per period
    period_features = period_features.drop(columns=['Timestamp', 'Tweet'])
    period_features = period_features.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()
    X_pred = period_features.drop(columns=['MatchID', 'PeriodID', 'ID']).values

    # Fail with an explicit message if the training/evaluation layouts diverge
    if X_pred.shape[1] != X_simple.shape[1]:
        raise ValueError(
            f"Feature mismatch for {fname}: classifier trained on "
            f"{X_simple.shape[1]} columns, got {X_pred.shape[1]}"
        )

    preds = clf.predict(X_pred)
    period_features['EventType'] = preds
    predictions.append(period_features[['ID', 'EventType']])

pred_df = pd.concat(predictions)
pred_df.to_csv('predictions_rf_simple.csv', index=False)


print("KAGGLE : OK")
sys.stdout.flush()

In [None]:
# Kaggle submission: XGBoost on embeddings + hand-crafted features.
print("KAGGLE...")
sys.stdout.flush()


# This time we train our classifier on the full dataset that is available to us.

clf = XGBClassifier(random_state=42, n_estimators=100)
clf.fit(X, y)
predictions = []


# We read each file separately, we preprocess the tweets and then use the classifier to predict the labels.
# Finally, we collect all predictions into a list that is concatenated and exported
# to be submitted on Kaggle.
for fname in sorted(os.listdir("eval_tweets")):
    # Skip anything that is not a data file (e.g. notebook checkpoint folders)
    if not fname.endswith(".csv"):
        continue

    val_df = pd.read_csv("eval_tweets/" + fname)
    if val_df.empty:
        continue
    val_df['Tweet'] = val_df['Tweet'].apply(preprocess_text)

    # One embedding row per tweet, joined column-wise to the tweet metadata
    # (MatchID, PeriodID, ID, Timestamp, Tweet) that the steps below need.
    tweet_vectors = get_embeddings_from_dataframe(val_df, column_name='Tweet', batch_size=4096)
    period_features = pd.concat([val_df.reset_index(drop=True), pd.DataFrame(tweet_vectors)], axis=1)

    # Same hand-crafted features as the training matrix.
    # NOTE(review): the maxima used for scaling are per evaluation file here,
    # while training scales by the maxima over all training tweets - confirm
    # this is intended.
    period_features['TweetCount'] = period_features.groupby(['MatchID', 'PeriodID', 'ID'])['Tweet'].transform('size').fillna(0)
    period_features['TweetCount'] = period_features['TweetCount'] / period_features['TweetCount'].max()

    period_features['FootballWordCount'] = period_features['Tweet'].apply(count_football_words).fillna(0)
    period_features['FootballWordCount'] = period_features['FootballWordCount'] / period_features['FootballWordCount'].max()

    period_features['Sentiment'] = period_features['Tweet'].apply(get_sentiment_rate).fillna(0)

    # Average every feature per period
    period_features = period_features.drop(columns=['Timestamp', 'Tweet'])
    period_features = period_features.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()
    X_pred = period_features.drop(columns=['MatchID', 'PeriodID', 'ID']).values

    preds = clf.predict(X_pred)
    period_features['EventType'] = preds
    predictions.append(period_features[['ID', 'EventType']])

pred_df = pd.concat(predictions)
# Separate file name so the random-forest submission is not overwritten
pred_df.to_csv('predictions_xgb_all.csv', index=False)


print("KAGGLE : OK")
sys.stdout.flush()

In [None]:
# Kaggle submission: XGBoost on the averaged embeddings only.
print("KAGGLE...")
sys.stdout.flush()


# This time we train our classifier on the full dataset that is available to us.

# X ends with the three hand-crafted columns (TweetCount, FootballWordCount,
# Sentiment). This variant predicts from embeddings only, so the classifier
# must be trained without them or predict() fails on the feature count.
# NOTE(review): assumes X was produced by PREPROCESS PART 2 with those three
# columns appended last - confirm before changing the feature set.
n_extra_features = 3
X_simple = X[:, :-n_extra_features]

clf = XGBClassifier(random_state=42, n_estimators=100)
clf.fit(X_simple, y)
predictions = []


# We read each file separately, we preprocess the tweets and then use the classifier to predict the labels.
# Finally, we collect all predictions into a list that is concatenated and exported
# to be submitted on Kaggle.
for fname in sorted(os.listdir("eval_tweets")):
    # Skip anything that is not a data file (e.g. notebook checkpoint folders)
    if not fname.endswith(".csv"):
        continue

    val_df = pd.read_csv("eval_tweets/" + fname)
    if val_df.empty:
        continue
    val_df['Tweet'] = val_df['Tweet'].apply(preprocess_text)

    # One embedding row per tweet, joined column-wise to the tweet metadata
    # (MatchID, PeriodID, ID, Timestamp, Tweet) that the steps below need.
    tweet_vectors = get_embeddings_from_dataframe(val_df, column_name='Tweet', batch_size=4096)
    period_features = pd.concat([val_df.reset_index(drop=True), pd.DataFrame(tweet_vectors)], axis=1)

    # Average the embeddings per period
    period_features = period_features.drop(columns=['Timestamp', 'Tweet'])
    period_features = period_features.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()
    X_pred = period_features.drop(columns=['MatchID', 'PeriodID', 'ID']).values

    # Fail with an explicit message if the training/evaluation layouts diverge
    if X_pred.shape[1] != X_simple.shape[1]:
        raise ValueError(
            f"Feature mismatch for {fname}: classifier trained on "
            f"{X_simple.shape[1]} columns, got {X_pred.shape[1]}"
        )

    preds = clf.predict(X_pred)
    period_features['EventType'] = preds
    predictions.append(period_features[['ID', 'EventType']])

pred_df = pd.concat(predictions)
# Separate file name so the other submissions are not overwritten
pred_df.to_csv('predictions_xgb_simple.csv', index=False)


print("KAGGLE : OK")
sys.stdout.flush()