# IMPORT :

In [1]:
import sys
import os
import re
import gensim.downloader as api
import nltk
import numpy as np
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from symspellpy import SymSpell, Verbosity
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import StackingClassifier
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# UTILS FUNCTIONS :

In [None]:
# Function to compute the average word vector for a tweet
def get_avg_embedding(tweet, model, vector_size=200):
    """Return the mean embedding of the in-vocabulary words of ``tweet``.

    Args:
        tweet: Text to embed; it is tokenized on whitespace. Any non-string
            value (for example the NaN that pandas produces when an empty
            tweet is reloaded from a CSV file) is treated as an empty tweet.
        model: Embedding lookup supporting ``word in model`` and
            ``model[word]``.
        vector_size: Length of the zero vector returned when no word of the
            tweet is found in ``model``.

    Returns:
        The element-wise mean of the vectors of the words found in ``model``,
        or ``np.zeros(vector_size)`` when there are none.
    """
    # Guard against NaN/None: calling .split() on them would raise
    if not isinstance(tweet, str):
        return np.zeros(vector_size)

    word_vectors = [model[word] for word in tweet.split() if word in model]
    if not word_vectors:  # No known word: fall back to a zero vector
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)


# Cache for the NLTK resources used by preprocess_text, filled on first use
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the (stop word set, lemmatizer) pair, building it only once."""
    if not _PREPROCESS_RESOURCES:
        # Build both objects before storing either one, so that an exception
        # raised while building leaves the cache empty for the next call
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _PREPROCESS_RESOURCES['stop_words'] = stop_words
        _PREPROCESS_RESOURCES['lemmatizer'] = lemmatizer
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


# Basic preprocessing function
def preprocess_text(text):
    """Normalize a tweet into a space-separated string of lemmatized tokens.

    The steps are applied in this order: lowercase, strip every character
    that is neither a word character nor whitespace, strip digit runs, split
    on whitespace, drop English stop words, lemmatize each remaining token.

    Args:
        text: The raw tweet text.

    Returns:
        The remaining tokens joined by single spaces; an empty string when
        no token remains.
    """
    # The stop word set and the lemmatizer are shared across calls instead of
    # being rebuilt for every tweet
    stop_words, lemmatizer = _get_preprocess_resources()

    # Lowercasing
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenization, stop word removal and lemmatization
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return ' '.join(words)

# PREPROCESS PART 0 :

In [2]:
# Announce the step and flush so the message is shown immediately
print("PREPROCESS PART 0...", flush=True)

# Directory holding the cached intermediate results of the later steps
os.makedirs("tmp/", exist_ok=True)

# Download the NLTK corpora used during text preprocessing
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# 200-dimensional GloVe Twitter embeddings, loaded through gensim's downloader API
embeddings_model = api.load("glove-twitter-200")

print("PREPROCESS PART 0 : OK", flush=True)

PREPROCESS PART 0...


[nltk_data] Downloading package stopwords to /home/hbar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/hbar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


PREPROCESS PART 0 : OK


# PREPROCESS PART 1 :

In [None]:
print("PREPROCESS PART 1...")
sys.stdout.flush()


if not os.path.isfile("tmp/processing1.csv"):
    # Read all training files and concatenate them into one dataframe.
    # The listing is sorted so that the row order does not depend on the
    # order in which the filesystem happens to return the files.
    li = [
        pd.read_csv(os.path.join("train_tweets", filename))
        for filename in sorted(os.listdir("train_tweets"))
    ]
    df = pd.concat(li, ignore_index=True)

    # Apply preprocessing to each tweet
    df['Tweet'] = df['Tweet'].apply(preprocess_text)

    # Cache the preprocessed tweets so that later runs can skip this step
    df.to_csv("tmp/processing1.csv", index=False, encoding="utf-8")
else:
    df = pd.read_csv("tmp/processing1.csv")
    # Tweets that preprocessing reduced to '' are written as empty fields and
    # are read back as NaN; restore them to '' so that both branches yield
    # string tweets for the embedding step
    df['Tweet'] = df['Tweet'].fillna('')


print("PREPROCESS PART 1 : OK")
sys.stdout.flush()

# PREPROCESS PART 2 :

In [None]:
print("PREPROCESS PART 2...")
sys.stdout.flush()


# Embedding width. It is set outside the branch below so that it is defined
# even when X and y are loaded from the cache (the Kaggle cell reads it).
vector_size = 200  # Adjust based on the chosen GloVe model

# Rebuild the features unless BOTH cached arrays are present: with only one
# of the two files on disk, loading the other one would fail
if not (os.path.isfile("tmp/X.npy") and os.path.isfile("tmp/y.npy")):
    # Compute one averaged embedding vector per tweet
    tweet_vectors = np.vstack([get_avg_embedding(tweet, embeddings_model, vector_size) for tweet in df['Tweet']])
    tweet_df = pd.DataFrame(tweet_vectors)

    # Attach the vectors into the original dataframe
    period_features = pd.concat([df, tweet_df], axis=1)
    # Drop the columns that are not useful anymore
    period_features = period_features.drop(columns=['Timestamp', 'Tweet'])
    # Group the tweets into their corresponding periods. This way we generate an average embedding vector for each period
    period_features = period_features.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()

    # We drop the non-numerical features and keep the embeddings values for each period
    X = period_features.drop(columns=['EventType', 'MatchID', 'PeriodID', 'ID']).values
    # We extract the labels of our training samples
    y = period_features['EventType'].values

    # Cache the feature matrix and the labels for later runs
    np.save("tmp/X.npy", X)
    np.save("tmp/y.npy", y)
else:
    X = np.load("tmp/X.npy")
    y = np.load("tmp/y.npy")


print("PREPROCESS PART 2 : OK")
sys.stdout.flush()


# Evaluating on a test set :

In [None]:
# Hold out 30% of the samples as a local test set, so that classifiers can be
# compared without tuning against the validation set and without spending
# too many Kaggle submissions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Fit two random forest configurations in turn (100 trees, then 170 trees
# with max_depth=10) and print the accuracy of each on the held-out test set
for clf in (
    RandomForestClassifier(random_state=42, n_estimators=100),
    RandomForestClassifier(random_state=42, n_estimators=170, max_depth=10, max_features='sqrt'),
):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("Test set: ", accuracy_score(y_test, y_pred))

In [None]:
# Fit two XGBoost configurations in turn (100 estimators, then the 170
# estimator variant with explicit hyper-parameters) and print the accuracy
# of each on the held-out test set
for clf in (
    XGBClassifier(random_state=42, n_estimators=100, eval_metric="logloss"),
    XGBClassifier(random_state=42, n_estimators=170, learning_rate=0.1, max_depth=6, subsample=1, eval_metric="logloss", booster="gbtree"),
):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("Test set: ", accuracy_score(y_test, y_pred))

# Combine Evaluation on a test set :

In [None]:
# Meta-model combining the predictions of the base models
meta_model = LogisticRegression()

# Base models: random forest and XGBoost, 100 estimators each
rf = RandomForestClassifier(n_estimators=100, random_state=42)
xgb = XGBClassifier(n_estimators=100, eval_metric="logloss", random_state=42)
base_models = [('rf', rf), ('xgb', xgb)]

# Stacking ensemble built with cv=3, fitted on the training split
stack = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=3,
)
stack.fit(X_train, y_train)

# Evaluation on the held-out test split
stack_pred = stack.predict(X_test)
print("Accuracy (Stacking):", accuracy_score(y_test, stack_pred))

In [None]:
# Meta-model combining the predictions of the base models
meta_model = LogisticRegression()

# Base models: random forest, XGBoost, logistic regression and RBF-kernel SVC
rf = RandomForestClassifier(n_estimators=100, random_state=42)
xgb = XGBClassifier(n_estimators=100, eval_metric="logloss", random_state=42)
lr = LogisticRegression(max_iter=1000, random_state=42)
svc = SVC(kernel="rbf", random_state=42)
base_models = [('rf', rf), ('xgb', xgb), ('lr', lr), ('svc', svc)]

# Stacking ensemble built with cv=3, fitted on the training split
stack = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=3,
)
stack.fit(X_train, y_train)

# Evaluation on the held-out test split
stack_pred = stack.predict(X_test)
print("Accuracy (Stacking):", accuracy_score(y_test, stack_pred))

In [None]:
# Meta-model combining the predictions of the base models
meta_model = LogisticRegression()

# Base models: the 170-estimator random forest and XGBoost configurations
rf = RandomForestClassifier(n_estimators=170, max_depth=10, max_features='sqrt', random_state=42)
xgb = XGBClassifier(
    n_estimators=170,
    learning_rate=0.1,
    max_depth=6,
    subsample=1,
    eval_metric="logloss",
    booster="gbtree",
    random_state=42,
)
base_models = [('rf', rf), ('xgb', xgb)]

# Stacking ensemble built with cv=3, fitted on the training split
stack = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=3,
)
stack.fit(X_train, y_train)

# Evaluation on the held-out test split
stack_pred = stack.predict(X_test)
print("Accuracy (Stacking):", accuracy_score(y_test, stack_pred))

In [None]:
# Meta-model combining the predictions of the base models
meta_model = LogisticRegression()

# Base models: the 170-estimator random forest and XGBoost configurations,
# plus logistic regression and an RBF-kernel SVC
rf = RandomForestClassifier(n_estimators=170, max_depth=10, max_features='sqrt', random_state=42)
xgb = XGBClassifier(
    n_estimators=170,
    learning_rate=0.1,
    max_depth=6,
    subsample=1,
    eval_metric="logloss",
    booster="gbtree",
    random_state=42,
)
lr = LogisticRegression(max_iter=1000, random_state=42)
svc = SVC(kernel="rbf", random_state=42)
base_models = [('rf', rf), ('xgb', xgb), ('lr', lr), ('svc', svc)]

# Stacking ensemble built with cv=3, fitted on the training split
stack = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=3,
)
stack.fit(X_train, y_train)

# Evaluation on the held-out test split
stack_pred = stack.predict(X_test)
print("Accuracy (Stacking):", accuracy_score(y_test, stack_pred))

## Find the best cv value :

In [None]:
# Meta-model combining the predictions of the base models
meta_model = LogisticRegression()

# Base models: random forest and XGBoost, 100 estimators each
rf = RandomForestClassifier(n_estimators=100, random_state=42)
xgb = XGBClassifier(n_estimators=100, eval_metric="logloss", random_state=42)
base_models = [('rf', rf), ('xgb', xgb)]

# Best accuracy seen so far and the cv value that produced it
# NOTE(review): cv is selected using accuracy on the test split, so the
# reported best score is likely optimistic - confirm this is acceptable
max_score, max_cv = 0, 0

# Try every cv value from 2 to 20
for i in range(2, 21):
    # Stacking ensemble for this cv value, fitted on the training split
    stack = StackingClassifier(
        estimators=base_models,
        final_estimator=meta_model,
        cv=i,
    )
    stack.fit(X_train, y_train)

    # Evaluation on the held-out test split
    stack_pred = stack.predict(X_test)
    accuracy = accuracy_score(y_test, stack_pred)

    print(accuracy)

    # Update only on a strict improvement, so the first best cv value is kept
    if accuracy > max_score:
        max_score, max_cv = accuracy, i

# Print the best result
print("Accuracy (Stacking):", max_score, "with cv =", max_cv)

In [None]:
# Meta-model combining the predictions of the base models
meta_model = LogisticRegression()

# Base models: random forest, XGBoost, logistic regression and RBF-kernel SVC
rf = RandomForestClassifier(n_estimators=100, random_state=42)
xgb = XGBClassifier(n_estimators=100, eval_metric="logloss", random_state=42)
lr = LogisticRegression(max_iter=1000, random_state=42)
svc = SVC(kernel="rbf", random_state=42)
base_models = [('rf', rf), ('xgb', xgb), ('lr', lr), ('svc', svc)]

# Best accuracy seen so far and the cv value that produced it
# NOTE(review): cv is selected using accuracy on the test split, so the
# reported best score is likely optimistic - confirm this is acceptable
max_score, max_cv = 0, 0

# Try every cv value from 2 to 20
for i in range(2, 21):
    # Stacking ensemble for this cv value, fitted on the training split
    stack = StackingClassifier(
        estimators=base_models,
        final_estimator=meta_model,
        cv=i,
    )
    stack.fit(X_train, y_train)

    # Evaluation on the held-out test split
    stack_pred = stack.predict(X_test)
    accuracy = accuracy_score(y_test, stack_pred)

    print(accuracy)

    # Update only on a strict improvement, so the first best cv value is kept
    if accuracy > max_score:
        max_score, max_cv = accuracy, i

# Print the best result
print("Accuracy (Stacking):", max_score, "with cv =", max_cv)

In [None]:
# Meta-model combining the predictions of the base models
meta_model = LogisticRegression()

# Base models: the 170-estimator random forest and XGBoost configurations
rf = RandomForestClassifier(n_estimators=170, max_depth=10, max_features='sqrt', random_state=42)
xgb = XGBClassifier(
    n_estimators=170,
    learning_rate=0.1,
    max_depth=6,
    subsample=1,
    eval_metric="logloss",
    booster="gbtree",
    random_state=42,
)
base_models = [('rf', rf), ('xgb', xgb)]

# Best accuracy seen so far and the cv value that produced it
# NOTE(review): cv is selected using accuracy on the test split, so the
# reported best score is likely optimistic - confirm this is acceptable
max_score, max_cv = 0, 0

# Try every cv value from 2 to 20
for i in range(2, 21):
    # Stacking ensemble for this cv value, fitted on the training split
    stack = StackingClassifier(
        estimators=base_models,
        final_estimator=meta_model,
        cv=i,
    )
    stack.fit(X_train, y_train)

    # Evaluation on the held-out test split
    stack_pred = stack.predict(X_test)
    accuracy = accuracy_score(y_test, stack_pred)

    print(accuracy)

    # Update only on a strict improvement, so the first best cv value is kept
    if accuracy > max_score:
        max_score, max_cv = accuracy, i

# Print the best result
print("Accuracy (Stacking):", max_score, "with cv =", max_cv)

In [None]:
# Meta-model combining the predictions of the base models
meta_model = LogisticRegression()

# Base models: the 170-estimator random forest and XGBoost configurations,
# plus logistic regression and an RBF-kernel SVC
rf = RandomForestClassifier(n_estimators=170, max_depth=10, max_features='sqrt', random_state=42)
xgb = XGBClassifier(
    n_estimators=170,
    learning_rate=0.1,
    max_depth=6,
    subsample=1,
    eval_metric="logloss",
    booster="gbtree",
    random_state=42,
)
lr = LogisticRegression(max_iter=1000, random_state=42)
svc = SVC(kernel="rbf", random_state=42)
base_models = [('rf', rf), ('xgb', xgb), ('lr', lr), ('svc', svc)]

# Best accuracy seen so far and the cv value that produced it
# NOTE(review): cv is selected using accuracy on the test split, so the
# reported best score is likely optimistic - confirm this is acceptable
max_score, max_cv = 0, 0

# Try every cv value from 2 to 20
for i in range(2, 21):
    # Stacking ensemble for this cv value, fitted on the training split
    stack = StackingClassifier(
        estimators=base_models,
        final_estimator=meta_model,
        cv=i,
    )
    stack.fit(X_train, y_train)

    # Evaluation on the held-out test split
    stack_pred = stack.predict(X_test)
    accuracy = accuracy_score(y_test, stack_pred)

    print(accuracy)

    # Update only on a strict improvement, so the first best cv value is kept
    if accuracy > max_score:
        max_score, max_cv = accuracy, i

# Print the best result
print("Accuracy (Stacking):", max_score, "with cv =", max_cv)

# For Kaggle submission :

In [None]:
print("KAGGLE...")
sys.stdout.flush()


# This time we train our classifier on the full dataset that is available to us.

clf = XGBClassifier(random_state=42, n_estimators=170, learning_rate=0.1, max_depth=6, subsample=1, eval_metric="logloss", booster="gbtree")
clf.fit(X, y)
predictions = []

# Read the embedding width from the model itself, so that this cell does not
# depend on a variable assigned in another cell
vector_size = embeddings_model.vector_size


# We read each file separately, we preprocess the tweets and then use the classifier to predict the labels.
# Finally, we collect all predictions into a list that is concatenated and exported
# to be submitted on Kaggle.
# The listing is sorted so that the row order of the submission does not
# depend on the order in which the filesystem returns the files.
for fname in sorted(os.listdir("eval_tweets")):
    val_df = pd.read_csv(os.path.join("eval_tweets", fname))
    val_df['Tweet'] = val_df['Tweet'].apply(preprocess_text)

    # One averaged embedding vector per tweet
    tweet_vectors = np.vstack([get_avg_embedding(tweet, embeddings_model, vector_size) for tweet in val_df['Tweet']])
    tweet_df = pd.DataFrame(tweet_vectors)

    # Same feature construction as for training: drop the unused columns and
    # average the tweet vectors of each period
    period_features = pd.concat([val_df, tweet_df], axis=1)
    period_features = period_features.drop(columns=['Timestamp', 'Tweet'])
    period_features = period_features.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()

    X_pred = period_features.drop(columns=['MatchID', 'PeriodID', 'ID']).values
    preds = clf.predict(X_pred)

    # Keep only the period identifier and its predicted label
    period_features['EventType'] = preds
    predictions.append(period_features[['ID', 'EventType']])

pred_df = pd.concat(predictions)
pred_df.to_csv('predictions.csv', index=False)


print("KAGGLE : OK")
sys.stdout.flush()