# IMPORT :

In [82]:
import sys
import os
import re
import gensim.downloader as api
import nltk
import numpy as np
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from symspellpy import SymSpell, Verbosity
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.datasets import make_classification
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import lightgbm as lgb

# UTILS FUNCTIONS :

In [2]:
# Function to compute the average word vector for a tweet
def get_avg_embedding(tweet, model, vector_size=200):
    words = tweet.split()  # Tokenize by whitespace
    word_vectors = [model[word] for word in words if word in model]
    if not word_vectors:  # If no words in the tweet are in the vocabulary, return a zero vector
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)


sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
sym_spell.load_dictionary("en-80k.txt", term_index=0, count_index=1)
# Correct word
def correct_text(text):
    sug = sym_spell.lookup(text, Verbosity.CLOSEST, max_edit_distance=2)
    if sug:
        return sug[0].term
    else:
        return text


# Basic preprocessing function
def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenization
    words = text.split()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [correct_text(word) for word in words if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)


analyzer = SentimentIntensityAnalyzer()
# Calculate sentiment rate of a text
def get_sentiment_rate(text):
    scores = analyzer.polarity_scores(text)
    return np.abs(scores['compound'])


football_words = ["full time", "goal", "half time", "kick off", "owngoal", "penalty", "match", "red card", "yellow card"]
# Calculate the number of football words in a tweet
def count_football_words(text):
    return sum(word in text for word in football_words)

# PREPROCCESS PART 0 :

In [3]:
print("PREPROCESS PART 0...")
sys.stdout.flush()


os.makedirs("tmp/", exist_ok = True)


# Download some NLP models for processing, optional
nltk.download('stopwords')
nltk.download('wordnet')

# Load GloVe model with Gensim's API
embeddings_model = api.load("glove-twitter-200")  # 200-dimensional GloVe embeddings


print("PREPROCESS PART 0 : OK")
sys.stdout.flush()

PREPROCESS PART 0...


[nltk_data] Downloading package stopwords to /home/hbar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/hbar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


PREPROCESS PART 0 : OK


# PREPROCESS PART 1 :

In [4]:
print("PREPROCESS PART 1...")
sys.stdout.flush()


go = False


if go or not os.path.isfile("tmp/processing1.csv"):
    # Read all training files and concatenate them into one dataframe
    li = []
    for filename in os.listdir("train_tweets"):
        df = pd.read_csv("train_tweets/" + filename)
        li.append(df)
    df = pd.concat(li, ignore_index=True)

    # Apply preprocessing to each tweet
    df['Tweet'] = df['Tweet'].apply(preprocess_text)

    df.to_csv("tmp/processing1.csv", index=False, encoding="utf-8")
else:
    df = pd.read_csv("tmp/processing1.csv")


print("PREPROCESS PART 1 : OK")
sys.stdout.flush()

PREPROCESS PART 1...
PREPROCESS PART 1 : OK


# PREPROCESS PART 2 :

In [5]:
print("PREPROCESS PART 2...")
sys.stdout.flush()


vector_size = 200  # Adjust based on the chosen GloVe model
go = False


if go or not os.path.isfile("tmp/X.npy") and not os.path.isfile("tmp/y.npy"):
    # Apply preprocessing to each tweet and obtain vectors
    tweet_vectors = np.vstack([get_avg_embedding(tweet, embeddings_model, vector_size) for tweet in df['Tweet']])
    tweet_df = pd.DataFrame(tweet_vectors)

    # Attach the vectors into the original dataframe
    period_features = pd.concat([df, tweet_df], axis=1)
    period_features = df

    ##
    ##

    # Ajouter une colonne contenant le nombre de tweets par PeriodID
    period_features['TweetCount'] = period_features.groupby(['MatchID', 'PeriodID', 'ID'])['Tweet'].transform('size').fillna(0)
    period_features['TweetCount'] = period_features['TweetCount'] / period_features['TweetCount'].max()

    # Ajouter une colonne contenant le nombre de mots liés au foot par tweet
    period_features['FootballWordCount'] = period_features['Tweet'].apply(count_football_words).fillna(0)
    period_features['FootballWordCount'] = period_features['FootballWordCount'] / period_features['FootballWordCount'].max()

    # Ajouter une colonne contenant le score de sentiment
    period_features['Sentiment'] = period_features['Tweet'].apply(get_sentiment_rate).fillna(0)

    ##
    ##

    # Drop the columns that are not useful anymore
    period_features = period_features.drop(columns=['Timestamp', 'Tweet'])

    # Group the tweets into their corresponding periods. This way we generate an average embedding vector for each period
    period_features = period_features.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()

    # We drop the non-numerical features and keep the embeddings values for each period
    X = period_features.drop(columns=['EventType', 'MatchID', 'PeriodID', 'ID']).values
    # We extract the labels of our training samples
    y = period_features['EventType'].values

    np.save("tmp/X.npy", X)
    np.save("tmp/y.npy", y)
else:
    X = np.load("tmp/X.npy")
    y = np.load("tmp/y.npy")


print("PREPROCESS PART 2 : OK")
sys.stdout.flush()

PREPROCESS PART 2...
PREPROCESS PART 2 : OK


# Evaluating on a test set :

In [6]:
# We split our data into a training and test set that we can use to train our classifier without fine-tuning into the
# validation set and without submitting too many times into Kaggle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
# clf = LogisticRegression(random_state=42, max_iter=1000)
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)
# print("Test set: ", accuracy_score(y_test, y_pred))

In [8]:
# clf = SVC(kernel="rbf", random_state=42)
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)
# print("Test set: ", accuracy_score(y_test, y_pred))

# clf = SVC(C=0.5, kernel="poly", degree=7, random_state=42)
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)
# print("Test set: ", accuracy_score(y_test, y_pred))

In [9]:
clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Test set: ", accuracy_score(y_test, y_pred))

clf = RandomForestClassifier(random_state=42, n_estimators=170, max_depth=10, max_features='sqrt')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Test set: ", accuracy_score(y_test, y_pred))

Test set:  0.6651090342679128
Test set:  0.6573208722741433


In [10]:
clf = XGBClassifier(random_state=42, n_estimators=50)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Test set: ", accuracy_score(y_test, y_pred))

clf = XGBClassifier(random_state=42, n_estimators=170, eval_metric="logloss")
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Test set: ", accuracy_score(y_test, y_pred))

clf = XGBClassifier(random_state=42, n_estimators=170, learning_rate=0.1, max_depth=6, subsample=1, eval_metric="logloss", booster="gbtree")
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Test set: ", accuracy_score(y_test, y_pred))

clf = XGBClassifier(random_state=42, n_estimators=195, learning_rate=0.2, max_depth=3, subsample=1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)          
print("Test set: ", accuracy_score(y_test, y_pred))

clf = XGBClassifier(random_state=42, n_estimators=140, learning_rate=0.2, max_depth=3, subsample=1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Test set: ", accuracy_score(y_test, y_pred))

Test set:  0.6386292834890965
Test set:  0.6137071651090342
Test set:  0.6526479750778816
Test set:  0.6464174454828661
Test set:  0.6573208722741433


In [11]:
clf = RandomForestClassifier(random_state=42, n_estimators=205, max_depth=7, max_features='sqrt')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Test set: ", accuracy_score(y_test, y_pred))

Test set:  0.6697819314641744


In [12]:
clf = XGBClassifier(random_state=42, n_estimators=80, learning_rate=0.05, max_depth=4, subsample=1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)          
print("Test set: ", accuracy_score(y_test, y_pred))

Test set:  0.6682242990654206


# Combine Evaluation on a test set : Méthode 1

In [18]:
# Modèles de base
rf = RandomForestClassifier(random_state=42, n_estimators=170, max_depth=10, max_features='sqrt')
xgb = XGBClassifier(random_state=42, n_estimators=170, learning_rate=0.1, max_depth=6, subsample=1, eval_metric="logloss", booster="gbtree")
base_models = [ ('rf', rf), ('xgb', xgb) ]

# Méta-modèle
meta_model = LogisticRegression()

# Stacking
stack = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)
stack.fit(X_train, y_train)

# Évaluation
stack_pred = stack.predict(X_test)
print("Accuracy (Stacking):", accuracy_score(y_test, stack_pred))

Accuracy (Stacking): 0.660436137071651


In [19]:
# Modèles de base
rf = RandomForestClassifier(random_state=42, n_estimators=100)
xgb = XGBClassifier(random_state=42, n_estimators=100)
base_models = [ ('rf', rf), ('xgb', xgb) ]

# Méta-modèle
meta_model = LogisticRegression()

# Stacking
stack = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)
stack.fit(X_train, y_train)

# Évaluation
stack_pred = stack.predict(X_test)
print("Accuracy (Stacking):", accuracy_score(y_test, stack_pred))

Accuracy (Stacking): 0.6666666666666666


In [25]:
# Modèles de base
rf = RandomForestClassifier(random_state=42, n_estimators=100)
xgb = XGBClassifier(random_state=42, n_estimators=100)
lr = LogisticRegression(random_state=42, max_iter=1000)
svc =  SVC(kernel="rbf", random_state=42)
base_models = [ ('rf', rf), ('xgb', xgb), ('lr', lr), ('svc', svc) ]

# Méta-modèle
meta_model = LogisticRegression()

# Stacking
stack = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)
stack.fit(X_train, y_train)

# Évaluation
stack_pred = stack.predict(X_test)
print("Accuracy (Stacking):", accuracy_score(y_test, stack_pred))

Accuracy (Stacking): 0.6853582554517134


In [20]:
# Modèles de base
rf = RandomForestClassifier(random_state=42, n_estimators=205, max_depth=7, max_features='sqrt')
xgb = XGBClassifier(random_state=42, n_estimators=80, learning_rate=0.05, max_depth=4, subsample=1)
base_models = [ ('rf', rf), ('xgb', xgb) ]

# Méta-modèle
meta_model = LogisticRegression()

# Stacking
stack = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)
stack.fit(X_train, y_train)

# Évaluation
stack_pred = stack.predict(X_test)
print("Accuracy (Stacking):", accuracy_score(y_test, stack_pred))

Accuracy (Stacking): 0.6713395638629284


In [24]:
# Modèles de base
rf = RandomForestClassifier(random_state=42, n_estimators=205, max_depth=7, max_features='sqrt')
xgb = XGBClassifier(random_state=42, n_estimators=80, learning_rate=0.05, max_depth=4, subsample=1)
base_models = [ ('rf', rf), ('xgb', xgb) ]

# Méta-modèle
meta_model = SVC()

# Stacking
stack = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)
stack.fit(X_train, y_train)

# Évaluation
stack_pred = stack.predict(X_test)
print("Accuracy (Stacking):", accuracy_score(y_test, stack_pred))

Accuracy (Stacking): 0.6728971962616822


In [26]:
# Modèles de base
rf = RandomForestClassifier(random_state=42, n_estimators=205, max_depth=7, max_features='sqrt')
xgb = XGBClassifier(random_state=42, n_estimators=80, learning_rate=0.05, max_depth=4, subsample=1)
lr = LogisticRegression(random_state=42, max_iter=1000)
svc =  SVC(kernel="rbf", random_state=42)
base_models = [ ('rf', rf), ('xgb', xgb), ('lr', lr), ('svc', svc) ]

# Méta-modèle
meta_model = SVC()

# Stacking
stack = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)
stack.fit(X_train, y_train)

# Évaluation
stack_pred = stack.predict(X_test)
print("Accuracy (Stacking):", accuracy_score(y_test, stack_pred))

Accuracy (Stacking): 0.67601246105919


In [70]:
# Modèles de base
rf = RandomForestClassifier(random_state=42, n_estimators=100)
xgb = XGBClassifier(random_state=42, n_estimators=100)
lr = LogisticRegression(random_state=42, max_iter=1000)
svc =  SVC(kernel="rbf", random_state=42)
base_models = [ ('rf', rf), ('xgb', xgb), ('lr', lr), ('svc', svc) ]
    
# Méta-modèle
meta_model = LogisticRegression()

# Find best cv
max_score = 0
max_cv = 0

for i in range(2, 21):
    # Stacking
    stack = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=i)
    stack.fit(X_train, y_train)
    
    # Évaluation
    stack_pred = stack.predict(X_test)
    accuracy = accuracy_score(y_test, stack_pred)

    print(accuracy)

    # Udapte ?
    if max_score < accuracy:
        max_score = accuracy
        max_cv = i

# Print result
print("Accuracy (Stacking):", max_score, "with cv =", max_cv)

0.6915887850467289
0.6962616822429907
0.6869158878504673
0.6853582554517134
0.6791277258566978
0.6853582554517134
0.6853582554517134
0.6869158878504673
0.6900311526479751
0.6838006230529595
0.6931464174454829
0.6915887850467289
0.6947040498442367
0.677570093457944
0.6915887850467289
0.6838006230529595
0.6838006230529595
0.6791277258566978
0.6853582554517134
Accuracy (Stacking): 0.6962616822429907 with cv = 3


# Combine Evaluation on a test set : Méthode 2

In [38]:
# Initialisation des modèles
# Modèles de base
rf = RandomForestClassifier(random_state=42, n_estimators=100)
xgb = XGBClassifier(random_state=42, n_estimators=100)
lr = LogisticRegression(random_state=42, max_iter=1000)
voting_clf = VotingClassifier(estimators=[
    ('rf', rf),
    ('xgb', xgb),
    ('lr', lr)
], voting='soft') # 'soft' pour utiliser les probabilités, 'hard' pour le vote majoritaire 

voting_clf.fit(X_train, y_train)

# Prédictions
voting_pred = voting_clf.predict(X_test)
print("Accuracy (Voting):", accuracy_score(y_test, voting_pred))

Accuracy (Voting): 0.6526479750778816


# Combine Evaluation on a test set : Méthode 3

In [59]:
# Pred des modèles
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)
rf_pred = rf.predict_proba(X_test)[:, 1]

xgb = XGBClassifier(random_state=42, n_estimators=100)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict_proba(X_test)[:, 1]

lr = LogisticRegression(random_state=42, max_iter=1000)
lr.fit(X_train, y_train)
lr_pred = lr.predict_proba(X_test)[:, 1]

# Poids pour les modèles
weight_rf = 0.3
weight_xgb = 0.3
weight_lr = 0.4

# Moyenne pondérée des prédictions
ensemble_pred_weighted = (weight_rf * rf_pred + weight_xgb * xgb_pred + weight_lr * lr_pred)
final_pred_weighted = (ensemble_pred_weighted >= 0.5).astype(int)

# Évaluation
print("Accuracy (Weighted Average):", accuracy_score(y_test, final_pred_weighted))

Accuracy (Weighted Average): 0.6573208722741433


# Combine Evaluation on a test set : Méthode 4 (réseau de neurones)

In [80]:
# Initialisation des modèles de base
rf = RandomForestClassifier(n_estimators=100, random_state=42)
xgb = XGBClassifier(n_estimators=100, random_state=42)
lr = LogisticRegression(random_state=42, max_iter=1000)

# Placeholder pour les prédictions issues de la validation croisée
kf = KFold(n_splits=5, shuffle=True, random_state=42)
train_meta = np.zeros((X_train.shape[0], 3))  # Une colonne par modèle
test_meta = np.zeros((X_test.shape[0], 3))    # Une colonne par modèle

# Validation croisée pour les modèles de base
for train_idx, val_idx in kf.split(X_train):
    X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]
    
    # Random Forest
    rf.fit(X_train_fold, y_train_fold)
    train_meta[val_idx, 0] = rf.predict_proba(X_val_fold)[:, 1]
    test_meta[:, 0] += rf.predict_proba(X_test)[:, 1] / kf.n_splits
    
    # XGBoost
    xgb.fit(X_train_fold, y_train_fold)
    train_meta[val_idx, 1] = xgb.predict_proba(X_val_fold)[:, 1]
    test_meta[:, 1] += xgb.predict_proba(X_test)[:, 1] / kf.n_splits

    # LogisticRegression
    lr.fit(X_train_fold, y_train_fold)
    train_meta[val_idx, 2] = lr.predict_proba(X_val_fold)[:, 1]
    test_meta[:, 2] += lr.predict_proba(X_test)[:, 1] / kf.n_splits



# Construction du méta-modèle (réseau de neurones)
meta_model = Sequential([
    Dense(64, activation='relu', input_dim=train_meta.shape[1]),  # Couche cachée
    Dense(32, activation='relu'),                               # Couche cachée
    Dense(1, activation='sigmoid')                              # Couche de sortie (binaire)
])

# Compilation du réseau de neurones
meta_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Entraînement
meta_model.fit(train_meta, y_train, epochs=10, batch_size=16, verbose=1)

# Prédictions sur le jeu de test
final_pred = (meta_model.predict(test_meta) > 0.5).astype(int)

# Évaluation
print("Accuracy (Méta-modèle - Réseau de neurones) :", accuracy_score(y_test, final_pred))


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.5580 - loss: 0.6777  
Epoch 2/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6136 - loss: 0.6590
Epoch 3/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6525 - loss: 0.6358
Epoch 4/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6349 - loss: 0.6394
Epoch 5/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6311 - loss: 0.6426
Epoch 6/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6719 - loss: 0.6247
Epoch 7/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6292 - loss: 0.6472
Epoch 8/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6509 - loss: 0.6391
Epoch 9/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

# Combine Evaluation on a test set : Méthode 5

In [85]:
# Initialisation des modèles de base
rf = RandomForestClassifier(n_estimators=100, random_state=42)
xgb = XGBClassifier(n_estimators=100, random_state=42)
lr = LogisticRegression(random_state=42)

# Placeholder pour les prédictions issues de la validation croisée
kf = KFold(n_splits=5, shuffle=True, random_state=42)
train_meta = np.zeros((X_train.shape[0], 3))  # Une colonne par modèle de base
test_meta = np.zeros((X_test.shape[0], 3))    # Une colonne par modèle de base

# Validation croisée pour les modèles de base
for train_idx, val_idx in kf.split(X_train):
    X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]
    
    # Random Forest
    rf.fit(X_train_fold, y_train_fold)
    train_meta[val_idx, 0] = rf.predict_proba(X_val_fold)[:, 1]
    test_meta[:, 0] += rf.predict_proba(X_test)[:, 1] / kf.n_splits
    
    # XGBoost
    xgb.fit(X_train_fold, y_train_fold)
    train_meta[val_idx, 1] = xgb.predict_proba(X_val_fold)[:, 1]
    test_meta[:, 1] += xgb.predict_proba(X_test)[:, 1] / kf.n_splits

    # Logistic
    lr.fit(X_train, y_train)
    train_meta[:, 2] = lr.predict_proba(X_train)[:, 1]
    test_meta[:, 2] = lr.predict_proba(X_test)[:, 1] / kf.n_splits



# Création du Dataset LightGBM
lgb_train = lgb.Dataset(train_meta, label=y_train)

# Paramètres de LightGBM
lgb_params = {
    'objective': 'binary',
    'metric': 'binary_error',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'n_estimators': 100
}

# Entraînement du méta-modèle
meta_model = lgb.train(lgb_params, lgb_train, num_boost_round=100)

# Prédictions sur le jeu de test
final_pred = (meta_model.predict(test_meta) > 0.5).astype(int)

# Évaluation
print("Accuracy (Méta-modèle - LightGBM):", accuracy_score(y_test, final_pred))


[LightGBM] [Info] Number of positive: 811, number of negative: 684
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000043 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 605
[LightGBM] [Info] Number of data points in the train set: 1495, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.542475 -> initscore=0.170310
[LightGBM] [Info] Start training from score 0.170310
Accuracy (Méta-modèle - LightGBM): 0.6292834890965732




# For Kaggle submission :

In [102]:
print("KAGGLE...")
sys.stdout.flush()


# This time we train our classifier on the full dataset that it is available to us.

clf = XGBClassifier(random_state=42, n_estimators=170, learning_rate=0.1, max_depth=6, subsample=1, eval_metric="logloss", booster="gbtree")
clf.fit(X, y)
predictions = []


# We read each file separately, we preprocess the tweets and then use the classifier to predict the labels.
# Finally, we concatenate all predictions into a list that will eventually be concatenated and exported
# to be submitted on Kaggle.
for fname in os.listdir("eval_tweets"):
    val_df = pd.read_csv("eval_tweets/" + fname)
    val_df['Tweet'] = val_df['Tweet'].apply(preprocess_text)

    tweet_vectors = np.vstack([get_avg_embedding(tweet, embeddings_model, vector_size) for tweet in val_df['Tweet']])
    tweet_df = pd.DataFrame(tweet_vectors)

    period_features = pd.concat([val_df, tweet_df], axis=1)
    period_features = val_df

    ###
    period_features['TweetCount'] = period_features.groupby(['MatchID', 'PeriodID', 'ID'])['Tweet'].transform('size').fillna(0)
    period_features['TweetCount'] = period_features['TweetCount'] / period_features['TweetCount'].max()

    period_features['FootballWordCount'] = period_features['Tweet'].apply(count_football_words).fillna(0)
    period_features['FootballWordCount'] = period_features['FootballWordCount'] / period_features['FootballWordCount'].max()

    period_features['Sentiment'] = period_features['Tweet'].apply(get_sentiment_rate).fillna(0)
    ###

    period_features = period_features.drop(columns=['Timestamp', 'Tweet'])
    period_features = period_features.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()
    X_pred = period_features.drop(columns=['MatchID', 'PeriodID', 'ID']).values

    preds = clf.predict(X_pred)
    period_features['EventType'] = preds
    predictions.append(period_features[['ID', 'EventType']])

pred_df = pd.concat(predictions)
pred_df.to_csv('predictions.csv', index=False)


print("KAGGLE : OK")
sys.stdout.flush()

KAGGLE...
KAGGLE : OK
