In [78]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import OneHotEncoder
import gensim.downloader as api
import re
import swifter
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import GlorotUniform, Orthogonal
import random
# Enable tqdm for pandas
tqdm.pandas()

# ---------------------------------------------------------------------------
# Reproducibility
# ---------------------------------------------------------------------------
SEED = 42

# Environment switches are set BEFORE any TensorFlow device query below, so
# that they are in place when the GPU runtime is first touched.
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so assigning it
# here can only influence child processes, not this kernel. To fix string
# hashing for the notebook itself, export it before launching Jupyter.
os.environ['PYTHONHASHSEED'] = str(SEED)
os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # Limit to one GPU if using multiple GPUs

# Seeds Python's `random`, NumPy and TensorFlow in a single call, so separate
# random.seed / np.random.seed / tf.random.set_seed calls are not needed.
tf.keras.utils.set_random_seed(SEED)

# Ask TensorFlow for deterministic kernels (applies to CPU and GPU alike, so a
# single call is enough).
tf.config.experimental.enable_op_determinism()

# Let GPU memory grow on demand instead of reserving it all up front.
# Memory growth can only be changed before the GPU has been initialised; when
# this cell is re-run in a live kernel TensorFlow raises RuntimeError, which is
# reported instead of aborting the whole setup cell.
for gpu in tf.config.list_physical_devices('GPU'):
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as err:
        print(f"Could not enable memory growth for {gpu.name}: {err}")

In [69]:
# Fetch the pretrained embeddings through gensim's downloader API.
# The identifier below selects the 200-dimensional GloVe Twitter vectors.
GLOVE_MODEL_NAME = "glove-twitter-200"
glove_model = api.load(GLOVE_MODEL_NAME)

In [79]:
# Function to compute the average word vector for a tweet
def get_avg_embedding(tweet, model, vector_size=200):
    """Return the unweighted mean embedding of a tweet's in-vocabulary words.

    Args:
        tweet: Text to embed; it is split on whitespace.
        model: Mapping-like object supporting ``word in model`` and
            ``model[word]`` (here: the loaded GloVe vectors).
        vector_size: Length of the zero vector returned when no word of the
            tweet is present in ``model``.

    Returns:
        The element-wise mean of the found word vectors, or
        ``np.zeros(vector_size)`` if none of the words is known.
    """
    known_vectors = []
    for token in tweet.split():
        if token in model:
            known_vectors.append(model[token])

    # Nothing to average: fall back to an all-zero embedding
    if len(known_vectors) == 0:
        return np.zeros(vector_size)

    return np.mean(known_vectors, axis=0)

# Preprocessing function
# Lazily-built NLTK resources shared by every `preprocess_text` call.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise one tweet into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase, strip every character that is neither a word
    character nor whitespace, strip digit runs, split on whitespace, drop
    English stop words, lemmatize each remaining token.

    Args:
        text: Raw tweet text. Anything that is not a ``str`` (for example the
            NaN pandas produces for an empty CSV cell) is treated as an empty
            tweet instead of raising ``AttributeError``.

    Returns:
        The cleaned tweet, or ``''`` when nothing is left after cleaning.
    """
    if not isinstance(text, str):
        return ''

    # Lowercasing
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenization
    words = text.split()
    if not words:
        return ''

    # The stop-word list and the lemmatizer are identical for every tweet, so
    # they are built once and reused rather than re-created on each call.
    stop_words, lemmatizer = _get_preprocess_resources()

    # Remove stopwords, then lemmatize what is left
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

In [None]:
# Load data
folder_path = "train_tweets"

# os.listdir returns entries in arbitrary order, so the listing is sorted to
# make "the first file" the same file on every run and every machine.
csv_files = sorted(
    os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith(".csv")
)[:1]  # Only use first file for testing purposes

# Fail with a clear message instead of pandas' generic "No objects to concatenate"
if not csv_files:
    raise FileNotFoundError(f"No .csv files found in {folder_path!r}")

df = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)

# Apply preprocessing (replaces the raw 'Tweet' column with the cleaned text)
df['Tweet'] = df['Tweet'].swifter.apply(preprocess_text)
print(df)

Pandas Apply: 100%|██████████| 86843/86843 [00:16<00:00, 5303.80it/s]


<bound method NDFrame.head of           ID  MatchID  PeriodID  EventType      Timestamp  \
0        2_0        2         0          0  1403538600000   
1        2_0        2         0          0  1403538600000   
2        2_0        2         0          0  1403538600000   
3        2_0        2         0          0  1403538600000   
4        2_0        2         0          0  1403538600000   
...      ...      ...       ...        ...            ...   
86838  2_129        2       129          1  1403546400000   
86839  2_129        2       129          1  1403546400000   
86840  2_129        2       129          1  1403546400000   
86841  2_129        2       129          1  1403546400000   
86842  2_129        2       129          1  1403546400000   

                                                   Tweet  
0      rt soccerdotcom esp beat au well give away spa...  
1      visit sitep official web site httptcoehzkslan ...  
2      rt soccerdotcom esp beat au well give away spa...  
3

In [81]:
# Show the preprocessed tweets again (pandas abbreviates long frames when printed)
print(df)

          ID  MatchID  PeriodID  EventType      Timestamp  \
0        2_0        2         0          0  1403538600000   
1        2_0        2         0          0  1403538600000   
2        2_0        2         0          0  1403538600000   
3        2_0        2         0          0  1403538600000   
4        2_0        2         0          0  1403538600000   
...      ...      ...       ...        ...            ...   
86838  2_129        2       129          1  1403546400000   
86839  2_129        2       129          1  1403546400000   
86840  2_129        2       129          1  1403546400000   
86841  2_129        2       129          1  1403546400000   
86842  2_129        2       129          1  1403546400000   

                                                   Tweet  
0      rt soccerdotcom esp beat au well give away spa...  
1      visit sitep official web site httptcoehzkslan ...  
2      rt soccerdotcom esp beat au well give away spa...  
3      rt worldsoccershop winne

In [6]:
# Feature creation: three per-tweet features derived from the cleaned text
cleaned_tweets = df['Tweet']

# Character length of each cleaned tweet
df['TweetLength'] = cleaned_tweets.map(len)

# Number of rows that share this row's (MatchID, PeriodID, Timestamp) triple
timestamp_groups = df.groupby(['MatchID', 'PeriodID', 'Timestamp'])
df['TweetCount'] = timestamp_groups['Timestamp'].transform('count')

# Number of whitespace-separated tokens in each cleaned tweet
df['WordCount'] = cleaned_tweets.map(lambda text: len(text.split()))

print(df)

             ID  MatchID  PeriodID  EventType      Timestamp  \
0           2_0        2         0          0  1403538600000   
1           2_0        2         0          0  1403538600000   
2           2_0        2         0          0  1403538600000   
3           2_0        2         0          0  1403538600000   
4           2_0        2         0          0  1403538600000   
...         ...      ...       ...        ...            ...   
5056045  17_129       17       129          1  1403805600000   
5056046  17_129       17       129          1  1403805600000   
5056047  17_129       17       129          1  1403805600000   
5056048  17_129       17       129          1  1403805600000   
5056049  17_129       17       129          1  1403805600000   

                                                     Tweet  TweetLength  \
0        rt soccerdotcom esp beat au well give away spa...          104   
1        visit sitep official web site httptcoehzkslan ...           95   
2     

In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Compute TF-IDF weights for the corpus

# Optional: start from zero and fit on tweets
# vectorizer = TfidfVectorizer(max_features=10000)
# vectorizer.fit(df['Tweet'])


# Load the pre-fitted vectorizer from disk.
# NOTE(review): unpickling can execute arbitrary code; only load a pickle that
# was produced by this project.
with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

# Map every vocabulary term to its IDF value
vocabulary_terms = vectorizer.get_feature_names_out()
tfidf_weights = {term: idf for term, idf in zip(vocabulary_terms, vectorizer.idf_)}

# Weighted average embeddings
def get_weighted_avg_embedding(tweet, model, vector_size=200, weights=tfidf_weights):
    """Return the mean of a tweet's word vectors after scaling each by its weight.

    Every in-vocabulary word vector is multiplied by ``weights[word]`` (or by 1
    when the word has no entry), and the scaled vectors are then averaged.
    Note that the sum is divided by the number of vectors, not by the sum of
    the weights.

    Args:
        tweet: Text to embed; it is split on whitespace.
        model: Mapping-like object supporting ``word in model`` and
            ``model[word]``.
        vector_size: Length of the zero vector returned when no word of the
            tweet is present in ``model``.
        weights: Mapping from word to scaling factor. The default is the
            ``tfidf_weights`` dict as it existed when this function was defined.

    Returns:
        The averaged, weight-scaled embedding, or ``np.zeros(vector_size)``
        if none of the words is known to ``model``.
    """
    scaled_vectors = []
    for token in tweet.split():
        if token not in model:
            continue
        scaled_vectors.append(model[token] * weights.get(token, 1))

    # No known words at all: fall back to an all-zero embedding
    if len(scaled_vectors) == 0:
        return np.zeros(vector_size)

    return np.mean(scaled_vectors, axis=0)


In [33]:
# Generate embeddings for each tweet, or reuse the cached copy when it exists.
# The computation used to be commented out, which made this cell fail on any
# machine that did not already have the pickle.
TWEET_VECTORS_PATH = "tweet_vectors.pkl"

if os.path.exists(TWEET_VECTORS_PATH):
    # Load the tweet vectors
    # NOTE(review): unpickling can execute arbitrary code; only load a cache
    # written by this notebook.
    with open(TWEET_VECTORS_PATH, "rb") as f:
        loaded_tweet_vectors = pickle.load(f)
    print("Embeddings loaded successfully!")
else:
    # One TF-IDF-scaled GloVe vector per tweet (200 = GloVe embedding dimension)
    tweet_vectors = df['Tweet'].swifter.apply(lambda tweet: get_weighted_avg_embedding(tweet, model=glove_model, vector_size=200, weights=tfidf_weights))
    loaded_tweet_vectors = np.array(list(tweet_vectors), dtype=np.float32)

    # Save the tweet vectors so later runs can skip the computation
    with open(TWEET_VECTORS_PATH, "wb") as f:
        pickle.dump(loaded_tweet_vectors, f)
    print("Embeddings saved successfully!")

print("Loaded vectors shape:", loaded_tweet_vectors.shape)



Embeddings loaded successfully!
Loaded vectors shape: (5056050, 200)


In [54]:
###### Per-period features: load the cached pickle, or build it when missing ######
import pickle

PERIOD_FEATURES_PATH = "period_features.pkl"

if os.path.exists(PERIOD_FEATURES_PATH):
    # Load the cached period features
    # NOTE(review): unpickling can execute arbitrary code; only load a cache
    # written by this notebook.
    with open(PERIOD_FEATURES_PATH, "rb") as f:
        loaded_period_features = pickle.load(f)
    print("Period features loaded successfully!")
else:
    tweet_df = pd.DataFrame(loaded_tweet_vectors)

    # The column-wise concat below aligns on the row index; a stale embedding
    # cache with a different row count would silently produce NaN-padded rows.
    if len(tweet_df) != len(df):
        raise ValueError(
            f"Embedding rows ({len(tweet_df)}) do not match tweet rows ({len(df)}); "
            "regenerate tweet_vectors.pkl from the currently loaded tweets."
        )

    # Attach the vectors into the original dataframe
    period_features = pd.concat([df, tweet_df], axis=1)

    # Drop the columns that are not useful anymore
    period_features = period_features.drop(columns=['Timestamp', 'Tweet'])
    print("Per-tweet feature matrix shape:", period_features.shape)

    # Group the tweets into their corresponding periods. This way we generate
    # an average embedding vector (and average features) for each period.
    loaded_period_features = period_features.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()

    # Save the period features so later runs can skip the computation
    with open(PERIOD_FEATURES_PATH, "wb") as f:
        pickle.dump(loaded_period_features, f)
    print("Period features saved successfully!")

print("Loaded vectors shape:", loaded_period_features.shape)
print(loaded_period_features)

Period features loaded successfully!
Loaded vectors shape: (2137, 207)
      MatchID  PeriodID      ID  EventType  TweetLength  TweetCount  \
0           0         0     0_0        0.0    90.829167    5.041667   
1           0         1     0_1        0.0    85.770053    4.358289   
2           0         2     0_2        0.0    87.784810    4.881857   
3           0         3     0_3        0.0    85.276451    6.440273   
4           0         4     0_4        0.0    89.645309   13.732265   
...       ...       ...     ...        ...          ...         ...   
2132       19       125  19_125        1.0    77.492706   76.553492   
2133       19       126  19_126        1.0    79.686723   85.419807   
2134       19       127  19_127        1.0    79.435999   84.739234   
2135       19       128  19_128        1.0    79.569395   75.923932   
2136       19       129  19_129        1.0    84.503898   75.678102   

      WordCount         0         1         2  ...       190       191  \
0 

In [65]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


# Drop the label and the identifier columns; every remaining column is used as
# a model feature.
X = loaded_period_features.drop(columns=['EventType', 'MatchID', 'ID']).values

# Extract labels
y = loaded_period_features['EventType'].values

# One-hot encode labels (downstream cells recover class ids with argmax)
encoder = OneHotEncoder(sparse_output=False)
y_encoded = encoder.fit_transform(y.reshape(-1, 1))

# Train-test split on the RAW features. The split depends only on the number
# of rows and the seed, so the same rows land in each partition as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=SEED)

# Standardize features. The scaler is fitted on the training rows only, so
# test-set statistics no longer leak into training; the test rows are
# transformed with the training mean and standard deviation.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for backward compatibility: all rows on the training scale
X_scaled = scaler.transform(X)

# Print shapes for verification
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)


X_train shape: (1709, 204)
X_test shape: (428, 204)


In [77]:
from sklearn.model_selection import GridSearchCV
# Imported here as well: this cell uses XGBClassifier, which was otherwise only
# imported by a later cell and therefore undefined on a fresh top-to-bottom run.
from xgboost import XGBClassifier

# Define parameter grid (3 * 2 * 2 * 2 * 2 = 48 candidates)
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# The targets are stored one-hot encoded; the classifier needs integer class ids
y_train_labels = y_train.argmax(axis=1)

# Perform grid search: 5-fold cross-validation, selecting on F1.
# `random_state` is the scikit-learn-style seed parameter of XGBClassifier.
grid_search = GridSearchCV(
    XGBClassifier(objective='binary:logistic', random_state=SEED),
    param_grid, cv=5, scoring='f1', verbose=1
)
grid_search.fit(X_train, y_train_labels)

print("Best parameters:", grid_search.best_params_)


Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 1.0}


In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score

# The targets are stored one-hot encoded; recover the integer class ids once
# instead of recomputing the argmax at every use.
y_train_labels = y_train.argmax(axis=1)
y_test_labels = y_test.argmax(axis=1)

# Calculate scale_pos_weight (ratio of negative to positive training samples)
num_pos = int(np.sum(y_train_labels == 1))
num_neg = int(np.sum(y_train_labels == 0))
if num_pos == 0:
    raise ValueError("Training split contains no positive samples; cannot compute scale_pos_weight")
scale_pos_weight = num_neg / num_pos

# Initialize XGBoost model with regularization.
# NOTE(review): the recorded grid search reported learning_rate=0.1 as best,
# while 0.01 is used here; confirm this difference is intentional.
xgb_model = XGBClassifier(
    objective='binary:logistic',
    max_depth=5,
    learning_rate=0.01,
    n_estimators=200,
    scale_pos_weight=scale_pos_weight,
    reg_alpha=0.5,  # L1 regularization
    reg_lambda=1,   # L2 regularization
    random_state=SEED  # scikit-learn-style seed parameter of XGBClassifier
)

# Perform cross-validation (5 folds, on the training split only)
cv_scores = cross_val_score(
    xgb_model, X_train, y_train_labels, cv=5, scoring='accuracy'
)

# Print cross-validation results
print(f"Cross-validation Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Train the model on the entire training set
xgb_model.fit(X_train, y_train_labels)

# Evaluate the model on the held-out test set
y_pred = xgb_model.predict(X_test)
test_accuracy = accuracy_score(y_test_labels, y_pred)

print(f"Test Accuracy: {test_accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test_labels, y_pred))

Cross-validation Accuracy: 0.7326 ± 0.0238
Test Accuracy: 0.7897
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.81      0.79       203
           1       0.82      0.77      0.79       225

    accuracy                           0.79       428
   macro avg       0.79      0.79      0.79       428
weighted avg       0.79      0.79      0.79       428



In [73]:
###### For Kaggle submission

import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# Initialize lists for storing predictions
predictions = []

EVAL_DIR = "eval_tweets"

# Loop through evaluation tweets files (sorted for a stable output order;
# anything that is not a CSV, e.g. hidden OS files, is skipped)
for fname in sorted(os.listdir(EVAL_DIR)):
    if not fname.endswith(".csv"):
        continue
    val_df = pd.read_csv(os.path.join(EVAL_DIR, fname))

    # Preprocess tweets
    val_df['Tweet'] = val_df['Tweet'].swifter.apply(preprocess_text)

    # Feature creation (same three features as for the training data)
    val_df['TweetLength'] = val_df['Tweet'].apply(len)
    val_df['TweetCount'] = val_df.groupby(['MatchID', 'PeriodID', 'Timestamp'])['Timestamp'].transform('count')
    val_df['WordCount'] = val_df['Tweet'].apply(lambda x: len(x.split()))

    # Generate tweet embeddings
    tweet_vectors = val_df['Tweet'].swifter.apply(lambda tweet: get_weighted_avg_embedding(tweet, model=glove_model, vector_size=200, weights=tfidf_weights))
    tweet_vectors = np.array(list(tweet_vectors), dtype=np.float32)

    # Combine embeddings with features and average them per period
    tweet_df = pd.DataFrame(tweet_vectors)
    period_features_val = pd.concat([val_df, tweet_df], axis=1)
    period_features_val = period_features_val.drop(columns=['Timestamp', 'Tweet'])
    period_features_val = period_features_val.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()

    # Prepare input features for the model. Local names are used so the
    # training-time `X`, `X_scaled` and `scaler` globals are not overwritten.
    X_val = period_features_val.drop(columns=['MatchID', 'ID']).values

    # Standardize with the scaler fitted during training. Fitting a fresh
    # scaler on each evaluation file would put the features on a different
    # scale from the one the model was trained on.
    X_val_scaled = scaler.transform(X_val)

    # Predict using the XGBoost model
    preds = xgb_model.predict(X_val_scaled)

    # Store predictions
    period_features_val['EventType'] = preds
    predictions.append(period_features_val[['ID', 'EventType']])

# Concatenate all predictions and export to CSV
pred_df = pd.concat(predictions)
pred_df.to_csv('XGBoost_predictions.csv', index=False)

Pandas Apply: 100%|██████████| 285804/285804 [00:47<00:00, 5991.43it/s]
Pandas Apply: 100%|██████████| 285804/285804 [00:15<00:00, 18258.13it/s]
Pandas Apply: 100%|██████████| 45024/45024 [00:07<00:00, 6169.60it/s]
Pandas Apply: 100%|██████████| 45024/45024 [00:02<00:00, 17259.49it/s]
Pandas Apply: 100%|██████████| 113402/113402 [00:18<00:00, 6284.89it/s]
Pandas Apply: 100%|██████████| 113402/113402 [00:05<00:00, 19685.13it/s]
Pandas Apply: 100%|██████████| 628698/628698 [01:39<00:00, 6325.44it/s]
Pandas Apply: 100%|██████████| 628698/628698 [00:31<00:00, 20123.59it/s]
