In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import OneHotEncoder
import gensim.downloader as api
import re
import swifter
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import GlorotUniform, Orthogonal
import random
# Enable tqdm for pandas
tqdm.pandas()

# Ensure Reproducibility
import random
# Set seeds for reproducibility
SEED = 42

# Python's built-in random
random.seed(SEED)

# NumPy
np.random.seed(SEED)

# TensorFlow
tf.random.set_seed(SEED)

# Set Python hash seed
os.environ['PYTHONHASHSEED'] = str(SEED)

# Configure TensorFlow for deterministic operations
tf.keras.utils.set_random_seed(SEED)  # Sets all random seeds for the program (Python, NumPy, and TensorFlow)
tf.config.experimental.enable_op_determinism()  # Enable deterministic operations in TensorFlow

# If using GPU, you might also want to set these:
if tf.config.list_physical_devices('GPU'):
    # Force TensorFlow to use deterministic GPU operations
    tf.config.experimental.enable_op_determinism()
    # Limit GPU memory growth
    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # Limit to one GPU if using multiple GPUs
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'

2024-12-12 09:25:15.723994: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load GloVe model
glove_model = api.load("glove-twitter-200")  # 200-dimensional GloVe embeddings

In [3]:
# Country list for team extraction
country_list = [
    "Argentina", "Belgium", "Germany", "Serbia", "Greece", "IvoryCoast", 
    "Netherlands", "Mexico", "Australia", "Spain", "SouthKorea", 
    "Cameroon", "Brazil", "France", "Nigeria", "Algeria", "USA", 
    "Honduras", "Switzerland", "Croatia", "Chile", "Portugal", 
    "Ghana", "Slovenia"
]

country_variations = {
    'argentina': ['argentina', 'arg', 'argentine', 'argies', 'albiceleste', 'argentinian', 'argentinos', 'argentinas'],
    'australia': ['australia', 'aus', 'aussie', 'aussies', 'socceroos', 'oz', 'straya', 'australian', 'au'],
    'belgium': ['belgium', 'bel', 'belgique', 'belgie', 'belgian', 'belgians', 'red devils', 'diables rouges'],
    'brazil': ['brazil', 'bra', 'brasil', 'bresil', 'brazilian', 'brazilians', 'selecao', 'canarinho', 'verde amarela', 'samba boys'],
    'cameroon': ['cameroon', 'cmr', 'cameroun', 'camerounais', 'indomitable lions', 'lions'],
    'france': ['france', 'fra', 'french', 'les bleus', 'tricolore', 'tricolores', 'equipe de france', 'allez les bleus'],
    'honduras': ['honduras', 'hon', 'honduran', 'hondurans', 'los catrachos', 'catrachos', 'la h'],
    'portugal': ['portugal', 'por', 'portuguese', 'selecao das quinas', 'seleccao', 'navegadores', 'team portugal'],
    'spain': ['spain', 'esp', 'espana', 'espania', 'spanish', 'la roja', 'furia roja', 'la furia', 'la seleccion'],
    'southkorea': ['south korea', 'korea', 'kor', 'skorea', 'korean', 'koreans', 'taeguk warriors', 'warriors'],
    'switzerland': ['switzerland', 'sui', 'suisse', 'schweiz', 'swiss', 'nati', 'rossocrociati', 'a team'],
    'usa': ['usa', 'united states', 'america', 'united states of america', 'us', 'usa', 'usmnt', 'americans', 'american', 'yanks', 'uncle sam', 'stars and stripes', 'team usa'],
    'ghana': ['ghana', 'gha', 'ghanaian', 'ghanaians', 'black stars', 'stars'],
    'netherlands': ['netherlands', 'ned', 'holland', 'dutch', 'oranje', 'flying dutchmen', 'orange', 'clockwork orange', 'nederlands'],
    'germany': ['germany', 'ger', 'alemania', 'deutschland', 'german', 'germans', 'die mannschaft', 'nationalelf', 'deu'],
    'iran': ['iran', 'irn', 'iranian', 'iranians', 'team melli', 'persian stars'],
    'nigeria': ['nigeria', 'nga', 'naija', 'super eagles', 'eagles', 'nigerian', 'nigerians', 'green eagles'],
    'algeria': ['algeria', 'alg', 'algerian', 'algerians', 'fennecs', 'desert foxes', 'les verts'],
    'croatia': ['croatia', 'cro', 'hrvatska', 'hrv', 'croatian', 'croatians', 'vatreni', 'blazers', 'kockasti'],
    'chile': ['chile', 'chi', 'chilean', 'chileans', 'la roja', 'team chile'],
    'slovenia': ['slovenia', 'svn', 'slovenian', 'slovenians', 'slovenski', 'boys'],
    'serbia': ['serbia', 'srb', 'serbian', 'serbians', 'beli orlovi', 'white eagles', 'orlovi'],
    'greece': ['greece', 'gre', 'greek', 'greeks', 'piratiko', 'ethniki', 'galanolefki'],
    'ivorycoast': ['ivory coast', 'civ', 'cote divoire', 'cotedivoire', 'ivorians', 'les elephants', 'elephants', 'ivory'],
    'mexico': ['mexico', 'mex', 'mexiko', 'mexican', 'mexicans', 'el tri', 'tricolor', 'aztecas', 'el tricolor', 'verde']
}

# Define the country code mapping
country_code_mapping = {
    "Argentina": "ar", "Belgium": "be", "Germany": "de", "Serbia": "rs", "Greece": "gr",
    "IvoryCoast": "ci", "Netherlands": "nl", "Mexico": "mx", "Australia": "au", "Spain": "es",
    "SouthKorea": "kr", "Cameroon": "cm", "Brazil": "br", "France": "fr", "Nigeria": "ng",
    "Algeria": "dz", "USA": "us", "Honduras": "hn", "Switzerland": "ch", "Croatia": "hr",
    "Chile": "cl", "Portugal": "pt", "Ghana": "gh", "Slovenia": "si"
}


In [21]:
# Function to compute the average word vector for a tweet
def get_avg_embedding(tweet, model, vector_size=200):
    words = tweet.split()  # Tokenize by whitespace
    word_vectors = [model[word] for word in words if word in model]
    if not word_vectors:  # If no words in the tweet are in the vocabulary, return a zero vector
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)

def extract_teams_from_filename(filename):
    # Remove numbers and file extension
    base_name = re.sub(r'\d+\.csv$', '', filename)
    
    # Identify teams from the predefined country list
    teams = [country for country in country_list if country in base_name]
    
    if len(teams) >= 2:
        return teams[0], teams[1]
    elif len(teams) == 1:
        return teams[0], "Unknown"
    else:
        return "Unknown", "Unknown"
    
def normalize_countries_with_teams(tweet, home_team, away_team):
    """
    Normalize country mentions in a tweet by replacing them with 'hometeam', 'awayteam', or 'othercountry'.
    """
    
    # Ensure tweet is split into words
    if isinstance(tweet, str):
        words = tweet.split()
    else:
        raise ValueError("Tweet is not a string.")
    
    # Create a copy of words for modification
    normalized_words = words[:]
    
    # Replace country mentions
    for i, word in enumerate(words):
        if word in country_variations.get(home_team.lower(), []):
            normalized_words[i] = 'hometeam'
        elif word in country_variations.get(away_team.lower(), []):
            normalized_words[i] = 'awayteam'
        else:
            for country, variations in country_variations.items():
                if word in variations and country.lower() not in [home_team.lower(), away_team.lower()]:
                    normalized_words[i] = 'othercountry'
                    break
    
    normalized_tweet = " ".join(normalized_words)
    return normalized_tweet


# Preprocessing function
def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenization
    words = text.split()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

# Function to map team names to country codes
def map_team_to_country_code(team_name, country_code_mapping):
    return country_code_mapping.get(team_name, "unknown")


def remove_player_names(row, football_player_data):
    """
    Replace player names in the tweet with 'footballplayername'.
    """    
    # Tokenize the tweet
    words = row['Tweet'].split()
    
    # Normalize player names
    normalized_words = [
        'footballplayername' if word in football_player_data else word for word in words
    ]
    
    return ' '.join(normalized_words)

# Function to extract mentions, excluding those from retweets
def extract_mentions(text):
    # Find all mentions in the text
    mentions = re.findall(r'@\w+', text)
    # Exclude mentions in the format "RT @NAME:"
    filtered_mentions = [
        mention for mention in mentions if not re.search(rf'^RT {mention}:', text)
    ]
    return filtered_mentions

# Function to extract mentions, excluding those from retweets
def extract_mentions(text):
    # Find all mentions in the text
    hashtags = re.findall(r'#\w+', text)

    return hashtags



In [5]:
#####PREPROCESSING TRAIN DATA AND SAVING IT#####

# # Directories
input_folder = "train_tweets"
output_folder = "train_tweets_original_preprocess"
os.makedirs(output_folder, exist_ok=True)  # Ensure the output folder exists

# Process each file in the input folder
csv_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".csv")]
for file_path in csv_files:
    # Load the CSV
    current_df = pd.read_csv(file_path)
    
    # Extract teams from filename
    filename = os.path.basename(file_path)
    home_team, away_team = extract_teams_from_filename(filename) 
    current_df['HomeTeam'] = home_team
    current_df['AwayTeam'] = away_team
    
# Extract mentions from each tweet using swifter
    current_df['Mentions'] = current_df['Tweet'].swifter.apply(extract_mentions)
    current_df['hashtags'] = current_df['Tweet'].swifter.apply(extract_mentions)
    current_df['is_RT'] = current_df['Tweet'].str.startswith('RT')
    
    # Preprocess the tweets
    current_df['Tweet'] = current_df.swifter.apply(lambda row: preprocess_text(row['Tweet']), axis=1)
    
    # Save the preprocessed data to the output folder
    output_path = os.path.join(output_folder, filename)
    current_df.to_csv(output_path, index=False)

Pandas Apply: 100%|██████████| 86843/86843 [00:00<00:00, 332067.29it/s]
Pandas Apply: 100%|██████████| 86843/86843 [00:00<00:00, 180427.85it/s]
Pandas Apply: 100%|██████████| 86843/86843 [00:29<00:00, 2918.44it/s]
Pandas Apply: 100%|██████████| 272389/272389 [00:01<00:00, 220595.56it/s]
Pandas Apply: 100%|██████████| 272389/272389 [00:00<00:00, 283355.14it/s]
Pandas Apply: 100%|██████████| 272389/272389 [01:19<00:00, 3438.58it/s]
Pandas Apply: 100%|██████████| 148298/148298 [00:00<00:00, 329383.54it/s]
Pandas Apply: 100%|██████████| 148298/148298 [00:00<00:00, 313498.74it/s]
Pandas Apply: 100%|██████████| 148298/148298 [00:48<00:00, 3048.99it/s]
Pandas Apply: 100%|██████████| 973985/973985 [00:03<00:00, 313143.99it/s]
Pandas Apply: 100%|██████████| 973985/973985 [00:02<00:00, 344643.02it/s]
Pandas Apply: 100%|██████████| 973985/973985 [04:40<00:00, 3471.39it/s]
Pandas Apply: 100%|██████████| 99192/99192 [00:00<00:00, 354969.59it/s]
Pandas Apply: 100%|██████████| 99192/99192 [00:00<00:0

In [12]:
# # Directories
input_folder = "train_tweets_original_preprocess"
output_folder = "train_tweets_original_preprocess_with_features"
os.makedirs(output_folder, exist_ok=True)  # Ensure the output folder exists

# Process each file in the input folder
csv_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".csv")]
for file_path in csv_files:
    # Load the CSV
    current_df = pd.read_csv(file_path)
    filename = os.path.basename(file_path)
    
    current_df['TweetLength'] = current_df['Tweet'].apply(len)
    current_df['TweetCount'] = current_df.groupby(['MatchID', 'PeriodID', 'Timestamp'])['Timestamp'].transform('count')
    current_df['WordCount'] = current_df['Tweet'].swifter.apply(lambda x: len(x.split()))
    
    output_path = os.path.join(output_folder, filename)
    current_df.to_csv(output_path, index=False)

Pandas Apply: 100%|██████████| 86843/86843 [00:00<00:00, 408622.79it/s]
Pandas Apply: 100%|██████████| 272389/272389 [00:00<00:00, 494769.77it/s]
Pandas Apply: 100%|██████████| 148298/148298 [00:00<00:00, 512110.11it/s]
Pandas Apply: 100%|██████████| 973985/973985 [00:01<00:00, 508756.90it/s]
Pandas Apply: 100%|██████████| 99192/99192 [00:00<00:00, 435926.17it/s]
Pandas Apply: 100%|██████████| 95108/95108 [00:00<00:00, 490404.11it/s]
Pandas Apply: 100%|██████████| 712525/712525 [00:01<00:00, 537395.46it/s]
Pandas Apply: 100%|██████████| 525725/525725 [00:00<00:00, 533097.07it/s]
Pandas Apply: 100%|██████████| 155549/155549 [00:00<00:00, 535345.60it/s]
Pandas Apply: 100%|██████████| 367899/367899 [00:00<00:00, 507257.30it/s]
Pandas Apply: 100%|██████████| 96834/96834 [00:00<00:00, 412502.12it/s]
Pandas Apply: 100%|██████████| 41539/41539 [00:00<00:00, 486359.75it/s]
Pandas Apply: 100%|██████████| 824241/824241 [00:01<00:00, 487160.55it/s]
Pandas Apply: 100%|██████████| 313803/313803 [00

In [22]:
##### PREPROCESSING TRAIN DATA AND SAVING IT #####

# Directories
input_folder = "train_tweets_original_preprocess_with_features"
output_folder = "train_tweets_no_country"
os.makedirs(output_folder, exist_ok=True)  # Ensure the output folder exists

# Process each file in the input folder
csv_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".csv")]
for file_path in csv_files:
    # Load the CSV
    current_df = pd.read_csv(file_path)
    
    # Preprocess the 'Tweet' column
    current_df['Tweet'] = current_df.swifter.apply(
        lambda row: normalize_countries_with_teams(row['Tweet'], row['HomeTeam'], row['AwayTeam']), axis=1
    )

    # Save the processed DataFrame to the output folder
    output_path = os.path.join(output_folder, os.path.basename(file_path))
    current_df.to_csv(output_path, index=False)

Pandas Apply: 100%|██████████| 86843/86843 [00:08<00:00, 10564.03it/s]
Pandas Apply: 100%|██████████| 272389/272389 [00:20<00:00, 13486.44it/s]
Pandas Apply: 100%|██████████| 148298/148298 [00:12<00:00, 12002.11it/s]
Pandas Apply: 100%|██████████| 973985/973985 [01:01<00:00, 15733.80it/s]
Pandas Apply: 100%|██████████| 99192/99192 [00:07<00:00, 12642.01it/s]
Pandas Apply: 100%|██████████| 95108/95108 [00:07<00:00, 12208.99it/s]
Pandas Apply: 100%|██████████| 712525/712525 [00:45<00:00, 15629.46it/s]
Pandas Apply: 100%|██████████| 525725/525725 [00:35<00:00, 14676.78it/s]
Pandas Apply: 100%|██████████| 155549/155549 [00:12<00:00, 12200.16it/s]
Pandas Apply: 100%|██████████| 367899/367899 [00:24<00:00, 14912.78it/s]
Pandas Apply: 100%|██████████| 96834/96834 [00:07<00:00, 12707.94it/s]
Pandas Apply: 100%|██████████| 41539/41539 [00:03<00:00, 12811.16it/s]
Pandas Apply: 100%|██████████| 824241/824241 [00:51<00:00, 16037.26it/s]
Pandas Apply: 100%|██████████| 313803/313803 [00:20<00:00, 15

In [23]:
##### PREPROCESSING TRAIN DATA AND SAVING IT #####
# Load mentions with preprocessed mentions
mentions_with_is_name = pd.read_pickle("mentions_processed.pkl")
combined_player_names = set(mentions_with_is_name["PreprocessedMention"].dropna().unique())  # Use set for faster lookup

# Directories
input_folder = "train_tweets_no_country"
output_folder = "train_tweets_preprocessed_no_country_no_player"
os.makedirs(output_folder, exist_ok=True)  # Ensure the output folder exists


# Process each file in the input folder
csv_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".csv")]
for file_path in csv_files:
    print(f"Processing: {file_path}")
    current_df = pd.read_csv(file_path)

    # Apply the remove_player_names function
    current_df['Tweet'] = current_df.swifter.apply(
        lambda row: remove_player_names(row, combined_player_names), axis=1
    )
    
    # Save the preprocessed data to the output folder
    output_path = os.path.join(output_folder, os.path.basename(file_path))
    current_df.to_csv(output_path, index=False)
    print(f"Saved: {output_path}")

Processing: train_tweets_no_country/AustraliaSpain34.csv


Pandas Apply: 100%|██████████| 86843/86843 [00:01<00:00, 79643.97it/s]


Saved: train_tweets_preprocessed_no_country_no_player/AustraliaSpain34.csv
Processing: train_tweets_no_country/PortugalGhana58.csv


Pandas Apply: 100%|██████████| 272389/272389 [00:02<00:00, 92508.33it/s] 


Saved: train_tweets_preprocessed_no_country_no_player/PortugalGhana58.csv
Processing: train_tweets_no_country/CameroonBrazil36.csv


Pandas Apply: 100%|██████████| 148298/148298 [00:01<00:00, 86357.15it/s]


Saved: train_tweets_preprocessed_no_country_no_player/CameroonBrazil36.csv
Processing: train_tweets_no_country/GermanyBrazil74.csv


Pandas Apply: 100%|██████████| 973985/973985 [00:10<00:00, 88751.24it/s] 


Saved: train_tweets_preprocessed_no_country_no_player/GermanyBrazil74.csv
Processing: train_tweets_no_country/BelgiumSouthKorea59.csv


Pandas Apply: 100%|██████████| 99192/99192 [00:01<00:00, 83555.79it/s]


Saved: train_tweets_preprocessed_no_country_no_player/BelgiumSouthKorea59.csv
Processing: train_tweets_no_country/NetherlandsChile35.csv


Pandas Apply: 100%|██████████| 95108/95108 [00:01<00:00, 89069.89it/s]


Saved: train_tweets_preprocessed_no_country_no_player/NetherlandsChile35.csv
Processing: train_tweets_no_country/GermanyAlgeria67.csv


Pandas Apply: 100%|██████████| 712525/712525 [00:08<00:00, 88024.84it/s] 


Saved: train_tweets_preprocessed_no_country_no_player/GermanyAlgeria67.csv
Processing: train_tweets_no_country/FranceGermany70.csv


Pandas Apply: 100%|██████████| 525725/525725 [00:06<00:00, 85860.28it/s]


Saved: train_tweets_preprocessed_no_country_no_player/FranceGermany70.csv
Processing: train_tweets_no_country/MexicoCroatia37.csv


Pandas Apply: 100%|██████████| 155549/155549 [00:01<00:00, 90315.22it/s] 


Saved: train_tweets_preprocessed_no_country_no_player/MexicoCroatia37.csv
Processing: train_tweets_no_country/FranceNigeria66.csv


Pandas Apply: 100%|██████████| 367899/367899 [00:04<00:00, 86827.49it/s] 


Saved: train_tweets_preprocessed_no_country_no_player/FranceNigeria66.csv
Processing: train_tweets_no_country/AustraliaNetherlands29.csv


Pandas Apply: 100%|██████████| 96834/96834 [00:01<00:00, 90937.09it/s] 


Saved: train_tweets_preprocessed_no_country_no_player/AustraliaNetherlands29.csv
Processing: train_tweets_no_country/HondurasSwitzerland54.csv


Pandas Apply: 100%|██████████| 41539/41539 [00:00<00:00, 86359.98it/s]


Saved: train_tweets_preprocessed_no_country_no_player/HondurasSwitzerland54.csv
Processing: train_tweets_no_country/ArgentinaGermanyFinal77.csv


Pandas Apply: 100%|██████████| 824241/824241 [00:09<00:00, 90504.73it/s] 


Saved: train_tweets_preprocessed_no_country_no_player/ArgentinaGermanyFinal77.csv
Processing: train_tweets_no_country/ArgentinaBelgium72.csv


Pandas Apply: 100%|██████████| 313803/313803 [00:03<00:00, 92224.15it/s] 


Saved: train_tweets_preprocessed_no_country_no_player/ArgentinaBelgium72.csv
Processing: train_tweets_no_country/USASlovenia2010.csv


Pandas Apply: 100%|██████████| 85675/85675 [00:01<00:00, 85174.09it/s]


Saved: train_tweets_preprocessed_no_country_no_player/USASlovenia2010.csv
Processing: train_tweets_no_country/GermanyUSA57.csv


Pandas Apply: 100%|██████████| 162243/162243 [00:01<00:00, 93378.72it/s] 


Saved: train_tweets_preprocessed_no_country_no_player/GermanyUSA57.csv


In [24]:
# Directory containing the preprocessed CSV files
input_folder = "train_tweets_original_preprocess_with_features"

# List all CSV files in the input folder
csv_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".csv")]

# Load each CSV file and concatenate them into one DataFrame
dataframes = [pd.read_csv(file) for file in csv_files]
combined_trained_df = pd.concat(dataframes, ignore_index=True)

combined_trained_df.to_pickle('preprocessed_train_original.pkl')

In [26]:
# Directory containing the preprocessed CSV files
input_folder = "train_tweets_no_country"

# List all CSV files in the input folder
csv_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".csv")]

# Load each CSV file and concatenate them into one DataFrame
dataframes = [pd.read_csv(file) for file in csv_files]
combined_trained_df = pd.concat(dataframes, ignore_index=True)

combined_trained_df.to_pickle('preprocessed_train_no_country.pkl')

In [27]:
# Directory containing the preprocessed CSV files
input_folder = "train_tweets_preprocessed_no_country_no_player"

# List all CSV files in the input folder
csv_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".csv")]

# Load each CSV file and concatenate them into one DataFrame
dataframes = [pd.read_csv(file) for file in csv_files]
combined_trained_df = pd.concat(dataframes, ignore_index=True)

combined_trained_df.to_pickle('preprocessed_train_nc_np.pkl')

*PREPROCESSING ALL DATA AND SAVING IT*

Use this to make weights and tfidf, Could be more efficient. But eitherway nice to have preprocessed df saved.


In [28]:
#####PREPROCESSING TRAIN DATA AND SAVING IT#####

# # Directories
input_folder = "eval_tweets"
output_folder = "eval_tweets_original_preprocess"
os.makedirs(output_folder, exist_ok=True)  # Ensure the output folder exists

# Process each file in the input folder
csv_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".csv")]
for file_path in csv_files:
    # Load the CSV
    current_df = pd.read_csv(file_path)
    
    # Extract teams from filename
    filename = os.path.basename(file_path)
    home_team, away_team = extract_teams_from_filename(filename) 
    current_df['HomeTeam'] = home_team
    current_df['AwayTeam'] = away_team
    
# Extract mentions from each tweet using swifter
    current_df['Mentions'] = current_df['Tweet'].swifter.apply(extract_mentions)
    current_df['hashtags'] = current_df['Tweet'].swifter.apply(extract_mentions)
    current_df['is_RT'] = current_df['Tweet'].str.startswith('RT')
    
    # Preprocess the tweets
    current_df['Tweet'] = current_df.swifter.apply(lambda row: preprocess_text(row['Tweet']), axis=1)
    
    # Save the preprocessed data to the output folder
    output_path = os.path.join(output_folder, filename)
    current_df.to_csv(output_path, index=False)

Pandas Apply: 100%|██████████| 285804/285804 [00:01<00:00, 182248.56it/s]
Pandas Apply: 100%|██████████| 285804/285804 [00:04<00:00, 58128.90it/s] 
Pandas Apply: 100%|██████████| 285804/285804 [01:27<00:00, 3264.10it/s]
Pandas Apply: 100%|██████████| 45024/45024 [00:00<00:00, 292213.81it/s]
Pandas Apply: 100%|██████████| 45024/45024 [00:00<00:00, 349161.49it/s]
Pandas Apply: 100%|██████████| 45024/45024 [00:13<00:00, 3291.55it/s]
Pandas Apply: 100%|██████████| 628698/628698 [00:02<00:00, 258382.12it/s]
Pandas Apply: 100%|██████████| 628698/628698 [00:01<00:00, 354098.91it/s]
Pandas Apply: 100%|██████████| 628698/628698 [03:05<00:00, 3390.83it/s]
Pandas Apply: 100%|██████████| 113402/113402 [00:01<00:00, 64915.40it/s]
Pandas Apply: 100%|██████████| 113402/113402 [00:00<00:00, 431383.00it/s]
Pandas Apply: 100%|██████████| 113402/113402 [00:32<00:00, 3486.78it/s]


In [29]:
# # Directories
input_folder = "eval_tweets_original_preprocess"
output_folder = "eval_tweets_original_preprocess_with_features"
os.makedirs(output_folder, exist_ok=True)  # Ensure the output folder exists

# Process each file in the input folder
csv_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".csv")]
for file_path in csv_files:
    # Load the CSV
    current_df = pd.read_csv(file_path)
    filename = os.path.basename(file_path)
    
    current_df['TweetLength'] = current_df['Tweet'].apply(len)
    current_df['TweetCount'] = current_df.groupby(['MatchID', 'PeriodID', 'Timestamp'])['Timestamp'].transform('count')
    current_df['WordCount'] = current_df['Tweet'].swifter.apply(lambda x: len(x.split()))
    
    output_path = os.path.join(output_folder, filename)
    current_df.to_csv(output_path, index=False)

Pandas Apply: 100%|██████████| 285804/285804 [00:00<00:00, 496188.55it/s]
Pandas Apply: 100%|██████████| 45024/45024 [00:00<00:00, 419867.76it/s]
Pandas Apply: 100%|██████████| 628698/628698 [00:01<00:00, 550472.98it/s]
Pandas Apply: 100%|██████████| 113402/113402 [00:00<00:00, 546365.67it/s]


In [30]:
##### PREPROCESSING TRAIN DATA AND SAVING IT #####

# Directories
input_folder = "eval_tweets_original_preprocess_with_features"
output_folder = "eval_tweets_no_country"
os.makedirs(output_folder, exist_ok=True)  # Ensure the output folder exists

# Process each file in the input folder
csv_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".csv")]
for file_path in csv_files:
    # Load the CSV
    current_df = pd.read_csv(file_path)
    
    # Preprocess the 'Tweet' column
    current_df['Tweet'] = current_df.swifter.apply(
        lambda row: normalize_countries_with_teams(row['Tweet'], row['HomeTeam'], row['AwayTeam']), axis=1
    )

    # Save the processed DataFrame to the output folder
    output_path = os.path.join(output_folder, os.path.basename(file_path))
    current_df.to_csv(output_path, index=False)
    

Pandas Apply: 100%|██████████| 285804/285804 [00:20<00:00, 13747.56it/s]
Pandas Apply: 100%|██████████| 45024/45024 [00:03<00:00, 13668.73it/s]
Pandas Apply: 100%|██████████| 628698/628698 [00:38<00:00, 16250.83it/s]
Pandas Apply: 100%|██████████| 113402/113402 [00:08<00:00, 13281.84it/s]


In [31]:
##### PREPROCESSING TRAIN DATA AND SAVING IT #####
# Load mentions with preprocessed mentions
mentions_with_is_name = pd.read_pickle("mentions_processed.pkl")
combined_player_names = set(mentions_with_is_name["PreprocessedMention"].dropna().unique())  # Use set for faster lookup

# Directories
input_folder = "eval_tweets_no_country"
output_folder = "eval_tweets_preprocessed_no_country_no_player"
os.makedirs(output_folder, exist_ok=True)  # Ensure the output folder exists


# Process each file in the input folder
csv_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".csv")]
for file_path in csv_files:
    print(f"Processing: {file_path}")
    current_df = pd.read_csv(file_path)

    # Apply the remove_player_names function
    current_df['Tweet'] = current_df.swifter.apply(
        lambda row: remove_player_names(row, combined_player_names), axis=1
    )
    
    # Save the preprocessed data to the output folder
    output_path = os.path.join(output_folder, os.path.basename(file_path))
    current_df.to_csv(output_path, index=False)
    print(f"Saved: {output_path}")

Processing: eval_tweets_no_country/GermanyGhana32.csv


Pandas Apply: 100%|██████████| 285804/285804 [00:04<00:00, 70072.15it/s]


Saved: eval_tweets_preprocessed_no_country_no_player/GermanyGhana32.csv
Processing: eval_tweets_no_country/GermanySerbia2010.csv


Pandas Apply: 100%|██████████| 45024/45024 [00:00<00:00, 81115.71it/s]


Saved: eval_tweets_preprocessed_no_country_no_player/GermanySerbia2010.csv
Processing: eval_tweets_no_country/NetherlandsMexico64.csv


Pandas Apply: 100%|██████████| 628698/628698 [00:07<00:00, 80921.16it/s]


Saved: eval_tweets_preprocessed_no_country_no_player/NetherlandsMexico64.csv
Processing: eval_tweets_no_country/GreeceIvoryCoast44.csv


Pandas Apply: 100%|██████████| 113402/113402 [00:01<00:00, 82763.22it/s]


Saved: eval_tweets_preprocessed_no_country_no_player/GreeceIvoryCoast44.csv


In [32]:
# Directory containing the preprocessed CSV files
input_folder = "eval_tweets_original_preprocess_with_features"

# List all CSV files in the input folder
csv_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".csv")]

# Load each CSV file and concatenate them into one DataFrame
dataframes = [pd.read_csv(file) for file in csv_files]
combined_trained_df = pd.concat(dataframes, ignore_index=True)

combined_trained_df.to_pickle('preprocessed_eval_original.pkl')

In [34]:
# Directory containing the preprocessed CSV files
input_folder = "eval_tweets_no_country"

# List all CSV files in the input folder
csv_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".csv")]

# Load each CSV file and concatenate them into one DataFrame
dataframes = [pd.read_csv(file) for file in csv_files]
combined_trained_df = pd.concat(dataframes, ignore_index=True)

combined_trained_df.to_pickle('preprocessed_eval_no_country.pkl')

In [35]:
# Directory containing the preprocessed CSV files
input_folder = "eval_tweets_preprocessed_no_country_no_player"

# List all CSV files in the input folder
csv_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".csv")]

# Load each CSV file and concatenate them into one DataFrame
dataframes = [pd.read_csv(file) for file in csv_files]
combined_trained_df = pd.concat(dataframes, ignore_index=True)

combined_trained_df.to_pickle('preprocessed_eval_nc_np.pkl')

In [36]:
# Load the two pickle files into separate DataFrames
train_df = pd.read_pickle('preprocessed_train_original.pkl')
eval_df = pd.read_pickle('preprocessed_eval_original.pkl')

# Concatenate the DataFrames
full_df = pd.concat([train_df, eval_df], ignore_index=True)

# Save the combined DataFrame as a new pickle file
full_df.to_pickle('preprocessed_full_original.pkl')

In [37]:
# Load the two pickle files into separate DataFrames
train_df = pd.read_pickle('preprocessed_train_no_country.pkl')
eval_df = pd.read_pickle('preprocessed_eval_no_country.pkl')

# Concatenate the DataFrames
full_df = pd.concat([train_df, eval_df], ignore_index=True)

# Save the combined DataFrame as a new pickle file
full_df.to_pickle('preprocessed_full_no_country.pkl')

In [39]:
# Load the two pickle files into separate DataFrames
train_df = pd.read_pickle('preprocessed_train_nc_np.pkl')
eval_df = pd.read_pickle('preprocessed_eval_nc_np.pkl')

# Concatenate the DataFrames
full_df = pd.concat([train_df, eval_df], ignore_index=True)

# Save the combined DataFrame as a new pickle file
full_df.to_pickle('preprocessed_full_nc_np.pkl')

# Generating Weighted Average Embeddings with TF-IDF
In this section, we calculate weighted average embeddings for tweets using pre-trained word embeddings and TF-IDF weights. This approach ensures that each word in a tweet contributes to the overall representation based on its importance (TF-IDF score). The resulting embedding is a single vector that captures the semantic meaning of the entire tweet, which can be used as features for downstream tasks such as classification, clustering, or similarity analysis.

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

full_df = pd.read_pickle('preprocessed_full_nc_np.pkl')

# Optional: start from zero and fit on tweets
# Initialize TF-IDF Vectorizer
vectorizer_all_tweets = TfidfVectorizer(
    max_features=15000,           # Limit vocabulary size to 10,000 terms
    min_df=3,                     # Ignore terms in fewer than 3 documents
    #max_df=0.90,                  # Ignore overly frequent terms
    sublinear_tf=True,            # Apply logarithmic scaling to term frequencies     
    norm='l2',                    # L2 normalization
) 

# Fit the TF-IDF vectorizer on minute-level documents
vectorizer_all_tweets.fit(full_df['Tweet'])

# Save the vectorizer
with open("tfidf_vectorizer_all_tweets_nc_np.pkl", "wb") as f:
    pickle.dump(vectorizer_all_tweets, f)

# Load pre-computed
with open('tfidf_vectorizer_all_tweets_nc_np.pkl', 'rb') as f:
   vectorizer_all_tweets = pickle.load(f)

# Extract TF-IDF weights
tfidf_weights_all_tweets = dict(zip(vectorizer_all_tweets.get_feature_names_out(), vectorizer_all_tweets.idf_))


# Weighted average embeddings
def get_weighted_avg_embedding(tweet, model, vector_size=200, weights=tfidf_weights_all_tweets):
    words = tweet.split()
    word_vectors = [model[word] * weights.get(word, 1) for word in words if word in model]
    if not word_vectors:
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)


In [None]:
# Generate embeddings for each tweet
loaded_df = pd.read_pickle('preprocessed_train_nc_np.pkl')

# vector_size = 200  # GloVe embedding dimension
tweet_vectors = loaded_df['Tweet'].swifter.apply(lambda tweet: get_weighted_avg_embedding(tweet, model=glove_model, vector_size=200, weights=tfidf_weights_all_tweets))
tweet_vectors = np.array(list(tweet_vectors), dtype=np.float32)

# Save the tweet vectors
with open("tweet_vectors_all_data.pkl", "wb") as f:
    pickle.dump(tweet_vectors, f)

print("Embeddings saved successfully!")

# Load the tweet vectors
with open("tweet_vectors_all_data.pkl", "rb") as f:
    loaded_tweet_vectors = pickle.load(f)

print("Embeddings loaded successfully!")
print("Loaded vectors shape:", loaded_tweet_vectors.shape)



Pandas Apply:  73%|███████▎  | 3624443/4961848 [05:12<01:34, 14115.26it/s]

In [None]:
###### Use if no period features ######
tweet_df = pd.DataFrame(loaded_tweet_vectors)

# Attach the vectors into the original dataframe
period_features = pd.concat([loaded_df, tweet_df], axis=1)

# Drop the columns that are not useful anymore
period_features = period_features.drop(columns=['Timestamp', 'Tweet', 'HomeTeam', 'AwayTeam', 'HomeTeamCode', 'AwayTeamCode'])

print("X_train_reshaped shape:", period_features.shape)
# Group the tweets into their corresponding periods. This way we generate an average embedding vector for each period
period_features = period_features.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()

# Save the tweet vectors
with open("period_features.pkl", "wb") as f:
    pickle.dump(period_features, f)

print("Period features saved successfully!")

# Load the tweet vectors
with open("period_features.pkl", "rb") as f:
    loaded_period_features = pickle.load(f)

print("Period features loaded successfully!")
print("Loaded vectors shape:", loaded_period_features.shape)

In [None]:
# We drop the non-numerical features and keep the embeddings values for each period
X = loaded_period_features.drop(columns=['EventType', 'MatchID', 'ID']).values
# We extract the labels of our training samples
y = loaded_period_features['EventType'].values


# One-hot encode labels
encoder = OneHotEncoder(sparse_output=False)
y_encoded = encoder.fit_transform(y.reshape(-1, 1))

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=SEED)

# Add a time step dimension to match the LSTM input shape
X_train_reshaped = X_train[:, None, :]  # Add a new axis for timesteps
X_test_reshaped = X_test[:, None, :]    # Add a new axis for timesteps

In [None]:


# Define the early stopping callback
early_stopping = EarlyStopping(
    monitor='val_loss',      # Monitor validation loss
    patience=5,              # Stop training if no improvement after 3 epochs
    restore_best_weights=True  # Restore the best weights when stopping
)

# Define the LSTM model with deterministic initializers
model = Sequential([
    tf.keras.layers.Input(shape=(1, X_train_reshaped.shape[2])),  
    LSTM(
        128, 
        return_sequences=False, 
        kernel_initializer=GlorotUniform(seed=SEED), 
        recurrent_initializer=Orthogonal(seed=SEED),
        bias_initializer='zeros'
    ),             
    Dense(y_encoded.shape[1], activation='softmax', kernel_initializer=GlorotUniform(seed=SEED))
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train_reshaped, y_train,
                    epochs=50,
                    batch_size=32,
                    validation_split=0.2,
                    callbacks=[early_stopping],  # Include the early stopping callback
                    verbose=1)

# Evaluate on the test set
test_loss, test_accuracy = model.evaluate(X_test_reshaped, y_test, verbose=1)

print(f"Test Accuracy: {test_accuracy:.4f}")

In [None]:
eval_data = pd.read_pickle('preprocessed_eval.pkl')
print(eval_data)

In [None]:
###### For Kaggle submission

predictions = []
dummy_predictions = []
# We read each file separately, we preprocess the tweets and then use the classifier to predict the labels.
# Finally, we concatenate all predictions into a list that will eventually be concatenated and exported
# to be submitted on Kaggle.

val_df = pd.read_pickle('preprocessed_eval_nc_np.pkl')

tweet_vectors = val_df['Tweet'].swifter.apply(lambda tweet: get_weighted_avg_embedding(tweet, model=glove_model, vector_size=200, weights= tfidf_weights_all_tweets))

tweet_vectors = np.array(list(tweet_vectors), dtype=np.float32)

tweet_df = pd.DataFrame(tweet_vectors)

period_features_val = pd.concat([val_df, tweet_df], axis=1)
period_features_val = period_features_val.drop(columns=['Timestamp', 'Tweet', 'HomeTeam', 'AwayTeam', 'HomeTeamCode', 'AwayTeamCode'])
period_features_val = period_features_val.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()

X = period_features_val.drop(columns=['MatchID', 'ID']).values

# Reshape input for LSTM
X_reshaped = X[:, None, :]  # Add timestep dimension

preds = model.predict(X_reshaped)
preds = preds.argmax(axis=1)  # Convert probabilities to class indices
period_features_val['EventType'] = preds
predictions.append(period_features_val[['ID', 'EventType']])


pred_df = pd.concat(predictions)
pred_df.to_csv('LSTM_predictions.csv', index=False)



In [None]:
import pickle
# Save TF-IDF trained on the training data
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [None]:
chi2_results = remove_rows_with_names(chi2_results, name_list)



In [None]:
print(chi2_results)

In [None]:
top_words = chi2_results[:62]

In [None]:
# Create a dictionary for the chi² scores
chi2_dict = dict(zip(top_words['Word'], top_words['Chi2_Score']))

# Compute weighted score features for each important word
for word in chi2_dict.keys():
    loaded_df[f'WeightedScore_{word}'] = loaded_df['Tweet'].apply(
        lambda tweet: tweet.split().count(word) * chi2_dict.get(word, 0)
    )

print("loaded_df shape:", loaded_df.shape)


In [None]:
loaded_df.to_pickle('preprocessed_train_extra_weights.pkl')

In [None]:
correlation = loaded_df[['TweetLength', 'WordCount']].corr()
print("Correlation between TweetLength and WordCount:")
print(correlation)


In [None]:
# Average word length in a tweet


In [None]:
# Load the mentions DataFrame
mentions = pd.read_pickle("mentions_with_teams.pkl")

# Filter mentions based on their occurrences in a specific HomeTeam and AwayTeam combination
mentions['mention_count'] = mentions.groupby(['HomeTeam', 'AwayTeam', 'Mention'])['Mention'].transform('count')
mentions = mentions[mentions['mention_count'] > 10]

# Drop duplicates
mentions = mentions.drop_duplicates()

# Load the last names DataFrame
last_names_df = pd.read_csv("last_names.csv")

# Ensure the last names DataFrame contains 'last_name' and 'team' columns
if {'last_name', 'nationality'}.issubset(last_names_df.columns):
    # Add the 'is_name' column
    def is_name(mention, home_team, away_team):
        # Filter last names for the HomeTeam and AwayTeam
        team_last_names = last_names_df[last_names_df['nationality'].isin([home_team, away_team])]['last_name'].str.lower().tolist()
        # Check if any last name from these teams matches the mention
        return 1 if any(last_name in mention.lower() for last_name in team_last_names) else 0

    # Apply the function row-wise using swifter
    mentions['is_name'] = mentions.swifter.apply(
        lambda row: is_name(row['Mention'], row['HomeTeam'], row['AwayTeam']), axis=1
    )

    # Drop the temporary mention_count column
    mentions.drop(columns=['mention_count'], inplace=True)

    # Print the updated DataFrame
    print(mentions)

    # Optionally, save the updated DataFrame to a new pickle file
    mentions.to_pickle("mentions_with_is_name.pkl")

else:
    print("The last_names.csv file must contain 'last_name' and 'team' columns.")
    
    

# Load the DataFrame
mentions_with_is_name = pd.read_pickle("mentions_with_is_name.pkl")

# Convert mentions to lowercase
mentions_with_is_name['Mention'] = mentions_with_is_name['Mention']

mentions_with_is_name = mentions_with_is_name[mentions_with_is_name['is_name'] != 0]


mentions_with_is_name = mentions_with_is_name[~mentions_with_is_name['Mention'].str.contains('Brazil', na=False)]
mentions_with_is_name = mentions_with_is_name[~mentions_with_is_name['Mention'].str.contains('soccer', na=False)]
mentions_with_is_name = mentions_with_is_name[~mentions_with_is_name['Mention'].str.contains('ball', na=False)]
mentions_with_is_name = mentions_with_is_name[~mentions_with_is_name['Mention'].str.contains('Twitter', na=False)]



# Print the updated DataFrame
print(mentions_with_is_name)

# Optionally, save the updated DataFrame to a new pickle file
mentions_with_is_name.to_pickle("mentions_without_football.pkl")

mentions_with_is_name = pd.read_pickle("mentions_without_football.pkl")
mentions_with_is_name['PreprocessedMention'] = mentions_with_is_name.swifter.apply(
    lambda row: preprocess_text(row['Mention'], row['HomeTeam'], row['AwayTeam']), axis=1
)
mentions_with_is_name.to_pickle("mentions_processed.pkl")
print(mentions_with_is_name)

In [None]:
# # Directories

##### PREPROCESSING TRAIN DATA AND SAVING IT #####
import os
import pandas as pd
import swifter

# Load mentions with preprocessed mentions
mentions_with_is_name = pd.read_pickle("mentions_processed.pkl")
combined_player_names = set(mentions_with_is_name["PreprocessedMention"].dropna().unique())  # Use set for faster lookup

# Directories
input_folder = "eval_tweets_preprocessed"
output_folder = "eval_tweets_preprocessed_no_player"
os.makedirs(output_folder, exist_ok=True)  # Ensure the output folder exists

# Define the remove_player_names function

# Process each file in the input folder
csv_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".csv")]
for file_path in csv_files:
    print(f"Processing: {file_path}")
    current_df = pd.read_csv(file_path)
    
    # Apply the remove_player_names function
    current_df['Tweet'] = current_df.swifter.apply(
        lambda row: remove_player_names(row, combined_player_names), axis=1
    )
    
    # Save the preprocessed data to the output folder
    output_path = os.path.join(output_folder, os.path.basename(file_path))
    current_df.to_csv(output_path, index=False)
    print(f"Saved: {output_path}")
