In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import OneHotEncoder
import gensim.downloader as api
import re
import swifter
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import GlorotUniform, Orthogonal
import random
# Enable tqdm for pandas
tqdm.pandas()

# Ensure Reproducibility
import random
# Set seeds for reproducibility
SEED = 42

# Python's built-in random
random.seed(SEED)

# NumPy
np.random.seed(SEED)

# TensorFlow
tf.random.set_seed(SEED)

# Set Python hash seed
os.environ['PYTHONHASHSEED'] = str(SEED)

# Configure TensorFlow for deterministic operations
tf.keras.utils.set_random_seed(SEED)  # Sets all random seeds for the program (Python, NumPy, and TensorFlow)
tf.config.experimental.enable_op_determinism()  # Enable deterministic operations in TensorFlow

# If using GPU, you might also want to set these:
if tf.config.list_physical_devices('GPU'):
    # Force TensorFlow to use deterministic GPU operations
    tf.config.experimental.enable_op_determinism()
    # Limit GPU memory growth
    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # Limit to one GPU if using multiple GPUs
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'

2024-12-11 13:28:40.593488: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load GloVe model
glove_model = api.load("glove-twitter-200")  # 200-dimensional GloVe embeddings

In [None]:
# Country list for team extraction
country_list = [
    "Argentina", "Belgium", "Germany", "Serbia", "Greece", "IvoryCoast", 
    "Netherlands", "Mexico", "Australia", "Spain", "SouthKorea", 
    "Cameroon", "Brazil", "France", "Nigeria", "Algeria", "USA", 
    "Honduras", "Switzerland", "Croatia", "Chile", "Portugal", 
    "Ghana", "Slovenia"
]

country_variations = {
    'argentina': ['argentina', 'arg', 'argentine', 'argies', 'albiceleste', 'argentinian', 'argentinos', 'argentinas'],
    'australia': ['australia', 'aus', 'aussie', 'aussies', 'socceroos', 'oz', 'straya', 'australian', 'au'],
    'belgium': ['belgium', 'bel', 'belgique', 'belgie', 'belgian', 'belgians', 'red devils', 'diables rouges'],
    'brazil': ['brazil', 'bra', 'brasil', 'bresil', 'brazilian', 'brazilians', 'selecao', 'canarinho', 'verde amarela', 'samba boys'],
    'cameroon': ['cameroon', 'cmr', 'cameroun', 'camerounais', 'indomitable lions', 'lions'],
    'france': ['france', 'fra', 'french', 'les bleus', 'tricolore', 'tricolores', 'equipe de france', 'allez les bleus'],
    'honduras': ['honduras', 'hon', 'honduran', 'hondurans', 'los catrachos', 'catrachos', 'la h'],
    'portugal': ['portugal', 'por', 'portuguese', 'selecao das quinas', 'seleccao', 'navegadores', 'team portugal'],
    'spain': ['spain', 'esp', 'espana', 'espania', 'spanish', 'la roja', 'furia roja', 'la furia', 'la seleccion'],
    'southkorea': ['south korea', 'korea', 'kor', 'skorea', 'korean', 'koreans', 'taeguk warriors', 'warriors'],
    'switzerland': ['switzerland', 'sui', 'suisse', 'schweiz', 'swiss', 'nati', 'rossocrociati', 'a team'],
    'usa': ['usa', 'united states', 'america', 'united states of america', 'us', 'usa', 'usmnt', 'americans', 'american', 'yanks', 'uncle sam', 'stars and stripes', 'team usa'],
    'ghana': ['ghana', 'gha', 'ghanaian', 'ghanaians', 'black stars', 'stars'],
    'netherlands': ['netherlands', 'ned', 'holland', 'dutch', 'oranje', 'flying dutchmen', 'orange', 'clockwork orange', 'nederlands'],
    'germany': ['germany', 'ger', 'alemania', 'deutschland', 'german', 'germans', 'die mannschaft', 'nationalelf', 'deu'],
    'iran': ['iran', 'irn', 'iranian', 'iranians', 'team melli', 'persian stars'],
    'nigeria': ['nigeria', 'nga', 'naija', 'super eagles', 'eagles', 'nigerian', 'nigerians', 'green eagles'],
    'algeria': ['algeria', 'alg', 'algerian', 'algerians', 'fennecs', 'desert foxes', 'les verts'],
    'croatia': ['croatia', 'cro', 'hrvatska', 'hrv', 'croatian', 'croatians', 'vatreni', 'blazers', 'kockasti'],
    'chile': ['chile', 'chi', 'chilean', 'chileans', 'la roja', 'team chile'],
    'slovenia': ['slovenia', 'svn', 'slovenian', 'slovenians', 'slovenski', 'boys'],
    'serbia': ['serbia', 'srb', 'serbian', 'serbians', 'beli orlovi', 'white eagles', 'orlovi'],
    'greece': ['greece', 'gre', 'greek', 'greeks', 'piratiko', 'ethniki', 'galanolefki'],
    'ivorycoast': ['ivory coast', 'civ', 'cote divoire', 'cotedivoire', 'ivorians', 'les elephants', 'elephants', 'ivory'],
    'mexico': ['mexico', 'mex', 'mexiko', 'mexican', 'mexicans', 'el tri', 'tricolor', 'aztecas', 'el tricolor', 'verde']
}

# Define the country code mapping
country_code_mapping = {
    "Argentina": "ar", "Belgium": "be", "Germany": "de", "Serbia": "rs", "Greece": "gr",
    "IvoryCoast": "ci", "Netherlands": "nl", "Mexico": "mx", "Australia": "au", "Spain": "es",
    "SouthKorea": "kr", "Cameroon": "cm", "Brazil": "br", "France": "fr", "Nigeria": "ng",
    "Algeria": "dz", "USA": "us", "Honduras": "hn", "Switzerland": "ch", "Croatia": "hr",
    "Chile": "cl", "Portugal": "pt", "Ghana": "gh", "Slovenia": "si"
}


In [23]:
# Function to compute the average word vector for a tweet
def get_avg_embedding(tweet, model, vector_size=200):
    words = tweet.split()  # Tokenize by whitespace
    word_vectors = [model[word] for word in words if word in model]
    if not word_vectors:  # If no words in the tweet are in the vocabulary, return a zero vector
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)

def extract_teams_from_filename(filename):
    # Remove numbers and file extension
    base_name = re.sub(r'\d+\.csv$', '', filename)
    
    # Identify teams from the predefined country list
    teams = [country for country in country_list if country in base_name]
    
    if len(teams) >= 2:
        return teams[0], teams[1]
    elif len(teams) == 1:
        return teams[0], "Unknown"
    else:
        return "Unknown", "Unknown"
    
def normalize_countries_with_teams(words, home_team, away_team):
    # Text is already preprocessed and split into words
    normalized_words = words.copy()
    
    # Replace individual country mentions
    for i, word in enumerate(words):
        # Check if it's home team
        if word in country_variations.get(home_team.lower(), []):
            normalized_words[i] = 'hometeam'
        # Check if it's away team
        elif word in country_variations.get(away_team.lower(), []):
            normalized_words[i] = 'awayteam'
        else:
            # Check if it's any other country
            for country, variations in country_variations.items():
                if word in variations and country.lower() not in [home_team.lower(), away_team.lower()]:
                    normalized_words[i] = 'othercountry'
                    break
    
    return normalized_words

def normalize_football_player_names_with_country(words, country, football_player_data):
    """
    Normalize football player names in a list of words to 'footballplayername', filtering by country.

    Args:
        words (list of str): List of words (preprocessed text).
        country (str): Country code of the players to focus on.
        football_player_data (dict): Dictionary mapping country codes to lists of last names.

    Returns:
        list of str: Words with football player names normalized to 'footballplayername'.
    """
    normalized_words = words.copy()
    
    # Get the list of player names for the given country
    player_last_names = football_player_data.get(country, [])
    
    # Replace individual football player last names
    for i, word in enumerate(words):
        if word in player_last_names:
            normalized_words[i] = 'footballplayername'
    
    return normalized_words

def preprocess_text(text, hometeam, awayteam):
    # Lowercasing
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenization
    words = text.split()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    words = [word for word in words if not word.startswith('http')]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    # Apply country normalization if matchID is provided
    words = normalize_countries_with_teams(words, hometeam, awayteam)
    
    return ' '.join(words)

# Function to map team names to country codes
def map_team_to_country_code(team_name, country_code_mapping):
    return country_code_mapping.get(team_name, "unknown")


def remove_player_names(row, football_player_data):
    """
    Replace player names in the tweet with 'footballplayername'.
    """    
    # Tokenize the tweet
    words = row['Tweet'].split()
    
    # Normalize player names
    normalized_words = [
        'footballplayername' if word in football_player_data else word for word in words
    ]
    
    return ' '.join(normalized_words)



In [None]:
football_player_data = pd.read_pickle('grouped_football_player_last_names.pkl')
print(football_player_data)

{'ad': ['alavedra', 'alvarez'], 'ae': ['a', 'abaelaziz', 'abbas', 'abdalla', 'abdulbasit', 'abdulla', 'abdulrahman', 'adel', 'ahmad', 'ahmed', 'al-baloushi', 'aldhanhani', 'alghassani', 'alhammadi', 'alharbi', 'ali', 'aljasmi', 'almansoori', 'almentheri', 'almheiri', 'alshamsi', 'alzaabi', 'ameri', 'ammar', 'atiq', 'attas', 'autonne', 'aydh', 'ayman', 'azizi', 'baloushi', 'balushi', 'bandar', 'barman', 'bawazir', 'bilal', 'blooshi', 'butti', 'caio', 'dhawi', 'esam', 'essa', 'fadaq', 'fahad', 'fawzi', 'fuad', 'ghazy', 'hammadi', 'hamza', 'hashemi', 'hassan', 'hosani', 'hussain', 'ibrahim', 'idrees', 'issam', 'jaafar', 'jaber', 'jamil', 'juma', 'kaabi', 'kaidi', 'karbi', 'khairi', 'khalfan', 'khalil', 'khamis', 'khaseif', 'lashkari', 'lima', 'luanzinho', 'mabkhout', 'mahmoud', 'marzooq', 'matar', 'mazami', 'menhali', 'meqebaali', 'mohamad', 'mohamed', 'mohammad', 'mubarak', 'muhsin', 'murad', 'nader', 'naqbi', 'naser', 'rabeeh', 'ramadan', 'rashid', 'saadi', 'saeed', 'saleh', 'salem', 's

In [None]:
#####PREPROCESSING TRAIN DATA AND SAVING IT#####

# # Directories
input_folder = "train_tweets"
output_folder = "train_tweets_preprocessed"
os.makedirs(output_folder, exist_ok=True)  # Ensure the output folder exists

# Process each file in the input folder
csv_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".csv")]
for file_path in csv_files:
    # Load the CSV
    current_df = pd.read_csv(file_path)
    
    # Extract teams from filename
    filename = os.path.basename(file_path)
    home_team, away_team = extract_teams_from_filename(filename) 
    current_df['HomeTeam'] = home_team
    current_df['AwayTeam'] = away_team
    
    # Preprocess the tweets
    current_df['Tweet'] = current_df.swifter.apply(
        lambda row: preprocess_text(row['Tweet'], home_team, away_team), axis=1
    )

    # Update HomeTeam and AwayTeam to their corresponding country codes
    current_df['HomeTeamCode'] = current_df['HomeTeam'].swifter.apply(lambda x: map_team_to_country_code(x, country_code_mapping))
    current_df['AwayTeamCode'] = current_df['AwayTeam'].swifter.apply(lambda x: map_team_to_country_code(x, country_code_mapping))
    
    home_team_code = current_df['HomeTeamCode'].iloc[0]
    away_team_code = current_df['AwayTeamCode'].iloc[0]
    
    home_team_players = football_player_data.get(home_team_code, [])
    away_team_players = football_player_data.get(away_team_code, [])

    combined_player_names = home_team_players + away_team_players

    current_df['Tweet'] = current_df.swifter.apply(lambda row: remove_player_names(row, combined_player_names), axis=1)
    
    # Save the preprocessed data to the output folder
    output_path = os.path.join(output_folder, filename)
    current_df.to_csv(output_path, index=False)

Pandas Apply:   0%|          | 0/86843 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/86843 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/86843 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/86843 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/272389 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/272389 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/272389 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/272389 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/148298 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/148298 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/148298 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/148298 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/973985 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/973985 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/973985 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/973985 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/99192 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/99192 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/99192 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/99192 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/95108 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/95108 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/95108 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/95108 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/712525 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/712525 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/712525 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/712525 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/525725 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/525725 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/525725 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/525725 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/155549 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/155549 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/155549 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/155549 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/367899 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/367899 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/367899 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/367899 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/96834 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/96834 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/96834 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/96834 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/41539 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/41539 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/41539 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/41539 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/824241 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/824241 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/824241 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/824241 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/313803 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/313803 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/313803 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/313803 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/85675 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/85675 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/85675 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/85675 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/256445 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/256445 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/256445 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/256445 [00:00<?, ?it/s]

*PREPROCESSING ALL DATA AND SAVING IT*

Use this to make weights and tfidf, Could be more efficient. But eitherway nice to have preprocessed df saved.


In [25]:
#####PREPROCESSING EVAL DATA AND SAVING IT#####

# # Directories
input_folder = "eval_tweets"
output_folder = "eval_tweets_preprocessed"
os.makedirs(output_folder, exist_ok=True)  # Ensure the output folder exists

# Process each file in the input folder
csv_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".csv")]
for file_path in csv_files:
    # Load the CSV
    current_df = pd.read_csv(file_path)
    
    # Extract teams from filename
    filename = os.path.basename(file_path)
    home_team, away_team = extract_teams_from_filename(filename) 
    current_df['HomeTeam'] = home_team
    current_df['AwayTeam'] = away_team
    
    # Preprocess the tweets
    current_df['Tweet'] = current_df.swifter.apply(
        lambda row: preprocess_text(row['Tweet'], home_team, away_team), axis=1
    )

    # Update HomeTeam and AwayTeam to their corresponding country codes
    current_df['HomeTeamCode'] = current_df['HomeTeam'].swifter.apply(lambda x: map_team_to_country_code(x, country_code_mapping))
    current_df['AwayTeamCode'] = current_df['AwayTeam'].swifter.apply(lambda x: map_team_to_country_code(x, country_code_mapping))
    
    home_team_code = current_df['HomeTeamCode'].iloc[0]
    away_team_code = current_df['AwayTeamCode'].iloc[0]
    
    home_team_players = football_player_data.get(home_team_code, [])
    away_team_players = football_player_data.get(away_team_code, [])

    combined_player_names = home_team_players + away_team_players

    current_df['Tweet'] = current_df.swifter.apply(lambda row: remove_player_names(row, combined_player_names), axis=1)
    
    # Save the preprocessed data to the output folder
    output_path = os.path.join(output_folder, filename)
    current_df.to_csv(output_path, index=False)

Pandas Apply:   0%|          | 0/285804 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/285804 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/285804 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/285804 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/45024 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/45024 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/45024 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/45024 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/628698 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/628698 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/628698 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/628698 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/113402 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/113402 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/113402 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/113402 [00:00<?, ?it/s]

In [121]:
import os
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
MMscaler = MinMaxScaler()

# Directory containing the preprocessed CSV files
input_folder_train = "train_tweets_preprocessed_no_player"
input_folder_eval = "eval_tweets_preprocessed_no_player"


# List all CSV files in the input folder
csv_files_train = [os.path.join(input_folder_train, f) for f in os.listdir(input_folder_train) if f.endswith(".csv")]
csv_files_eval = [os.path.join(input_folder_eval, f) for f in os.listdir(input_folder_eval) if f.endswith(".csv")]

# Load each CSV file and concatenate them into one DataFrame
dataframes_train = [pd.read_csv(file) for file in csv_files_train]
dataframes_eval = [pd.read_csv(file) for file in csv_files_eval]

combined_train_df = pd.concat(dataframes_train, ignore_index=True)
combined_eval_df = pd.concat(dataframes_eval, ignore_index=True)

# Feature creation



loaded_df = pd.read_pickle('preprocessed_train.pkl')

# Add the length of each tweet as a feature
combined_train_df['TweetLength'] = combined_train_df['Tweet'].apply(len)
combined_train_df['TweetCount'] = combined_train_df.groupby(['MatchID', 'PeriodID', 'Timestamp'])['Timestamp'].transform('count')
combined_train_df['WordCount'] = combined_train_df['Tweet'].apply(lambda x: len(x.split()))

combined_eval_df['TweetLength'] = combined_eval_df['Tweet'].apply(len)
combined_eval_df['TweetCount'] = combined_eval_df.groupby(['MatchID', 'PeriodID', 'Timestamp'])['Timestamp'].transform('count')
combined_eval_df['WordCount'] = combined_eval_df['Tweet'].apply(lambda x: len(x.split()))


combined_train_df.to_pickle('preprocessed_train.pkl')
combined_eval_df.to_pickle('preprocessed_eval.pkl')


In [122]:
# Load the two pickle files into separate DataFrames
train_df = pd.read_pickle('preprocessed_train.pkl')
eval_df = pd.read_pickle('preprocessed_eval.pkl')

# Concatenate the DataFrames
full_df = pd.concat([train_df, eval_df], ignore_index=True)

# Save the combined DataFrame as a new pickle file
full_df.to_pickle('preprocessed_full.pkl')

In [123]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
# Compute TF-IDF weights for the corpus

# Optional: start from zero and fit on tweets
# Initialize TF-IDF Vectorizer
vectorizer_all_tweets = TfidfVectorizer(
    max_features=15000,           # Limit vocabulary size to 10,000 terms
    min_df=3,                     # Ignore terms in fewer than 3 documents
    #max_df=0.90,                  # Ignore overly frequent terms
    sublinear_tf=True,            # Apply logarithmic scaling to term frequencies     
    norm='l2',                    # L2 normalization
) 

# Fit the TF-IDF vectorizer on minute-level documents
vectorizer_all_tweets.fit(full_df['Tweet'])

# Save the vectorizer
with open("tfidf_vectorizer_all_tweets.pkl", "wb") as f:
    pickle.dump(vectorizer_all_tweets, f)

# Load pre-computed
with open('tfidf_vectorizer_all_tweets.pkl', 'rb') as f:
   vectorizer_all_tweets = pickle.load(f)

# Extract TF-IDF weights
tfidf_weights_all_tweets = dict(zip(vectorizer_all_tweets.get_feature_names_out(), vectorizer_all_tweets.idf_))


# Weighted average embeddings
def get_weighted_avg_embedding(tweet, model, vector_size=200, weights=tfidf_weights_all_tweets):
    words = tweet.split()
    word_vectors = [model[word] * weights.get(word, 1) for word in words if word in model]
    if not word_vectors:
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)


In [125]:
# Generate embeddings for each tweet
loaded_df = pd.read_pickle('preprocessed_train.pkl')

# vector_size = 200  # GloVe embedding dimension
tweet_vectors = loaded_df['Tweet'].swifter.apply(lambda tweet: get_weighted_avg_embedding(tweet, model=glove_model, vector_size=200, weights=tfidf_weights_all_tweets))
tweet_vectors = np.array(list(tweet_vectors), dtype=np.float32)

# Save the tweet vectors
with open("tweet_vectors_all_data.pkl", "wb") as f:
    pickle.dump(tweet_vectors, f)

print("Embeddings saved successfully!")

# Load the tweet vectors
with open("tweet_vectors_all_data.pkl", "rb") as f:
    loaded_tweet_vectors = pickle.load(f)

print("Embeddings loaded successfully!")
print("Loaded vectors shape:", loaded_tweet_vectors.shape)



Pandas Apply:   0%|          | 0/5312495 [00:00<?, ?it/s]

Embeddings saved successfully!
Embeddings loaded successfully!
Loaded vectors shape: (5312495, 200)


In [126]:
###### Use if no period features ######
tweet_df = pd.DataFrame(loaded_tweet_vectors)

# Attach the vectors into the original dataframe
period_features = pd.concat([loaded_df, tweet_df], axis=1)

# Drop the columns that are not useful anymore
period_features = period_features.drop(columns=['Timestamp', 'Tweet', 'HomeTeam', 'AwayTeam', 'HomeTeamCode', 'AwayTeamCode'])

print("X_train_reshaped shape:", period_features.shape)
# Group the tweets into their corresponding periods. This way we generate an average embedding vector for each period
period_features = period_features.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()

# Save the tweet vectors
with open("period_features.pkl", "wb") as f:
    pickle.dump(period_features, f)

print("Period features saved successfully!")

# Load the tweet vectors
with open("period_features.pkl", "rb") as f:
    loaded_period_features = pickle.load(f)

print("Period features loaded successfully!")
print("Loaded vectors shape:", loaded_period_features.shape)

X_train_reshaped shape: (5312495, 207)
Period features saved successfully!
Period features loaded successfully!
Loaded vectors shape: (2137, 207)


In [127]:
# We drop the non-numerical features and keep the embeddings values for each period
X = loaded_period_features.drop(columns=['EventType', 'MatchID', 'ID']).values
# We extract the labels of our training samples
y = loaded_period_features['EventType'].values


# One-hot encode labels
encoder = OneHotEncoder(sparse_output=False)
y_encoded = encoder.fit_transform(y.reshape(-1, 1))

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=SEED)

# Add a time step dimension to match the LSTM input shape
X_train_reshaped = X_train[:, None, :]  # Add a new axis for timesteps
X_test_reshaped = X_test[:, None, :]    # Add a new axis for timesteps

In [128]:


# Define the early stopping callback
early_stopping = EarlyStopping(
    monitor='val_loss',      # Monitor validation loss
    patience=5,              # Stop training if no improvement after 3 epochs
    restore_best_weights=True  # Restore the best weights when stopping
)

# Define the LSTM model with deterministic initializers
model = Sequential([
    tf.keras.layers.Input(shape=(1, X_train_reshaped.shape[2])),  
    LSTM(
        128, 
        return_sequences=False, 
        kernel_initializer=GlorotUniform(seed=SEED), 
        recurrent_initializer=Orthogonal(seed=SEED),
        bias_initializer='zeros'
    ),             
    Dense(y_encoded.shape[1], activation='softmax', kernel_initializer=GlorotUniform(seed=SEED))
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train_reshaped, y_train,
                    epochs=50,
                    batch_size=32,
                    validation_split=0.2,
                    callbacks=[early_stopping],  # Include the early stopping callback
                    verbose=1)

# Evaluate on the test set
test_loss, test_accuracy = model.evaluate(X_test_reshaped, y_test, verbose=1)

print(f"Test Accuracy: {test_accuracy:.4f}")

Epoch 1/50
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.5974 - loss: 0.6828 - val_accuracy: 0.5439 - val_loss: 0.6879
Epoch 2/50
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6110 - loss: 0.6495 - val_accuracy: 0.5906 - val_loss: 0.6667
Epoch 3/50
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6471 - loss: 0.6249 - val_accuracy: 0.6111 - val_loss: 0.6514
Epoch 4/50
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6589 - loss: 0.5988 - val_accuracy: 0.6520 - val_loss: 0.6339
Epoch 5/50
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6832 - loss: 0.5786 - val_accuracy: 0.6725 - val_loss: 0.6219
Epoch 6/50
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7162 - loss: 0.5630 - val_accuracy: 0.6842 - val_loss: 0.6157
Epoch 7/50
[1m43/43[0m [32m━━━━━━━━━

In [None]:
eval_data = pd.read_pickle('preprocessed_eval.pkl')
print(eval_data)

In [131]:
###### For Kaggle submission

predictions = []
dummy_predictions = []
# We read each file separately, we preprocess the tweets and then use the classifier to predict the labels.
# Finally, we concatenate all predictions into a list that will eventually be concatenated and exported
# to be submitted on Kaggle.

val_df = pd.read_pickle('preprocessed_eval.pkl')

tweet_vectors = val_df['Tweet'].swifter.apply(lambda tweet: get_weighted_avg_embedding(tweet, model=glove_model, vector_size=200, weights= tfidf_weights_all_tweets))

tweet_vectors = np.array(list(tweet_vectors), dtype=np.float32)

tweet_df = pd.DataFrame(tweet_vectors)

period_features_val = pd.concat([val_df, tweet_df], axis=1)
period_features_val = period_features_val.drop(columns=['Timestamp', 'Tweet', 'HomeTeam', 'AwayTeam', 'HomeTeamCode', 'AwayTeamCode'])
period_features_val = period_features_val.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()

X = period_features_val.drop(columns=['MatchID', 'ID']).values

# Reshape input for LSTM
X_reshaped = X[:, None, :]  # Add timestep dimension

preds = model.predict(X_reshaped)
preds = preds.argmax(axis=1)  # Convert probabilities to class indices
period_features_val['EventType'] = preds
predictions.append(period_features_val[['ID', 'EventType']])


pred_df = pd.concat(predictions)
pred_df.to_csv('LSTM_predictions.csv', index=False)



Pandas Apply:   0%|          | 0/1072928 [00:00<?, ?it/s]

[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step


In [None]:
import pickle
# Save TF-IDF trained on the training data
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [None]:
print(val_df)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd

# Load dataset
loaded_df = pd.read_pickle('preprocessed_train.pkl')

# Single vectorizer for consistent vocabulary
vectorizer = TfidfVectorizer(max_features=20000, stop_words='english', ngram_range=(1, 2))
X = vectorizer.fit_transform(loaded_df['Tweet'])

# Separate event and non-event tweets
event_X = X[loaded_df['EventType'] == 1]
non_event_X = X[loaded_df['EventType'] == 0]

# Compute mean TF-IDF scores for sparse matrices
event_tfidf_scores = np.array(event_X.mean(axis=0)).flatten()
non_event_tfidf_scores = np.array(non_event_X.mean(axis=0)).flatten()

# Get feature names
tfidf_words = vectorizer.get_feature_names_out()

# Compute differences and sort
tfidf_differences = event_tfidf_scores - non_event_tfidf_scores
tfidf_word_differences = sorted(zip(tfidf_words, tfidf_differences), key=lambda x: abs(x[1]), reverse=True)

# Output top words
print("Top words by TF-IDF difference:")
print(tfidf_word_differences[:10])


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2
import pandas as pd

# Convert tweets into a bag-of-words representation
vectorizer = CountVectorizer(max_features=5000, stop_words='english', binary=True)
X = vectorizer.fit_transform(loaded_df['Tweet'])  # Word presence (binary matrix)
y = loaded_df['EventType']  # Binary labels: 1 (event), 0 (no event)

# Perform chi-square test
chi2_scores, p_values = chi2(X, y)

# Create a DataFrame of words and chi-square scores
chi2_results = pd.DataFrame({
    'Word': vectorizer.get_feature_names_out(),
    'Chi2_Score': chi2_scores,
    'P_Value': p_values
}).sort_values(by='Chi2_Score', ascending=False)

# Display the top 10 words most associated with events
print("Top event-specific words by Chi2 score:")
print(chi2_results.head(10))

In [None]:
name_list = [
    'messi', 'ronaldo', 'neymar', 'beckham', 'villa', 'modric', 'suarez', 'griezmann',
    'mbappe', 'kroos', 'kane', 'iniesta', 'xavi', 'pogba', 'drogba', 'gerard', 'hummels',
    'ribery', 'salah', 'hazard', 'benzema', 'aguero', 'zlatan', 'kaka', 'rooney', 'degea',
    'ozil', 'bale', 'schweinsteiger', 'pirlo', 'terry', 'alves', 'pique', 'david', 'cristiano', 
    'kramer', 'klose', 'muller', 'lahm', 'neuer', 'gotze', 'reus', 'schurrle', 'kroos', 'boateng',
    'cromex', 'chicharito', 'torres', 'leroy', 'hernandez', 'christoph', 'vertonghen', 'john',
    'javier', 'fernandez', 'slimani', 'matshummels', 'romero', 'porgha', 'marquez', 'guajevilla',
    'honsui', 'cahill', 'neymarjr', 'porgha'
]


def remove_rows_with_names(df, name_list):
    # Filter the DataFrame to exclude rows where the "Word" column matches any name in the name list
    return df[~df['Word'].str.lower().isin(name_list)]


In [None]:
chi2_results = remove_rows_with_names(chi2_results, name_list)



In [None]:
print(chi2_results)

In [None]:
top_words = chi2_results[:62]

In [None]:
# Create a dictionary for the chi² scores
chi2_dict = dict(zip(top_words['Word'], top_words['Chi2_Score']))

# Compute weighted score features for each important word
for word in chi2_dict.keys():
    loaded_df[f'WeightedScore_{word}'] = loaded_df['Tweet'].apply(
        lambda tweet: tweet.split().count(word) * chi2_dict.get(word, 0)
    )

print("loaded_df shape:", loaded_df.shape)


In [None]:
loaded_df.to_pickle('preprocessed_train_extra_weights.pkl')

In [None]:
correlation = loaded_df[['TweetLength', 'WordCount']].corr()
print("Correlation between TweetLength and WordCount:")
print(correlation)


In [None]:
import matplotlib.pyplot as plt

plt.scatter(loaded_df['TweetLength'], loaded_df['WordCount'], alpha=0.5)
plt.xlabel('TweetLength')
plt.ylabel('WordCount')
plt.title('Relationship Between Tweet Length and Word Count')
plt.show()

In [None]:
# Average word length in a tweet
loaded_df['AvgWordLength'] = loaded_df['TweetLength'] / (loaded_df['WordCount'] + 1)  # Add 1 to avoid division by zero


In [81]:
import os
import pandas as pd
import re
import swifter

# Directories
folders = ["train_tweets", "eval_tweets"]

# Function to extract mentions, excluding those from retweets
def extract_mentions(text):
    # Find all mentions in the text
    mentions = re.findall(r'@\w+', text)
    # Exclude mentions in the format "RT @NAME:"
    filtered_mentions = [
        mention for mention in mentions if not re.search(rf'^RT {mention}:', text)
    ]
    return filtered_mentions

# List to store all mentions with HomeTeam and AwayTeam
all_mentions_data = []

# Process each file in the specified folders
for folder in folders:
    csv_files = [os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".csv")]
    for file_path in csv_files:
        # Extract teams from filename
        filename = os.path.basename(file_path)
        home_team, away_team = extract_teams_from_filename(filename)  # Assumes this function exists
        
        # Load the CSV
        current_df = pd.read_csv(file_path)
        
        # Add HomeTeam and AwayTeam to DataFrame
        current_df['HomeTeam'] = home_team
        current_df['AwayTeam'] = away_team
        
        # Extract mentions from each tweet using swifter
        current_df['Mentions'] = current_df['Tweet'].swifter.apply(extract_mentions)
        
        # Flatten mentions and include HomeTeam and AwayTeam for each mention
        all_mentions_data.extend(
            {
                'Mention': mention,
                'HomeTeam': row['HomeTeam'],
                'AwayTeam': row['AwayTeam']
            }
            for _, row in current_df.iterrows() for mention in row['Mentions']
        )

# Create a DataFrame from the mentions data
mentions_df = pd.DataFrame(all_mentions_data)

# Save the DataFrame as a pickle file
mentions_df.to_pickle("mentions_with_teams.pkl")

print("Mentions with teams saved to mentions_with_teams.pkl")


Pandas Apply:   0%|          | 0/86843 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/272389 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/148298 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/973985 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/99192 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/95108 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/712525 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/525725 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/155549 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/367899 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/96834 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/41539 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/824241 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/313803 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/85675 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/256445 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/285804 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/45024 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/628698 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/113402 [00:00<?, ?it/s]

Mentions with teams saved to mentions_with_teams.pkl


In [82]:
print(mentions_df)

               Mention   HomeTeam    AwayTeam
0          @theadamccx  Australia       Spain
1           @OSAussies  Australia       Spain
2           @StatsZone  Australia       Spain
3           @Socceroos  Australia       Spain
4          @VillasArmy  Australia       Spain
...                ...        ...         ...
1142201         @Deen8     Greece  IvoryCoast
1142202  @Naijablogger     Greece  IvoryCoast
1142203     @jayayensu     Greece  IvoryCoast
1142204   @bobbykemp81     Greece  IvoryCoast
1142205  @Naijablogger     Greece  IvoryCoast

[1142206 rows x 3 columns]


In [100]:
import pandas as pd
import re
import swifter

# Load the mentions DataFrame
mentions = pd.read_pickle("mentions_with_teams.pkl")

# Filter mentions based on their occurrences in a specific HomeTeam and AwayTeam combination
mentions['mention_count'] = mentions.groupby(['HomeTeam', 'AwayTeam', 'Mention'])['Mention'].transform('count')
mentions = mentions[mentions['mention_count'] > 10]

# Drop duplicates
mentions = mentions.drop_duplicates()

# Load the last names DataFrame
last_names_df = pd.read_csv("last_names.csv")

# Ensure the last names DataFrame contains 'last_name' and 'team' columns
if {'last_name', 'nationality'}.issubset(last_names_df.columns):
    # Add the 'is_name' column
    def is_name(mention, home_team, away_team):
        # Filter last names for the HomeTeam and AwayTeam
        team_last_names = last_names_df[last_names_df['nationality'].isin([home_team, away_team])]['last_name'].str.lower().tolist()
        # Check if any last name from these teams matches the mention
        return 1 if any(last_name in mention.lower() for last_name in team_last_names) else 0

    # Apply the function row-wise using swifter
    mentions['is_name'] = mentions.swifter.apply(
        lambda row: is_name(row['Mention'], row['HomeTeam'], row['AwayTeam']), axis=1
    )

    # Drop the temporary mention_count column
    mentions.drop(columns=['mention_count'], inplace=True)

    # Print the updated DataFrame
    print(mentions)

    # Optionally, save the updated DataFrame to a new pickle file
    mentions.to_pickle("mentions_with_is_name.pkl")

else:
    print("The last_names.csv file must contain 'last_name' and 'team' columns.")


Pandas Apply:   0%|          | 0/7799 [00:00<?, ?it/s]

                Mention   HomeTeam    AwayTeam  is_name
3            @Socceroos  Australia       Spain        0
5        @FutballTweets  Australia       Spain        0
6             @PReina25  Australia       Spain        1
11         @FutbolBible  Australia       Spain        0
12               @21LVA  Australia       Spain        0
...                 ...        ...         ...      ...
1137718  @Vine_Football     Greece  IvoryCoast        0
1137822      @idriselba     Greece  IvoryCoast        0
1137965     @PhilaUnion     Greece  IvoryCoast        0
1139110    @SeppBlatter     Greece  IvoryCoast        0
1139786             @AP     Greece  IvoryCoast        0

[7799 rows x 4 columns]


In [101]:
mentions_with_is_name = pd.read_pickle("mentions_with_is_name.pkl")
mentions_with_is_name = mentions_with_is_name
print(mentions_with_is_name)

                Mention   HomeTeam    AwayTeam  is_name
3            @Socceroos  Australia       Spain        0
5        @FutballTweets  Australia       Spain        0
6             @PReina25  Australia       Spain        1
11         @FutbolBible  Australia       Spain        0
12               @21LVA  Australia       Spain        0
...                 ...        ...         ...      ...
1137718  @Vine_Football     Greece  IvoryCoast        0
1137822      @idriselba     Greece  IvoryCoast        0
1137965     @PhilaUnion     Greece  IvoryCoast        0
1139110    @SeppBlatter     Greece  IvoryCoast        0
1139786             @AP     Greece  IvoryCoast        0

[7799 rows x 4 columns]


In [107]:
import pandas as pd

# Load the DataFrame
mentions_with_is_name = pd.read_pickle("mentions_with_is_name.pkl")

# Convert mentions to lowercase
mentions_with_is_name['Mention'] = mentions_with_is_name['Mention']

mentions_with_is_name = mentions_with_is_name[mentions_with_is_name['is_name'] != 0]


mentions_with_is_name = mentions_with_is_name[~mentions_with_is_name['Mention'].str.contains('Brazil', na=False)]
mentions_with_is_name = mentions_with_is_name[~mentions_with_is_name['Mention'].str.contains('soccer', na=False)]
mentions_with_is_name = mentions_with_is_name[~mentions_with_is_name['Mention'].str.contains('ball', na=False)]
mentions_with_is_name = mentions_with_is_name[~mentions_with_is_name['Mention'].str.contains('Twitter', na=False)]



# Print the updated DataFrame
print(mentions_with_is_name)

# Optionally, save the updated DataFrame to a new pickle file
mentions_with_is_name.to_pickle("mentions_without_football.pkl")

                  Mention     HomeTeam    AwayTeam  is_name
6               @PReina25    Australia       Spain        1
14           @Guaje7Villa    Australia       Spain        1
57            @19SCazorla    Australia       Spain        1
59         @BenHalloran23    Australia       Spain        1
70         @oliverbozanic    Australia       Spain        1
...                   ...          ...         ...      ...
1088958     @ShawnBerrios  Netherlands      Mexico        1
1096603  @AnthonyIbarra10  Netherlands      Mexico        1
1111115        @LaRAWRura  Netherlands      Mexico        1
1128712  @KoloKolotoure28       Greece  IvoryCoast        1
1129388  @GervinhOfficial       Greece  IvoryCoast        1

[564 rows x 4 columns]


In [116]:
mentions_with_is_name = pd.read_pickle("mentions_without_football.pkl")
mentions_with_is_name['PreprocessedMention'] = mentions_with_is_name.swifter.apply(
    lambda row: preprocess_text(row['Mention'], row['HomeTeam'], row['AwayTeam']), axis=1
)
mentions_with_is_name.to_pickle("mentions_processed.pkl")
print(mentions_with_is_name)

Pandas Apply:   0%|          | 0/564 [00:00<?, ?it/s]

                  Mention     HomeTeam    AwayTeam  is_name  \
6               @PReina25    Australia       Spain        1   
14           @Guaje7Villa    Australia       Spain        1   
57            @19SCazorla    Australia       Spain        1   
59         @BenHalloran23    Australia       Spain        1   
70         @oliverbozanic    Australia       Spain        1   
...                   ...          ...         ...      ...   
1088958     @ShawnBerrios  Netherlands      Mexico        1   
1096603  @AnthonyIbarra10  Netherlands      Mexico        1   
1111115        @LaRAWRura  Netherlands      Mexico        1   
1128712  @KoloKolotoure28       Greece  IvoryCoast        1   
1129388  @GervinhOfficial       Greece  IvoryCoast        1   

        PreprocessedMention  
6                    preina  
14               guajevilla  
57                 scazorla  
59              benhalloran  
70            oliverbozanic  
...                     ...  
1088958        shawnberrios  
109

In [119]:
##### PREPROCESSING TRAIN DATA AND SAVING IT #####
import os
import pandas as pd
import swifter

# Load mentions with preprocessed mentions
mentions_with_is_name = pd.read_pickle("mentions_processed.pkl")
combined_player_names = set(mentions_with_is_name["PreprocessedMention"].dropna().unique())  # Use set for faster lookup

# Directories
input_folder = "train_tweets_preprocessed"
output_folder = "train_tweets_preprocessed_no_player"
os.makedirs(output_folder, exist_ok=True)  # Ensure the output folder exists

# Define the remove_player_names function
def remove_player_names(row, football_player_data):
    """
    Replace player names in the tweet with 'footballplayername'.
    """
    if not isinstance(row['Tweet'], str):  # Ensure Tweet is a string
        return row['Tweet']
    
    words = row['Tweet'].split()  # Split tweet into words
    normalized_words = [
        'footballplayername' if word in football_player_data else word for word in words
    ]
    return ' '.join(normalized_words)

# Process each file in the input folder
csv_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".csv")]
for file_path in csv_files:
    print(f"Processing: {file_path}")
    current_df = pd.read_csv(file_path)
    
    if 'Tweet' not in current_df.columns:
        print(f"Skipping file {file_path} (missing 'Tweet' column).")
        continue

    # Apply the remove_player_names function
    current_df['Tweet'] = current_df.swifter.apply(
        lambda row: remove_player_names(row, combined_player_names), axis=1
    )
    
    # Save the preprocessed data to the output folder
    output_path = os.path.join(output_folder, os.path.basename(file_path))
    current_df.to_csv(output_path, index=False)
    print(f"Saved: {output_path}")


Processing: train_tweets_preprocessed/AustraliaSpain34.csv


Pandas Apply:   0%|          | 0/86843 [00:00<?, ?it/s]

Saved: train_tweets_preprocessed_no_player/AustraliaSpain34.csv
Processing: train_tweets_preprocessed/PortugalGhana58.csv


Pandas Apply:   0%|          | 0/272389 [00:00<?, ?it/s]

Saved: train_tweets_preprocessed_no_player/PortugalGhana58.csv
Processing: train_tweets_preprocessed/CameroonBrazil36.csv


Pandas Apply:   0%|          | 0/148298 [00:00<?, ?it/s]

Saved: train_tweets_preprocessed_no_player/CameroonBrazil36.csv
Processing: train_tweets_preprocessed/GermanyBrazil74.csv


Pandas Apply:   0%|          | 0/973985 [00:00<?, ?it/s]

Saved: train_tweets_preprocessed_no_player/GermanyBrazil74.csv
Processing: train_tweets_preprocessed/BelgiumSouthKorea59.csv


Pandas Apply:   0%|          | 0/99192 [00:00<?, ?it/s]

Saved: train_tweets_preprocessed_no_player/BelgiumSouthKorea59.csv
Processing: train_tweets_preprocessed/NetherlandsChile35.csv


Pandas Apply:   0%|          | 0/95108 [00:00<?, ?it/s]

Saved: train_tweets_preprocessed_no_player/NetherlandsChile35.csv
Processing: train_tweets_preprocessed/GermanyAlgeria67.csv


Pandas Apply:   0%|          | 0/712525 [00:00<?, ?it/s]

Saved: train_tweets_preprocessed_no_player/GermanyAlgeria67.csv
Processing: train_tweets_preprocessed/FranceGermany70.csv


Pandas Apply:   0%|          | 0/525725 [00:00<?, ?it/s]

Saved: train_tweets_preprocessed_no_player/FranceGermany70.csv
Processing: train_tweets_preprocessed/MexicoCroatia37.csv


Pandas Apply:   0%|          | 0/155549 [00:00<?, ?it/s]

Saved: train_tweets_preprocessed_no_player/MexicoCroatia37.csv
Processing: train_tweets_preprocessed/FranceNigeria66.csv


Pandas Apply:   0%|          | 0/367899 [00:00<?, ?it/s]

Saved: train_tweets_preprocessed_no_player/FranceNigeria66.csv
Processing: train_tweets_preprocessed/AustraliaNetherlands29.csv


Pandas Apply:   0%|          | 0/96834 [00:00<?, ?it/s]

Saved: train_tweets_preprocessed_no_player/AustraliaNetherlands29.csv
Processing: train_tweets_preprocessed/HondurasSwitzerland54.csv


Pandas Apply:   0%|          | 0/41539 [00:00<?, ?it/s]

Saved: train_tweets_preprocessed_no_player/HondurasSwitzerland54.csv
Processing: train_tweets_preprocessed/ArgentinaGermanyFinal77.csv


Pandas Apply:   0%|          | 0/824241 [00:00<?, ?it/s]

Saved: train_tweets_preprocessed_no_player/ArgentinaGermanyFinal77.csv
Processing: train_tweets_preprocessed/ArgentinaBelgium72.csv


Pandas Apply:   0%|          | 0/313803 [00:00<?, ?it/s]

Saved: train_tweets_preprocessed_no_player/ArgentinaBelgium72.csv
Processing: train_tweets_preprocessed/USASlovenia2010.csv


Pandas Apply:   0%|          | 0/85675 [00:00<?, ?it/s]

Saved: train_tweets_preprocessed_no_player/USASlovenia2010.csv
Processing: train_tweets_preprocessed/GermanyUSA57.csv


Pandas Apply:   0%|          | 0/256445 [00:00<?, ?it/s]

Saved: train_tweets_preprocessed_no_player/GermanyUSA57.csv


In [120]:
# # Directories

##### PREPROCESSING TRAIN DATA AND SAVING IT #####
import os
import pandas as pd
import swifter

# Load mentions with preprocessed mentions
mentions_with_is_name = pd.read_pickle("mentions_processed.pkl")
combined_player_names = set(mentions_with_is_name["PreprocessedMention"].dropna().unique())  # Use set for faster lookup

# Directories
input_folder = "eval_tweets_preprocessed"
output_folder = "eval_tweets_preprocessed_no_player"
os.makedirs(output_folder, exist_ok=True)  # Ensure the output folder exists

# Define the remove_player_names function
def remove_player_names(row, football_player_data):
    """
    Replace player names in the tweet with 'footballplayername'.
    """
    if not isinstance(row['Tweet'], str):  # Ensure Tweet is a string
        return row['Tweet']
    
    words = row['Tweet'].split()  # Split tweet into words
    normalized_words = [
        'footballplayername' if word in football_player_data else word for word in words
    ]
    return ' '.join(normalized_words)

# Process each file in the input folder
csv_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".csv")]
for file_path in csv_files:
    print(f"Processing: {file_path}")
    current_df = pd.read_csv(file_path)
    
    if 'Tweet' not in current_df.columns:
        print(f"Skipping file {file_path} (missing 'Tweet' column).")
        continue

    # Apply the remove_player_names function
    current_df['Tweet'] = current_df.swifter.apply(
        lambda row: remove_player_names(row, combined_player_names), axis=1
    )
    
    # Save the preprocessed data to the output folder
    output_path = os.path.join(output_folder, os.path.basename(file_path))
    current_df.to_csv(output_path, index=False)
    print(f"Saved: {output_path}")


Processing: eval_tweets_preprocessed/GermanyGhana32.csv


Pandas Apply:   0%|          | 0/285804 [00:00<?, ?it/s]

Saved: eval_tweets_preprocessed_no_player/GermanyGhana32.csv
Processing: eval_tweets_preprocessed/GermanySerbia2010.csv


Pandas Apply:   0%|          | 0/45024 [00:00<?, ?it/s]

Saved: eval_tweets_preprocessed_no_player/GermanySerbia2010.csv
Processing: eval_tweets_preprocessed/NetherlandsMexico64.csv


Pandas Apply:   0%|          | 0/628698 [00:00<?, ?it/s]

Saved: eval_tweets_preprocessed_no_player/NetherlandsMexico64.csv
Processing: eval_tweets_preprocessed/GreeceIvoryCoast44.csv


Pandas Apply:   0%|          | 0/113402 [00:00<?, ?it/s]

Saved: eval_tweets_preprocessed_no_player/GreeceIvoryCoast44.csv
