In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import OneHotEncoder
import gensim.downloader as api
import re
import swifter
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import GlorotUniform, Orthogonal
import random
# Enable tqdm for pandas
tqdm.pandas()

# Ensure Reproducibility
import random
# Set seeds for reproducibility
SEED = 42

# Python's built-in random
random.seed(SEED)

# NumPy
np.random.seed(SEED)

# TensorFlow
tf.random.set_seed(SEED)

# Set Python hash seed
os.environ['PYTHONHASHSEED'] = str(SEED)

# Configure TensorFlow for deterministic operations
tf.keras.utils.set_random_seed(SEED)  # Sets all random seeds for the program (Python, NumPy, and TensorFlow)
tf.config.experimental.enable_op_determinism()  # Enable deterministic operations in TensorFlow

# If using GPU, you might also want to set these:
if tf.config.list_physical_devices('GPU'):
    # Force TensorFlow to use deterministic GPU operations
    tf.config.experimental.enable_op_determinism()
    # Limit GPU memory growth
    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # Limit to one GPU if using multiple GPUs
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'

2024-12-11 13:28:40.593488: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load GloVe model
glove_model = api.load("glove-twitter-200")  # 200-dimensional GloVe embeddings

In [3]:
# Country list for team extraction
country_list = [
    "Argentina", "Belgium", "Germany", "Serbia", "Greece", "IvoryCoast", 
    "Netherlands", "Mexico", "Australia", "Spain", "SouthKorea", 
    "Cameroon", "Brazil", "France", "Nigeria", "Algeria", "USA", 
    "Honduras", "Switzerland", "Croatia", "Chile", "Portugal", 
    "Ghana", "Slovenia"
]

country_variations = {
    'argentina': ['argentina', 'arg', 'argentine', 'argies', 'albiceleste', 'argentinian', 'argentinos', 'argentinas'],
    'australia': ['australia', 'aus', 'aussie', 'aussies', 'socceroos', 'oz', 'straya', 'australian', 'au'],
    'belgium': ['belgium', 'bel', 'belgique', 'belgie', 'belgian', 'belgians', 'red devils', 'diables rouges'],
    'brazil': ['brazil', 'bra', 'brasil', 'bresil', 'brazilian', 'brazilians', 'selecao', 'canarinho', 'verde amarela', 'samba boys'],
    'cameroon': ['cameroon', 'cmr', 'cameroun', 'camerounais', 'indomitable lions', 'lions'],
    'france': ['france', 'fra', 'french', 'les bleus', 'tricolore', 'tricolores', 'equipe de france', 'allez les bleus'],
    'honduras': ['honduras', 'hon', 'honduran', 'hondurans', 'los catrachos', 'catrachos', 'la h'],
    'portugal': ['portugal', 'por', 'portuguese', 'selecao das quinas', 'seleccao', 'navegadores', 'team portugal'],
    'spain': ['spain', 'esp', 'espana', 'espania', 'spanish', 'la roja', 'furia roja', 'la furia', 'la seleccion'],
    'southkorea': ['south korea', 'korea', 'kor', 'skorea', 'korean', 'koreans', 'taeguk warriors', 'warriors'],
    'switzerland': ['switzerland', 'sui', 'suisse', 'schweiz', 'swiss', 'nati', 'rossocrociati', 'a team'],
    'usa': ['usa', 'united states', 'america', 'united states of america', 'us', 'usa', 'usmnt', 'americans', 'american', 'yanks', 'uncle sam', 'stars and stripes', 'team usa'],
    'ghana': ['ghana', 'gha', 'ghanaian', 'ghanaians', 'black stars', 'stars'],
    'netherlands': ['netherlands', 'ned', 'holland', 'dutch', 'oranje', 'flying dutchmen', 'orange', 'clockwork orange', 'nederlands'],
    'germany': ['germany', 'ger', 'alemania', 'deutschland', 'german', 'germans', 'die mannschaft', 'nationalelf', 'deu'],
    'iran': ['iran', 'irn', 'iranian', 'iranians', 'team melli', 'persian stars'],
    'nigeria': ['nigeria', 'nga', 'naija', 'super eagles', 'eagles', 'nigerian', 'nigerians', 'green eagles'],
    'algeria': ['algeria', 'alg', 'algerian', 'algerians', 'fennecs', 'desert foxes', 'les verts'],
    'croatia': ['croatia', 'cro', 'hrvatska', 'hrv', 'croatian', 'croatians', 'vatreni', 'blazers', 'kockasti'],
    'chile': ['chile', 'chi', 'chilean', 'chileans', 'la roja', 'team chile'],
    'slovenia': ['slovenia', 'svn', 'slovenian', 'slovenians', 'slovenski', 'boys'],
    'serbia': ['serbia', 'srb', 'serbian', 'serbians', 'beli orlovi', 'white eagles', 'orlovi'],
    'greece': ['greece', 'gre', 'greek', 'greeks', 'piratiko', 'ethniki', 'galanolefki'],
    'ivorycoast': ['ivory coast', 'civ', 'cote divoire', 'cotedivoire', 'ivorians', 'les elephants', 'elephants', 'ivory'],
    'mexico': ['mexico', 'mex', 'mexiko', 'mexican', 'mexicans', 'el tri', 'tricolor', 'aztecas', 'el tricolor', 'verde']
}



In [4]:
# Function to compute the average word vector for a tweet
def get_avg_embedding(tweet, model, vector_size=200):
    words = tweet.split()  # Tokenize by whitespace
    word_vectors = [model[word] for word in words if word in model]
    if not word_vectors:  # If no words in the tweet are in the vocabulary, return a zero vector
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)

def extract_teams_from_filename(filename):
    # Remove numbers and file extension
    base_name = re.sub(r'\d+\.csv$', '', filename)
    
    # Identify teams from the predefined country list
    teams = [country for country in country_list if country in base_name]
    
    if len(teams) >= 2:
        return teams[0], teams[1]
    elif len(teams) == 1:
        return teams[0], "Unknown"
    else:
        return "Unknown", "Unknown"
    
def normalize_countries_with_teams(words, home_team, away_team):
    # Text is already preprocessed and split into words
    normalized_words = words.copy()
    
    # Replace individual country mentions
    for i, word in enumerate(words):
        # Check if it's home team
        if word in country_variations.get(home_team.lower(), []):
            normalized_words[i] = 'hometeam'
        # Check if it's away team
        elif word in country_variations.get(away_team.lower(), []):
            normalized_words[i] = 'awayteam'
        else:
            # Check if it's any other country
            for country, variations in country_variations.items():
                if word in variations and country.lower() not in [home_team.lower(), away_team.lower()]:
                    normalized_words[i] = 'othercountry'
                    break
    
    return normalized_words

def normalize_football_player_names_with_country(words, country, football_player_data):
    """
    Normalize football player names in a list of words to 'footballplayername', filtering by country.

    Args:
        words (list of str): List of words (preprocessed text).
        country (str): Country code of the players to focus on.
        football_player_data (dict): Dictionary mapping country codes to lists of last names.

    Returns:
        list of str: Words with football player names normalized to 'footballplayername'.
    """
    normalized_words = words.copy()
    
    # Get the list of player names for the given country
    player_last_names = football_player_data.get(country, [])
    
    # Replace individual football player last names
    for i, word in enumerate(words):
        if word in player_last_names:
            normalized_words[i] = 'footballplayername'
    
    return normalized_words

def preprocess_text(text, hometeam, awayteam):
    # Lowercasing
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenization
    words = text.split()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    words = [word for word in words if not word.startswith('http')]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    # Apply country normalization if matchID is provided
    words = normalize_countries_with_teams(words, hometeam, awayteam)
    
    return ' '.join(words)

In [None]:
# Load data
folder_path = "train_tweets"
csv_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith(".csv")]
# First load all data with team information
dataframes = []
for file_path in csv_files:
    # Load the CSV
    current_df = pd.read_csv(file_path)
    
    # Get filename and extract teams
    filename = os.path.basename(file_path)
    home_team, away_team = extract_teams_from_filename(filename)
    
    # Add team columns
    current_df['HomeTeam'] = home_team
    current_df['AwayTeam'] = away_team
    
    dataframes.append(current_df)

# Concatenate all dataframes
df = pd.concat(dataframes, ignore_index=True)


# Then apply regular preprocessing with MatchID
df['Tweet'] = df.swifter.apply(lambda row: preprocess_text(row['Tweet'], row['HomeTeam'], row['AwayTeam']), axis=1)

df.to_pickle('preprocessed_train.pkl')

100%|██████████| 5056050/5056050 [11:03<00:00, 7622.41it/s] 


In [None]:
import pandas as pd
import pickle
from tqdm import tqdm
tqdm.pandas()  # For progress tracking

# Load the preprocessed last names grouped by country
grouped_last_names_pickle_path = 'grouped_football_player_last_names.pkl'
with open(grouped_last_names_pickle_path, 'rb') as pickle_file:
    football_player_data = pickle.load(pickle_file)
    

# Define the country code mapping
country_code_mapping = {
    "Argentina": "ar", "Belgium": "be", "Germany": "de", "Serbia": "rs", "Greece": "gr",
    "IvoryCoast": "ci", "Netherlands": "nl", "Mexico": "mx", "Australia": "au", "Spain": "es",
    "SouthKorea": "kr", "Cameroon": "cm", "Brazil": "br", "France": "fr", "Nigeria": "ng",
    "Algeria": "dz", "USA": "us", "Honduras": "hn", "Switzerland": "ch", "Croatia": "hr",
    "Chile": "cl", "Portugal": "pt", "Ghana": "gh", "Slovenia": "si"
}

# Load preprocessed tweets with HomeTeam and AwayTeam information
df = pd.read_pickle('preprocessed_train.pkl')

# Function to map team names to country codes
def map_team_to_country_code(team_name, country_code_mapping):
    return country_code_mapping.get(team_name, "unknown")

# Update HomeTeam and AwayTeam to their corresponding country codes
df['HomeTeamCode'] = df['HomeTeam'].apply(lambda x: map_team_to_country_code(x, country_code_mapping))
df['AwayTeamCode'] = df['AwayTeam'].apply(lambda x: map_team_to_country_code(x, country_code_mapping))

def remove_player_names(row, football_player_data):
    """
    Replace player names in the tweet with 'footballplayername'.
    """
    home_team_code = row['HomeTeamCode']
    away_team_code = row['AwayTeamCode']
    
    current_df['HomeTeam'] = home_team
    current_df['AwayTeam'] = away_team
    
    # Combine player names for home and away teams
    relevant_names = set(football_player_data.get(home_team_code, [])) | set(football_player_data.get(away_team_code, []))
    
    # Tokenize the tweet
    words = row['Tweet'].split()
    
    # Normalize player names
    normalized_words = [
        'footballplayername' if word in relevant_names else word for word in words
    ]
    
    return ' '.join(normalized_words)

# Apply the normalization to the tweets
df['Tweet'] = df.progress_apply(lambda row: remove_player_names(row, football_player_data), axis=1)

# Save the updated DataFrame
df.to_pickle('preprocessed_train_with_names_removed.pkl')


In [9]:
# Feature creation
from sklearn.preprocessing import MinMaxScaler

MMscaler = MinMaxScaler()


loaded_df = pd.read_pickle('preprocessed_train_with_names_removed.pkl')

# Add the length of each tweet as a feature
loaded_df['TweetLength'] = loaded_df['Tweet'].apply(len)

# Add a simple tweet count feature
loaded_df['TweetCount'] = loaded_df.groupby(['MatchID', 'PeriodID', 'Timestamp'])['Timestamp'].transform('count')

# Add word count as a feature
loaded_df['WordCount'] = loaded_df['Tweet'].apply(lambda x: len(x.split()))

loaded_df['TweetLength'] = MMscaler.fit_transform(loaded_df['TweetLength'].values.reshape(-1, 1))
# Add 1 to avoid division by zero






print(loaded_df.head())

    ID  MatchID  PeriodID  EventType      Timestamp  \
0  2_0        2         0          0  1403538600000   
1  2_0        2         0          0  1403538600000   
2  2_0        2         0          0  1403538600000   
3  2_0        2         0          0  1403538600000   
4  2_0        2         0          0  1403538600000   

                                               Tweet   HomeTeam AwayTeam  \
0  rt soccerdotcom awayteam beat hometeam well gi...  Australia    Spain   
1  visit sitep official web site spanis real stat...  Australia    Spain   
2  rt soccerdotcom awayteam beat hometeam well gi...  Australia    Spain   
3  rt worldsoccershop winner hometeam v awayteam ...  Australia    Spain   
4  rt soccerdotcom hometeam beat awayteam well gi...  Australia    Spain   

   AvgWordLength  TweetCount HomeTeamCode AwayTeamCode  TweetLength  WordCount  
0       0.072464           7           au           es     0.340136         16  
1       0.081781           7           au         

*PREPROCESSING ALL DATA AND SAVING IT*

Use this to make weights and tfidf, Could be more efficient. But eitherway nice to have preprocessed df saved.


In [13]:
#####PREPROCESSING ALL DATA AND SAVING IT#####

# # Directories
input_folder = "eval_tweets"
output_folder = "eval_tweets_preprocessed"
os.makedirs(output_folder, exist_ok=True)  # Ensure the output folder exists

# Process each file in the input folder
csv_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".csv")]
for file_path in csv_files:
    # Load the CSV
    current_df = pd.read_csv(file_path)
    
    # Extract teams from filename
    filename = os.path.basename(file_path)
    home_team, away_team = extract_teams_from_filename(filename)  # Use your existing function
    
    # Preprocess the tweets
    current_df['Tweet'] = current_df.swifter.apply(
        lambda row: preprocess_text(row['Tweet'], home_team, away_team), axis=1
    )
    
    # Update HomeTeam and AwayTeam to their corresponding country codes
    current_df['HomeTeamCode'] = current_df['HomeTeam'].apply(lambda x: map_team_to_country_code(x, country_code_mapping))
    current_df['AwayTeamCode'] = current_df['AwayTeam'].apply(lambda x: map_team_to_country_code(x, country_code_mapping))

    current_df['Tweet'] = current_df.progress_apply(lambda row: remove_player_names(row, football_player_data), axis=1)

    # Save the preprocessed data to the output folder
    output_path = os.path.join(output_folder, filename)
    current_df.to_csv(output_path, index=False)



Pandas Apply:   0%|          | 0/285804 [00:00<?, ?it/s]

KeyError: 'HomeTeam'

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# # Folder containing preprocessed tweets
# input_folder = "all_tweets_preprocessed"

# # Collect all tweets grouped by minute across all files
# tweet_documents = []

# # Process each preprocessed file
# csv_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".csv")]
# for file_path in csv_files:
#     # Load the preprocessed CSV
#     df_all_tweets = pd.read_csv(file_path)
#     tweet_documents.extend(df_all_tweets['Tweet'].tolist())
    

# # Compute TF-IDF weights for the corpus

# # Optional: start from zero and fit on tweets
# # Initialize TF-IDF Vectorizer
# vectorizer_all_tweets = TfidfVectorizer(
#     max_features=15000,           # Limit vocabulary size to 10,000 terms
#     min_df=3,                     # Ignore terms in fewer than 3 documents
#     #max_df=0.90,                  # Ignore overly frequent terms
#     sublinear_tf=True,            # Apply logarithmic scaling to term frequencies     
#     norm='l2',                    # L2 normalization
# ) # Adjust max_features if needed
# # Fit the TF-IDF vectorizer on minute-level documents
# vectorizer_all_tweets.fit(tweet_documents)

# # Save the vectorizer
# with open("tfidf_vectorizer_all_tweets.pkl", "wb") as f:
#     pickle.dump(vectorizer_all_tweets, f)

# Load pre-computed
with open('tfidf_vectorizer_all_tweets.pkl', 'rb') as f:
   vectorizer_all_tweets = pickle.load(f)

# Extract TF-IDF weights
tfidf_weights_all_tweets = dict(zip(vectorizer_all_tweets.get_feature_names_out(), vectorizer_all_tweets.idf_))


# Weighted average embeddings
def get_weighted_avg_embedding(tweet, model, vector_size=200, weights=tfidf_weights_all_tweets):
    words = tweet.split()
    word_vectors = [model[word] * weights.get(word, 1) for word in words if word in model]
    if not word_vectors:
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)


In [None]:
# # Generate embeddings for each tweet
# # vector_size = 200  # GloVe embedding dimension
# tweet_vectors = loaded_df['Tweet'].swifter.apply(lambda tweet: get_weighted_avg_embedding(tweet, model=glove_model, vector_size=200, weights=tfidf_weights_all_tweets))
# tweet_vectors = np.array(list(tweet_vectors), dtype=np.float32)

# # Save the tweet vectors
# with open("tweet_vectors_all_data.pkl", "wb") as f:
#     pickle.dump(tweet_vectors, f)

# print("Embeddings saved successfully!")

# Load the tweet vectors
with open("tweet_vectors_all_data.pkl", "rb") as f:
    loaded_tweet_vectors = pickle.load(f)

print("Embeddings loaded successfully!")
print("Loaded vectors shape:", loaded_tweet_vectors.shape)



In [None]:
###### Use if no period features ######
tweet_df = pd.DataFrame(loaded_tweet_vectors)

# Attach the vectors into the original dataframe
period_features = pd.concat([loaded_df, tweet_df], axis=1)

# Drop the columns that are not useful anymore
period_features = period_features.drop(columns=['Timestamp', 'Tweet', 'HomeTeam', 'AwayTeam', 'HomeTeamCode', 'AwayTeamCode'])

print("X_train_reshaped shape:", period_features.shape)
# Group the tweets into their corresponding periods. This way we generate an average embedding vector for each period
period_features = period_features.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()

# Save the tweet vectors
with open("period_features.pkl", "wb") as f:
    pickle.dump(period_features, f)

print("Period features saved successfully!")

# Load the tweet vectors
with open("period_features.pkl", "rb") as f:
    loaded_period_features = pickle.load(f)

print("Period features loaded successfully!")
print("Loaded vectors shape:", loaded_period_features.shape)

In [None]:
# We drop the non-numerical features and keep the embeddings values for each period
X = loaded_period_features.drop(columns=['EventType', 'MatchID', 'ID']).values
# We extract the labels of our training samples
y = loaded_period_features['EventType'].values


# One-hot encode labels
encoder = OneHotEncoder(sparse_output=False)
y_encoded = encoder.fit_transform(y.reshape(-1, 1))

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=SEED)

# Add a time step dimension to match the LSTM input shape
X_train_reshaped = X_train[:, None, :]  # Add a new axis for timesteps
X_test_reshaped = X_test[:, None, :]    # Add a new axis for timesteps

In [None]:


# Define the early stopping callback
early_stopping = EarlyStopping(
    monitor='val_loss',      # Monitor validation loss
    patience=5,              # Stop training if no improvement after 3 epochs
    restore_best_weights=True  # Restore the best weights when stopping
)

# Define the LSTM model with deterministic initializers
model = Sequential([
    tf.keras.layers.Input(shape=(1, X_train_reshaped.shape[2])),  
    LSTM(
        128, 
        return_sequences=False, 
        kernel_initializer=GlorotUniform(seed=SEED), 
        recurrent_initializer=Orthogonal(seed=SEED),
        bias_initializer='zeros'
    ),             
    Dense(y_encoded.shape[1], activation='softmax', kernel_initializer=GlorotUniform(seed=SEED))
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train_reshaped, y_train,
                    epochs=50,
                    batch_size=32,
                    validation_split=0.2,
                    callbacks=[early_stopping],  # Include the early stopping callback
                    verbose=1)

# Evaluate on the test set
test_loss, test_accuracy = model.evaluate(X_test_reshaped, y_test, verbose=1)

print(f"Test Accuracy: {test_accuracy:.4f}")

In [None]:
###### For Kaggle submission

predictions = []
dummy_predictions = []
# We read each file separately, we preprocess the tweets and then use the classifier to predict the labels.
# Finally, we concatenate all predictions into a list that will eventually be concatenated and exported
# to be submitted on Kaggle.
for fname in sorted(os.listdir("eval_tweets")):
    val_df = pd.read_csv("eval_tweets/" + fname)
    
    home_team, away_team = extract_teams_from_filename(fname)
    val_df['HomeTeam'] = home_team
    val_df['AwayTeam'] = away_team
    
    val_df['Tweet'] = val_df.swifter.apply(lambda row: preprocess_text(row['Tweet'], row['HomeTeam'], row['AwayTeam']), axis=1)

    # Feature creation
    # Add the length of each tweet as a feature
    val_df['TweetLength'] = val_df['Tweet'].apply(len)
    
    # Add a simple tweet count feature
    val_df['TweetCount'] = val_df.groupby(['MatchID', 'PeriodID', 'Timestamp'])['Timestamp'].transform('count')
    
    # Add word count as a feature
    val_df['WordCount'] = val_df['Tweet'].apply(lambda x: len(x.split()))

    tweet_vectors = val_df['Tweet'].swifter.apply(lambda tweet: get_weighted_avg_embedding(tweet, model=glove_model, vector_size=200, weights= tfidf_weights_all_tweets))

    tweet_vectors = np.array(list(tweet_vectors), dtype=np.float32)

    tweet_df = pd.DataFrame(tweet_vectors)

    period_features_val = pd.concat([val_df, tweet_df], axis=1)
    period_features_val = period_features_val.drop(columns=['Timestamp', 'Tweet', 'HomeTeam', 'AwayTeam'])
    period_features_val = period_features_val.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()

    X = period_features_val.drop(columns=['MatchID', 'ID']).values

    # Reshape input for LSTM
    X_reshaped = X[:, None, :]  # Add timestep dimension

    preds = model.predict(X_reshaped)
    preds = preds.argmax(axis=1)  # Convert probabilities to class indices
    period_features_val['EventType'] = preds
    predictions.append(period_features_val[['ID', 'EventType']])


pred_df = pd.concat(predictions)
pred_df.to_csv('LSTM_predictions.csv', index=False)



In [None]:
import pickle
# Save TF-IDF trained on the training data
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [None]:
print(val_df)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd

# Load dataset
loaded_df = pd.read_pickle('preprocessed_train.pkl')

# Single vectorizer for consistent vocabulary
vectorizer = TfidfVectorizer(max_features=20000, stop_words='english', ngram_range=(1, 2))
X = vectorizer.fit_transform(loaded_df['Tweet'])

# Separate event and non-event tweets
event_X = X[loaded_df['EventType'] == 1]
non_event_X = X[loaded_df['EventType'] == 0]

# Compute mean TF-IDF scores for sparse matrices
event_tfidf_scores = np.array(event_X.mean(axis=0)).flatten()
non_event_tfidf_scores = np.array(non_event_X.mean(axis=0)).flatten()

# Get feature names
tfidf_words = vectorizer.get_feature_names_out()

# Compute differences and sort
tfidf_differences = event_tfidf_scores - non_event_tfidf_scores
tfidf_word_differences = sorted(zip(tfidf_words, tfidf_differences), key=lambda x: abs(x[1]), reverse=True)

# Output top words
print("Top words by TF-IDF difference:")
print(tfidf_word_differences[:10])


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2
import pandas as pd

# Convert tweets into a bag-of-words representation
vectorizer = CountVectorizer(max_features=5000, stop_words='english', binary=True)
X = vectorizer.fit_transform(loaded_df['Tweet'])  # Word presence (binary matrix)
y = loaded_df['EventType']  # Binary labels: 1 (event), 0 (no event)

# Perform chi-square test
chi2_scores, p_values = chi2(X, y)

# Create a DataFrame of words and chi-square scores
chi2_results = pd.DataFrame({
    'Word': vectorizer.get_feature_names_out(),
    'Chi2_Score': chi2_scores,
    'P_Value': p_values
}).sort_values(by='Chi2_Score', ascending=False)

# Display the top 10 words most associated with events
print("Top event-specific words by Chi2 score:")
print(chi2_results.head(10))

In [None]:
name_list = [
    'messi', 'ronaldo', 'neymar', 'beckham', 'villa', 'modric', 'suarez', 'griezmann',
    'mbappe', 'kroos', 'kane', 'iniesta', 'xavi', 'pogba', 'drogba', 'gerard', 'hummels',
    'ribery', 'salah', 'hazard', 'benzema', 'aguero', 'zlatan', 'kaka', 'rooney', 'degea',
    'ozil', 'bale', 'schweinsteiger', 'pirlo', 'terry', 'alves', 'pique', 'david', 'cristiano', 
    'kramer', 'klose', 'muller', 'lahm', 'neuer', 'gotze', 'reus', 'schurrle', 'kroos', 'boateng',
    'cromex', 'chicharito', 'torres', 'leroy', 'hernandez', 'christoph', 'vertonghen', 'john',
    'javier', 'fernandez', 'slimani', 'matshummels', 'romero', 'porgha', 'marquez', 'guajevilla',
    'honsui', 'cahill', 'neymarjr', 'porgha'
]


def remove_rows_with_names(df, name_list):
    # Filter the DataFrame to exclude rows where the "Word" column matches any name in the name list
    return df[~df['Word'].str.lower().isin(name_list)]


In [None]:
chi2_results = remove_rows_with_names(chi2_results, name_list)



In [None]:
print(chi2_results)

In [None]:
top_words = chi2_results[:62]

In [None]:
# Create a dictionary for the chi² scores
chi2_dict = dict(zip(top_words['Word'], top_words['Chi2_Score']))

# Compute weighted score features for each important word
for word in chi2_dict.keys():
    loaded_df[f'WeightedScore_{word}'] = loaded_df['Tweet'].apply(
        lambda tweet: tweet.split().count(word) * chi2_dict.get(word, 0)
    )

print("loaded_df shape:", loaded_df.shape)


In [None]:
loaded_df.to_pickle('preprocessed_train_extra_weights.pkl')

In [None]:
correlation = loaded_df[['TweetLength', 'WordCount']].corr()
print("Correlation between TweetLength and WordCount:")
print(correlation)


In [None]:
import matplotlib.pyplot as plt

plt.scatter(loaded_df['TweetLength'], loaded_df['WordCount'], alpha=0.5)
plt.xlabel('TweetLength')
plt.ylabel('WordCount')
plt.title('Relationship Between Tweet Length and Word Count')
plt.show()

In [None]:
# Average word length in a tweet
loaded_df['AvgWordLength'] = loaded_df['TweetLength'] / (loaded_df['WordCount'] + 1)  # Add 1 to avoid division by zero
