In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import OneHotEncoder
import gensim.downloader as api
import re
import swifter
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import GlorotUniform, Orthogonal
import random
# Enable tqdm for pandas
tqdm.pandas()

# Ensure Reproducibility
import random
# Set seeds for reproducibility
SEED = 42

# Python's built-in random
random.seed(SEED)

# NumPy
np.random.seed(SEED)

# TensorFlow
tf.random.set_seed(SEED)

# Set Python hash seed
os.environ['PYTHONHASHSEED'] = str(SEED)

# Configure TensorFlow for deterministic operations
tf.keras.utils.set_random_seed(SEED)  # Sets all random seeds for the program (Python, NumPy, and TensorFlow)
tf.config.experimental.enable_op_determinism()  # Enable deterministic operations in TensorFlow

# If using GPU, you might also want to set these:
if tf.config.list_physical_devices('GPU'):
    # Force TensorFlow to use deterministic GPU operations
    tf.config.experimental.enable_op_determinism()
    # Limit GPU memory growth
    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)

os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # Limit to one GPU if using multiple GPUs
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'

2024-12-09 17:27:36.453988: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load GloVe model
glove_model = api.load("glove-twitter-200")  # 200-dimensional GloVe embeddings

In [3]:
# Function to compute the average word vector for a tweet
def get_avg_embedding(tweet, model, vector_size=200):
    words = tweet.split()  # Tokenize by whitespace
    word_vectors = [model[word] for word in words if word in model]
    if not word_vectors:  # If no words in the tweet are in the vocabulary, return a zero vector
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)

# Preprocessing function
def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenization
    words = text.split()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

In [4]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Initialize stopwords and stemmer
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Process a single tweet
def preprocess_tweet(tweet):
    if pd.isna(tweet):
        return ""
    
    # Remove Retweets using Regex
    if re.match(r'^RT\s+@\w+:', str(tweet)):
        return ""

    # Clean the Tweet
    tweet = re.sub(r"http\S+|@\S+|#\S+|[^a-zA-Z\s]", "", str(tweet).lower())
    words = tweet.split()
    words = [ps.stem(word) for word in words if word not in stop_words]
    return " ".join(words)



In [None]:
# Load data
folder_path = "train_tweets"
csv_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith(".csv")] # Only use first file for testing purposes
df = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)

# Apply preprocessing
df['Tweet'] = df['Tweet'].swifter.apply(preprocess_tweet)
# Remove rows where the 'Tweet' column is an empty string
df = df[df['Tweet'].str.strip() != ""]


# Count and remove duplicates
before_dedup = len(df)
df = df.drop_duplicates(subset='Tweet')
after_dedup = len(df)

# Display number of duplicates removed
num_duplicates_removed = before_dedup - after_dedup
print(f"Number of duplicates removed: {num_duplicates_removed}")

# Reset the index
df.reset_index(drop=True, inplace=True)

print(df.head)

Pandas Apply: 100%|██████████| 5056050/5056050 [03:59<00:00, 21118.25it/s]


Number of duplicates removed: 523172
<bound method NDFrame.head of              ID  MatchID  PeriodID  EventType      Timestamp  \
1           2_0        2         0          0  1403538600000   
6           2_0        2         0          0  1403538600000   
8           2_0        2         0          0  1403538601000   
10          2_0        2         0          0  1403538601000   
11          2_0        2         0          0  1403538601000   
...         ...      ...       ...        ...            ...   
5056038  17_129       17       129          1  1403805600000   
5056039  17_129       17       129          1  1403805600000   
5056040  17_129       17       129          1  1403805600000   
5056042  17_129       17       129          1  1403805600000   
5056049  17_129       17       129          1  1403805600000   

                                                     Tweet  
1        visit offici web site spani real state tourim ...  
6                                         

In [8]:
print(df)

             ID  MatchID  PeriodID  EventType      Timestamp  \
0           2_0        2         0          0  1403538600000   
1           2_0        2         0          0  1403538600000   
2           2_0        2         0          0  1403538601000   
3           2_0        2         0          0  1403538601000   
4           2_0        2         0          0  1403538601000   
...         ...      ...       ...        ...            ...   
1907140  17_129       17       129          1  1403805600000   
1907141  17_129       17       129          1  1403805600000   
1907142  17_129       17       129          1  1403805600000   
1907143  17_129       17       129          1  1403805600000   
1907144  17_129       17       129          1  1403805600000   

                                                     Tweet  
0        visit offici web site spani real state tourim ...  
1                                         today match good  
2                                           min 

In [9]:
# Feature creation
# Add the length of each tweet as a feature
df['TweetLength'] = df['Tweet'].apply(len)

# Add a simple tweet count feature
df['TweetCount'] = df.groupby(['MatchID', 'PeriodID', 'Timestamp'])['Timestamp'].transform('count')

# Add word count as a feature
df['WordCount'] = df['Tweet'].apply(lambda x: len(x.split()))

print(df)

             ID  MatchID  PeriodID  EventType      Timestamp  \
0           2_0        2         0          0  1403538600000   
1           2_0        2         0          0  1403538600000   
2           2_0        2         0          0  1403538601000   
3           2_0        2         0          0  1403538601000   
4           2_0        2         0          0  1403538601000   
...         ...      ...       ...        ...            ...   
1907140  17_129       17       129          1  1403805600000   
1907141  17_129       17       129          1  1403805600000   
1907142  17_129       17       129          1  1403805600000   
1907143  17_129       17       129          1  1403805600000   
1907144  17_129       17       129          1  1403805600000   

                                                     Tweet  TweetLength  \
0        visit offici web site spani real state tourim ...           53   
1                                         today match good           16   
2     

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Compute TF-IDF weights for the corpus

# Optional: start from zero and fit on tweets
vectorizer = TfidfVectorizer(max_features=10000)
vectorizer.fit(df['Tweet'])


# Load pre-computed
#with open('tfidf_vectorizer.pkl', 'rb') as f:
#    vectorizer = pickle.load(f)


tfidf_weights = dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_))

# Weighted average embeddings
def get_weighted_avg_embedding(tweet, model, vector_size=200, weights=tfidf_weights):
    words = tweet.split()
    word_vectors = [model[word] * weights.get(word, 1) for word in words if word in model]
    if not word_vectors:
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)


In [11]:
# Generate embeddings for each tweet
vector_size = 200  # GloVe embedding dimension
tweet_vectors = df['Tweet'].swifter.apply(lambda tweet: get_weighted_avg_embedding(tweet, model=glove_model, vector_size=200, weights=tfidf_weights))
tweet_vectors = np.array(list(tweet_vectors), dtype=np.float32)

# Save the tweet vectors
# with open("tweet_vectors.pkl", "wb") as f:
#     pickle.dump(tweet_vectors, f)
# 
# print("Embeddings saved successfully!")
# 
# # Load the tweet vectors
# with open("tweet_vectors.pkl", "rb") as f:
#     loaded_tweet_vectors = pickle.load(f)
# 
# print("Embeddings loaded successfully!")
# print("Loaded vectors shape:", loaded_tweet_vectors.shape)



Pandas Apply: 100%|██████████| 1907145/1907145 [01:57<00:00, 16215.95it/s]


In [12]:
###### Use if no period features ######
tweet_df = pd.DataFrame(tweet_vectors)

# Attach the vectors into the original dataframe
period_features = pd.concat([df, tweet_df], axis=1)

# Drop the columns that are not useful anymore
period_features = period_features.drop(columns=['Timestamp', 'Tweet'])

print("X_train_reshaped shape:", period_features.shape)
# Group the tweets into their corresponding periods. This way we generate an average embedding vector for each period
period_features = period_features.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()

## Save the tweet vectors
#with open("period_features.pkl", "wb") as f:
#    pickle.dump(period_features, f)
#
#print("Period features saved successfully!")
#
## Load the tweet vectors
#with open("period_features_sentiment.pkl", "rb") as f:
#    loaded_period_features = pickle.load(f)
#
#print("Period features loaded successfully!")
#print("Loaded vectors shape:", loaded_period_features.shape)
#print(loaded_period_features)

X_train_reshaped shape: (1907145, 207)


In [13]:
# We drop the non-numerical features and keep the embeddings values for each period
X = period_features.drop(columns=['EventType', 'MatchID', 'ID']).values
# We extract the labels of our training samples
y = period_features['EventType'].values


# One-hot encode labels
encoder = OneHotEncoder(sparse_output=False)
y_encoded = encoder.fit_transform(y.reshape(-1, 1))

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=SEED)

# Add a time step dimension to match the LSTM input shape
X_train_reshaped = X_train[:, None, :]  # Add a new axis for timesteps
X_test_reshaped = X_test[:, None, :]    # Add a new axis for timesteps
print(period_features)

      MatchID  PeriodID      ID  EventType  TweetLength  TweetCount  \
0           0         0     0_0        0.0    44.237288    2.186441   
1           0         1     0_1        0.0    43.120000    1.760000   
2           0         2     0_2        0.0    43.476190    2.111111   
3           0         3     0_3        0.0    39.743243    2.405405   
4           0         4     0_4        0.0    41.413793    2.632184   
...       ...       ...     ...        ...          ...         ...   
2132       19       125  19_125        1.0    32.786948    9.525912   
2133       19       126  19_126        1.0    33.613497    8.963190   
2134       19       127  19_127        1.0    34.648230    8.690265   
2135       19       128  19_128        1.0    34.695876    7.680412   
2136       19       129  19_129        1.0    35.677054    6.603399   

      WordCount         0         1         2  ...       190       191  \
0      7.559322  0.575533  1.259582  0.292536  ...  0.510598 -0.335112   

In [18]:
# Define the early stopping callback
early_stopping = EarlyStopping(
    monitor='val_loss',      # Monitor validation loss
    patience=5,              # Stop training if no improvement after 3 epochs
    restore_best_weights=True  # Restore the best weights when stopping
)

# Define the LSTM model with deterministic initializers
model = Sequential([
    tf.keras.layers.Input(shape=(1, X_train_reshaped.shape[2])),  
    LSTM(
        128, 
        return_sequences=False, 
        kernel_initializer=GlorotUniform(seed=SEED), 
        recurrent_initializer=Orthogonal(seed=SEED),
        bias_initializer='zeros'
    ),             
    Dense(y_encoded.shape[1], activation='softmax', kernel_initializer=GlorotUniform(seed=SEED))
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train_reshaped, y_train,
                    epochs=50,
                    batch_size=32,
                    validation_split=0.2,
                    callbacks=[early_stopping],  # Include the early stopping callback
                    verbose=1)

# Evaluate on the test set
test_loss, test_accuracy = model.evaluate(X_test_reshaped, y_test, verbose=1)

print(f"Test Accuracy: {test_accuracy:.4f}")

Epoch 1/50
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.5915 - loss: 0.6752 - val_accuracy: 0.5994 - val_loss: 0.6505
Epoch 2/50
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6537 - loss: 0.6235 - val_accuracy: 0.6667 - val_loss: 0.6163
Epoch 3/50
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6909 - loss: 0.5903 - val_accuracy: 0.6988 - val_loss: 0.5906
Epoch 4/50
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7116 - loss: 0.5625 - val_accuracy: 0.7076 - val_loss: 0.5763
Epoch 5/50
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7305 - loss: 0.5418 - val_accuracy: 0.7222 - val_loss: 0.5648
Epoch 6/50
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7375 - loss: 0.5259 - val_accuracy: 0.7222 - val_loss: 0.5580
Epoch 7/50
[1m43/43[0m [32m━━━━━━━━━━

In [16]:
###### For Kaggle submission

predictions = []
dummy_predictions = []
# We read each file separately, we preprocess the tweets and then use the classifier to predict the labels.
# Finally, we concatenate all predictions into a list that will eventually be concatenated and exported
# to be submitted on Kaggle.
for fname in sorted(os.listdir("eval_tweets")):
    val_df = pd.read_csv("eval_tweets/" + fname)
    
    val_df['Tweet'] = val_df['Tweet'].swifter.apply(preprocess_tweet)
    # Remove rows where the 'Tweet' column is an empty string
    val_df = val_df[val_df['Tweet'].str.strip() != ""]


    # Count and remove duplicates
    before_dedup_val = len(val_df)
    val_df = val_df.drop_duplicates(subset='Tweet')
    after_dedup_val = len(val_df)

    # Display number of duplicates removed
    num_duplicates_removed_val = before_dedup_val - after_dedup_val
    print(f"Number of duplicates removed: {num_duplicates_removed_val}")

    # Reset the index
    val_df.reset_index(drop=True, inplace=True)

    # Feature creation
    # Add the length of each tweet as a feature
    val_df['TweetLength'] = val_df['Tweet'].apply(len)
    
    # Add a simple tweet count feature
    val_df['TweetCount'] = val_df.groupby(['MatchID', 'PeriodID', 'Timestamp'])['Timestamp'].transform('count')
    
    # Add word count as a feature
    val_df['WordCount'] = val_df['Tweet'].apply(lambda x: len(x.split()))

    tweet_vectors = val_df['Tweet'].swifter.apply(lambda tweet: get_weighted_avg_embedding(tweet, model=glove_model, vector_size=200, weights=tfidf_weights))

    tweet_vectors = np.array(list(tweet_vectors), dtype=np.float32)

    tweet_df = pd.DataFrame(tweet_vectors)

    period_features_val = pd.concat([val_df, tweet_df], axis=1)
    period_features_val = period_features_val.drop(columns=['Timestamp', 'Tweet'])
    period_features_val = period_features_val.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()

    X = period_features_val.drop(columns=['MatchID', 'ID']).values

    # Reshape input for LSTM
    X_reshaped = X[:, None, :]  # Add timestep dimension

    preds = model.predict(X_reshaped)
    preds = preds.argmax(axis=1)  # Convert probabilities to class indices
    period_features_val['EventType'] = preds
    predictions.append(period_features_val[['ID', 'EventType']])


pred_df = pd.concat(predictions)
pred_df.to_csv('LSTM_predictions.csv', index=False)



Pandas Apply: 100%|██████████| 285804/285804 [00:09<00:00, 29294.37it/s]


Number of duplicates removed: 23919


Pandas Apply: 100%|██████████| 89198/89198 [00:04<00:00, 18434.82it/s]


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step


Pandas Apply: 100%|██████████| 45024/45024 [00:04<00:00, 10146.85it/s]


Number of duplicates removed: 3220


Pandas Apply: 100%|██████████| 35175/35175 [00:02<00:00, 12261.00it/s]


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


Pandas Apply: 100%|██████████| 113402/113402 [00:07<00:00, 15826.25it/s]


Number of duplicates removed: 12524


Pandas Apply: 100%|██████████| 56945/56945 [00:02<00:00, 21371.27it/s]


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


Pandas Apply: 100%|██████████| 628698/628698 [00:30<00:00, 20441.45it/s]


Number of duplicates removed: 55427


Pandas Apply: 100%|██████████| 260223/260223 [00:13<00:00, 18714.43it/s]


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


In [None]:
import pickle
# Save TF-IDF trained on the training data
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)