In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import OneHotEncoder
import gensim.downloader as api
import re
import swifter

# Enable tqdm for pandas
tqdm.pandas()

2024-11-20 21:46:36.740374: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Function to compute the average word vector for a tweet
def get_avg_embedding(tweet, model, vector_size=200):
    words = tweet.split()  # Tokenize by whitespace
    word_vectors = [model[word] for word in words if word in model]
    if not word_vectors:  # If no words in the tweet are in the vocabulary, return a zero vector
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)

# Preprocessing function
def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenization
    words = text.split()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

# Optional: Transformer embedding

In [11]:
# Load data
folder_path = "train_tweets"
csv_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith(".csv")][:1] # Only use first file for testing purposes
df = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)

# Apply preprocessing
df['Tweet'] = df['Tweet'].swifter.apply(preprocess_text)
print(df.head)

Pandas Apply: 100%|██████████| 86843/86843 [00:15<00:00, 5457.13it/s]


<bound method NDFrame.head of           ID  MatchID  PeriodID  EventType      Timestamp  \
0        2_0        2         0          0  1403538600000   
1        2_0        2         0          0  1403538600000   
2        2_0        2         0          0  1403538600000   
3        2_0        2         0          0  1403538600000   
4        2_0        2         0          0  1403538600000   
...      ...      ...       ...        ...            ...   
86838  2_129        2       129          1  1403546400000   
86839  2_129        2       129          1  1403546400000   
86840  2_129        2       129          1  1403546400000   
86841  2_129        2       129          1  1403546400000   
86842  2_129        2       129          1  1403546400000   

                                                   Tweet  
0      rt soccerdotcom esp beat au well give away spa...  
1      visit sitep official web site httptcoehzkslan ...  
2      rt soccerdotcom esp beat au well give away spa...  
3

In [5]:
# Load GloVe model
glove_model = api.load("glove-twitter-200")  # 200-dimensional GloVe embeddings

In [6]:
# Generate embeddings for each tweet
vector_size = 200  # GloVe embedding dimension
tweet_vectors = df['Tweet'].swifter.apply(lambda x: get_avg_embedding(x, glove_model, vector_size))
tweet_vectors = np.array(list(tweet_vectors), dtype=np.float32)

#tweet_vectors = df['Tweet'].swifter.apply(get_tweet_embedding)

Pandas Apply: 100%|██████████| 86843/86843 [00:03<00:00, 22304.41it/s]


In [7]:
tweet_df = pd.DataFrame(tweet_vectors)

# Attach the vectors into the original dataframe
period_features = pd.concat([df, tweet_df], axis=1)
# Drop the columns that are not useful anymore
period_features = period_features.drop(columns=['Timestamp', 'Tweet'])

print("X_train_reshaped shape:", period_features.shape)
# Group the tweets into their corresponding periods. This way we generate an average embedding vector for each period
period_features = period_features.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()

# We drop the non-numerical features and keep the embeddings values for each period
X = period_features.drop(columns=['EventType', 'MatchID', 'PeriodID', 'ID']).values
# We extract the labels of our training samples
y = period_features['EventType'].values


# One-hot encode labels
encoder = OneHotEncoder(sparse_output=False)
y_encoded = encoder.fit_transform(y.reshape(-1, 1))

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Add a time step dimension to match the LSTM input shape
X_train_reshaped = X_train[:, None, :]  # Add a new axis for timesteps
X_test_reshaped = X_test[:, None, :]    # Add a new axis for timesteps



# Define the LSTM model
model = Sequential([
    tf.keras.layers.Input(shape=(1, vector_size)),  # Input layer
    LSTM(128, return_sequences=False),             # LSTM layer with 128 units
    Dense(y_encoded.shape[1], activation='softmax')  # Output layer
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train_reshaped, y_train,
                    epochs=10,
                    batch_size=64,
                    validation_split=0.2,
                    verbose=1)

# Evaluate on the test set
test_loss, test_accuracy = model.evaluate(X_test_reshaped, y_test, verbose=1)

print(f"Test Accuracy: {test_accuracy:.4f}")

X_train_reshaped shape: (86843, 204)
Epoch 1/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 306ms/step - accuracy: 0.4804 - loss: 0.6992 - val_accuracy: 0.7619 - val_loss: 0.6639
Epoch 2/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.5702 - loss: 0.6793 - val_accuracy: 0.7619 - val_loss: 0.6241
Epoch 3/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.5702 - loss: 0.6639 - val_accuracy: 0.7619 - val_loss: 0.5973
Epoch 4/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.5390 - loss: 0.6610 - val_accuracy: 0.7619 - val_loss: 0.5788
Epoch 5/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.5650 - loss: 0.6415 - val_accuracy: 0.7619 - val_loss: 0.5596
Epoch 6/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.5858 - loss: 0.6266 - val_accuracy: 0.7619 - val_loss: 0.5468
Epoch 7/

In [8]:
###### For Kaggle submission

predictions = []
dummy_predictions = []
# We read each file separately, we preprocess the tweets and then use the classifier to predict the labels.
# Finally, we concatenate all predictions into a list that will eventually be concatenated and exported
# to be submitted on Kaggle.
for fname in os.listdir("eval_tweets"):
    val_df = pd.read_csv("eval_tweets/" + fname)
    
    val_df['Tweet'] = val_df['Tweet'].swifter.apply(preprocess_text)

    tweet_vectors = val_df['Tweet'].swifter.apply(lambda x: get_avg_embedding(x, glove_model, vector_size))

    tweet_vectors = np.array(list(tweet_vectors), dtype=np.float32)

    tweet_df = pd.DataFrame(tweet_vectors)

    period_features = pd.concat([val_df, tweet_df], axis=1)
    period_features = period_features.drop(columns=['Timestamp', 'Tweet'])
    period_features = period_features.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()

    X = period_features.drop(columns=['MatchID', 'PeriodID', 'ID']).values

    # Reshape input for LSTM
    X_reshaped = X[:, None, :]  # Add timestep dimension

    preds = model.predict(X_reshaped)
    preds = preds.argmax(axis=1)  # Convert probabilities to class indices
    period_features['EventType'] = preds
    predictions.append(period_features[['ID', 'EventType']])


pred_df = pd.concat(predictions)
pred_df.to_csv('LSTM_predictions.csv', index=False)



Pandas Apply: 100%|██████████| 285804/285804 [00:50<00:00, 5627.99it/s]
Pandas Apply: 100%|██████████| 285804/285804 [00:09<00:00, 29237.38it/s]


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step


Pandas Apply: 100%|██████████| 45024/45024 [00:07<00:00, 5861.59it/s]
Pandas Apply: 100%|██████████| 45024/45024 [00:01<00:00, 26142.96it/s]


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


Pandas Apply: 100%|██████████| 628698/628698 [01:43<00:00, 6070.59it/s]
Pandas Apply: 100%|██████████| 628698/628698 [00:19<00:00, 31739.00it/s]


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


Pandas Apply: 100%|██████████| 113402/113402 [00:18<00:00, 6202.32it/s]
Pandas Apply: 100%|██████████| 113402/113402 [00:03<00:00, 31547.59it/s]


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
