In [43]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score

import gensim.downloader as api

import xgboost as xgb

import optuna

In [44]:
def get_avg_embedding(tweet, model, vector_size=200):
    """Return the mean word vector of a tweet.

    The tweet is tokenized on whitespace and every token found in ``model``
    contributes its vector to the average.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (e.g. ``None`` or a NaN float
        standing in for a missing tweet) is treated as a tweet with no tokens.
    model : mapping-like
        Object supporting ``word in model`` and ``model[word]``.
    vector_size : int, default 200
        Length of the zero vector returned when no token is in ``model``.
        It should match the dimensionality of the vectors in ``model``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the matched word vectors, or ``np.zeros(vector_size)``
        when nothing matched.
    """
    # Missing tweets would otherwise crash on .split(); give them the same
    # zero vector as a tweet whose words are all out of vocabulary.
    if not isinstance(tweet, str):
        return np.zeros(vector_size)

    word_vectors = [model[word] for word in tweet.split() if word in model]
    if not word_vectors:
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)

In [45]:
# Training tweets: load the preprocessed array and wrap it in a DataFrame.
# NOTE: allow_pickle=True unpickles arbitrary objects, so only load trusted files.
X = np.load("../DataPreprocessing/cleaned_train_dataset_megafinal_processed.npy", allow_pickle=True)
df_X = pd.DataFrame(data=X)

# Second array: the preprocessed "eval" dataset, wrapped the same way.
y = np.load("../DataPreprocessing/cleaned_eval_dataset_training.npy", allow_pickle=True)
df_y = pd.DataFrame(data=y)

In [46]:
# Replace the positional column labels with descriptive names.
df_X = df_X.rename(columns=dict(enumerate(['ID', 'EventType', 'Tweet'])))
df_y = df_y.rename(columns=dict(enumerate(['ID', 'Tweet'])))

In [47]:
# Load the pre-trained GloVe Twitter word vectors through gensim's downloader API
embedding_model = api.load("glove-twitter-200")

# Read the embedding width from the loaded model instead of hard-coding it, so the
# zero-vector fallback in get_avg_embedding always matches the word-vector size
# (a mismatch would make np.vstack fail on ragged rows).
vector_size = embedding_model.vector_size
# One averaged word-vector per training tweet, stacked into (n_tweets, vector_size)
tweet_vectors = np.vstack([get_avg_embedding(tweet, embedding_model, vector_size) for tweet in df_X['Tweet']])
tweet_df = pd.DataFrame(tweet_vectors)

In [48]:
# Place the embedding columns side by side with the tweet metadata (rows align on the index)
period_features = pd.concat(objs=[df_X, tweet_df], axis="columns")

In [49]:
# The raw text is no longer needed once it has been embedded. Averaging the remaining
# columns per ID yields a single mean embedding vector for each period.
period_features = (
    period_features
    .drop(columns="Tweet")
    .groupby("ID")
    .mean()
    .reset_index()
)

In [50]:
# Feature matrix: everything except the period identifier and the label
X = period_features.drop(['EventType', 'ID'], axis="columns")
# Target vector: the label of each period, as a plain Python list
y = period_features['EventType'].tolist()

In [51]:
# Hold out 30% of the periods as a local test set. This lets us evaluate the classifier
# without tuning against the validation set and without spending Kaggle submissions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [52]:
# Gradient-boosted tree classifier trained on the per-period mean embeddings
clf_ = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1)
clf_.fit(X_train, y_train)

In [53]:
# Predict the held-out periods and report the fraction classified correctly
y_pred_ = clf_.predict(X_test)
print("Test set: ", accuracy_score(y_true=y_test, y_pred=y_pred_))

Test set:  0.838006230529595


In [54]:
# Embed the evaluation tweets exactly like the training tweets.
# Read the embedding width from the loaded model instead of hard-coding it, so the
# zero-vector fallback in get_avg_embedding always matches the word-vector size.
vector_size = embedding_model.vector_size
tweet_vectors = np.vstack([get_avg_embedding(tweet, embedding_model, vector_size) for tweet in df_y['Tweet']])
tweet_df = pd.DataFrame(tweet_vectors)

In [55]:
# Build the per-period features for the evaluation data in one pass:
# join the embeddings to the metadata, discard the raw text, then average per ID
# so that every period is described by a single mean embedding vector.
period_features = (
    pd.concat(objs=[df_y, tweet_df], axis="columns")
    .drop(columns="Tweet")
    .groupby("ID")
    .mean()
    .reset_index()
)

In [56]:
# Keep only the embedding columns as model input (this rebinds X to the evaluation features)
X = period_features.drop('ID', axis="columns")

In [57]:
# Predict the label of every evaluation period with the fitted classifier.
# The predictions follow the row order of `period_features` / `X`.
y_pred_ = clf_.predict(X)

In [58]:
# Add the predictions to df_y as an 'EventType' column.
# period_features comes out of groupby('ID'), which sorts its keys, so y_pred_ is in
# sorted-ID order and not necessarily in df_y's row order. Look predictions up by ID
# instead of assigning positionally, so each ID gets its own prediction.
df_y['EventType'] = df_y['ID'].map(dict(zip(period_features['ID'], y_pred_)))

In [59]:
# Sanity check: how many rows were assigned to each predicted class
df_y.EventType.value_counts()

EventType
1    309
0    207
Name: count, dtype: int64

In [60]:
# Write the submission file: one row per ID with its predicted label, no index column
df_y.to_csv('submission_tweet_embedding_xgboost.csv', columns=['ID', 'EventType'], index=False)