In [2]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score

import gensim.downloader as api

import optuna

In [5]:
def get_avg_embedding(tweet, model, vector_size=200):
    words = tweet.split()  # Tokenize by whitespace
    word_vectors = [model[word] for word in words if word in model]
    if not word_vectors:  # If no words in the tweet are in the vocabulary, return a zero vector
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)

In [3]:
X = np.load("../DataPreprocessing/cleaned_train_dataset_megafinal_processed.npy", allow_pickle=True)
y = np.load("../DataPreprocessing/cleaned_eval_dataset_training.npy", allow_pickle=True)

df_X = pd.DataFrame(X)
df_y = pd.DataFrame(y)

In [4]:
# Print min and max length of tweet in symbols
print("Min length: ", df_X[2].apply(len).min())
print("Max length: ", df_X[2].apply(len).max())

Min length:  1641
Max length:  195007


In [None]:
# Add model to generate sentence embeddings
embedding_model = api.load("glove-twitter-200")

# Apply preprocessing to each tweet and obtain vectors
vector_size = 200  # Adjust based on the chosen GloVe model
tweet_vectors = np.vstack([get_avg_embedding(tweet, embedding_model, vector_size) for tweet in df_X[2]])
tweet_df = pd.DataFrame(tweet_vectors)


In [None]:
# Attach the vectors into the original dataframe
period_features = pd.concat([df_X, tweet_df], axis=1)

In [81]:
# Обучи модель логистической регрессии на данных. Для обучения на вход нужно подавать данные df_X['embedding'], а на выходе df_X[1]. 
# Для тестирования d_y[1]. Полученные после тестирования ответы сохрани в d_y['predicted'].

X_train, X_test, y_train, y_test = train_test_split(df_X['embeddings'].to_list(), df_X[1].to_list(), test_size=0.15, random_state=42)

# Train the model
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)

# Test the model
y_predicted = logistic_model.predict(list(X_test))

# Save the results
df_y['predicted'] = logistic_model.predict(list(df_y['embeddings']))

# Calculate the mean squared error
print("Error ", accuracy_score(y_test, y_predicted))

df_X['predicted'] = logistic_model.predict(list(df_X['embeddings']))

Error  0.6386292834890965


In [None]:
# count error acuracy score
print(accuracy_score(df_X[1].to_list(), df_X['predicted'].to_list()))  


0.7384183434721572


In [75]:
def objective(trial):
    # Гиперпараметры для оптимизации
    C = trial.suggest_float("C", 1e-4, 1e4)  # Регуляризация
    max_iter = trial.suggest_int("max_iter", 100, 10000)

    # Обучение модели
    model = LogisticRegression(C=C, max_iter=max_iter, solver="lbfgs", random_state=42)
    model.fit(X_train, y_train)
    
    # Оценка модели
    y_pred = model.predict(X_test)
    return 1 - accuracy_score(y_test, y_pred)

# Запуск оптимизации
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

# Результаты оптимизации
best_params = study.best_params
print("Лучшие параметры:", best_params)

# Обучение с оптимальными гиперпараметрами
best_model = LogisticRegression(**best_params, solver="lbfgs", random_state=42)
best_model.fit(X_train, y_train)

# Оценка производительности
y_predicted = best_model.predict(X_test)
mse = accuracy_score(y_test, y_predicted)
print(f"Лучший MSE: {mse:.4f}")

[I 2024-12-12 14:07:55,796] A new study created in memory with name: no-name-a95e366c-c5a4-425e-b90e-723a7dcc0b51
[I 2024-12-12 14:07:56,370] Trial 0 finished with value: 0.3800623052959502 and parameters: {'C': 8627.5051487697, 'max_iter': 8438}. Best is trial 0 with value: 0.3800623052959502.
[I 2024-12-12 14:07:59,101] Trial 1 finished with value: 0.3800623052959502 and parameters: {'C': 5256.265622360428, 'max_iter': 5666}. Best is trial 0 with value: 0.3800623052959502.
[I 2024-12-12 14:08:00,476] Trial 2 finished with value: 0.3800623052959502 and parameters: {'C': 6478.790549162912, 'max_iter': 8050}. Best is trial 0 with value: 0.3800623052959502.
[I 2024-12-12 14:08:01,679] Trial 3 finished with value: 0.3800623052959502 and parameters: {'C': 9286.715529130672, 'max_iter': 8436}. Best is trial 0 with value: 0.3800623052959502.
[I 2024-12-12 14:08:03,495] Trial 4 finished with value: 0.383177570093458 and parameters: {'C': 1235.9310813422628, 'max_iter': 7466}. Best is trial 0 

Лучшие параметры: {'C': 17.58281877573898, 'max_iter': 1303}
Лучший MSE: 0.6355


In [1]:
df_y = df_y.rename(columns={0: 'ID'})
df_y = df_y.rename(columns={'predicted': 'EventType'})

NameError: name 'df_y' is not defined

In [None]:
# сохрани айди и ивент тайп в csv
df_y[['ID', 'EventType']].to_csv('submission.csv', index=False)

In [84]:
# посчитай кол-во нулей и единиц в колонке predicted df_y
df_y['predicted'].value_counts()

predicted
1    277
0    239
Name: count, dtype: int64

In [104]:
df_X_new = pd.DataFrame(X)
df_y_new = pd.DataFrame(y)

In [106]:
# переименуй в df_X_new 0 в ID, 1 в EventType, 2 в Tweet
df_X_new = df_X_new.rename(columns={0: 'ID', 1: 'EventType', 2: 'Tweet'})

In [None]:
df_y_new = df_y_new.rename(columns={0: 'ID', 1: 'Tweet'})

In [110]:
embeddings_model = api.load("glove-twitter-200")  # 200-dimensional GloVe embeddings

# Apply preprocessing to each tweet and obtain vectors
vector_size = 200  # Adjust based on the chosen GloVe model
tweet_vectors = np.vstack([get_avg_embedding(tweet, embeddings_model, vector_size) for tweet in df_X_new['Tweet']])
tweet_df = pd.DataFrame(tweet_vectors)

In [111]:
# Attach the vectors into the original dataframe
period_features = pd.concat([df_X_new, tweet_df], axis=1)
# Drop the columns that are not useful anymore
period_features = period_features.drop(columns=['Tweet'])
# Group the tweets into their corresponding periods. This way we generate an average embedding vector for each period
period_features = period_features.groupby(['ID']).mean().reset_index()

In [119]:
# We drop the non-numerical features and keep the embeddings values for each period
X = period_features.drop(columns=['EventType', 'ID'])
# We extract the labels of our training samples
y = period_features['EventType'].to_list()

In [120]:
###### Evaluating on a test set:

# We split our data into a training and test set that we can use to train our classifier without fine-tuning into the
# validation set and without submitting too many times into Kaggle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Test set: ", accuracy_score(y_test, y_pred))

Test set:  0.7679127725856698


In [143]:
import xgboost as xgb

clf_ = xgb.XGBClassifier(learning_rate = 0.1, n_estimators = 100).fit(X_train, y_train)


In [144]:
y_pred_ = clf.predict(X_test)
print("Test set: ", accuracy_score(y_test, y_pred_))

Test set:  0.7679127725856698


In [129]:
# Apply preprocessing to each tweet and obtain vectors
vector_size = 200  # Adjust based on the chosen GloVe model
tweet_vectors = np.vstack([get_avg_embedding(tweet, embeddings_model, vector_size) for tweet in df_y_new['Tweet']])
tweet_df = pd.DataFrame(tweet_vectors)

In [130]:
# Attach the vectors into the original dataframe
period_features = pd.concat([df_y_new, tweet_df], axis=1)
# Drop the columns that are not useful anymore
period_features = period_features.drop(columns=['Tweet'])
# Group the tweets into their corresponding periods. This way we generate an average embedding vector for each period
period_features = period_features.groupby(['ID']).mean().reset_index()

In [131]:
X = period_features.drop(columns=['ID'])

In [132]:
y_pred = clf.predict(X)

In [None]:
# add y_pred as a column to dataframe df_y_new
df_y_new['EventType'] = y_pred

In [149]:
df_y_new['EventType'].value_counts()

EventType
1    309
0    207
Name: count, dtype: int64

In [142]:
# сохрани айди и ивент тайп в csv
df_y[['ID', 'EventType']].to_csv('submission_tweet_embedding.csv', index=False)