In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
!cp -r /content/drive/MyDrive/challenge_data/train_tweets/ ./
!cp -r /content/drive/MyDrive/challenge_data/eval_tweets/ ./

In [None]:
from transformers import AutoTokenizer
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
import re
import nltk
import os
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
def preprocess_text(text):
    """
    Preprocess text by lowercasing, removing punctuation, numbers, stopwords, and lemmatizing.

    Args:
        text (str): Text to be preprocessed.

    Returns:
        str: Preprocessed text.
    """
    # Lowercasing
    text = text.lower()

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Tokenization
    words = text.split()

    # Remove stopwords
    stop_words = set(nltk.corpus.stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

In [None]:
def format_number_to_string(number):
    """
    Formats a number into a 3-character string, adding leading zeros if necessary.

    Args:
        number (int): A number between 0 and 999.

    Returns:
        str: A string of length 3.
    """
    if not (0 <= number <= 999):
        raise ValueError("The number must be in the range 0 to 999.")

    return f"{number:03d}"

In [161]:
def process_csv(file_path, l, with_period_id, with_event_type):
    """
    Process a CSV file to extract and tokenize data.

    Args:
        file_path (str): Path to the CSV file.
        l (int): Desired length of token arrays for the 'Tweet' column.
        with_period_id (bool): Whether to include the 'PeriodID' in the tweet text.

    Returns:
        pd.DataFrame: Processed DataFrame with columns 'PeriodID', 'EventType', and 'Tweet'.
    """
    # Load the tokenizer (default tokenizer from transformers)
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # Read the CSV file
    df = pd.read_csv(file_path)

    # Extract required columns
    # if with_event_type:
    #     df = df[['PeriodID', 'EventType', 'Tweet']]
    # else:
    #     df = df[['PeriodID', 'Tweet']]
    # df = df[['ID', 'Tweet', 'EventType']]
    df = df[['ID', 'Tweet']]

    df['Tweet'] = df['Tweet'].apply(preprocess_text)

    # Tokenize the 'Tweet' column and pad/truncate to length l
    def tokenize_tweet(tweet):
        tokens = tokenizer.encode(tweet, truncation=True, max_length=l, add_special_tokens=False)
        return tokens

    df['Tweet'] = df['Tweet'].apply(tokenize_tweet)

    df = df.groupby(['ID'])['Tweet'].apply(list).reset_index()

    return df

In [162]:
def read_csv(folder_path, with_period_id, with_event_type, l=32):
    """
    Read all CSV files in a folder and process them.

    Args:
        folder_path (str): Path to the folder containing the CSV files.
        l (int): Desired length of token arrays for the 'Tweet' column.
        with_period_id (bool): Whether to include the 'PeriodID' in the tweet text.

    Returns:
        pd.DataFrame: Processed DataFrame with columns 'PeriodID', 'EventType' (optional), and 'Tweet' (tokenized).
    """
    li = []
    for filename in os.listdir(folder_path):
        df = process_csv(folder_path + filename, l, with_period_id, with_event_type)
        li.append(df)
    df = pd.concat(li, ignore_index=True)
    return df

In [None]:
df = pd.read_csv("./train_tweets/ArgentinaBelgium72.csv")

In [155]:
df = read_csv("./train_tweets/", True, True)

In [156]:
np.save('drive/MyDrive/train_without_padding_32.npy', df.to_numpy())

In [157]:
df

Unnamed: 0,ID,EventType,Tweet
0,2_0,0,"[[19387, 4715, 27364, 9006, 9686, 2361, 3786, ..."
1,2_1,0,"[[5294, 3231, 15835, 2190, 4715, 2136, 2088, 2..."
2,2_10,1,"[[8239, 3109, 8239, 3109, 9686, 2361, 17151, 2..."
3,2_100,1,"[[19387, 5713, 11108, 15569, 3125, 8740, 9686,..."
4,2_101,1,"[[19387, 9433, 8040, 10936, 2953, 2721, 2999, ..."
...,...,...,...
2132,5_95,0,"[[19387, 8484, 10085, 17119, 22231, 2361, 2531..."
2133,5_96,1,"[[10047, 2139, 3527, 3676, 2490, 5619, 2088, 2..."
2134,5_97,1,"[[25312, 15088, 4590, 2659, 3694, 25416, 9910,..."
2135,5_98,1,"[[19387, 3329, 2100, 1035, 13198, 11748, 8569,..."


In [None]:
np.save('train.')

In [163]:
df_eval = read_csv("./eval_tweets/", True, False)

In [164]:
np.save('drive/MyDrive/eval_without_padding_32.npy', df_eval.to_numpy())

In [None]:
import numpy as np

df_numpy = np.load("./drive/MyDrive/small_new_preproc.npy", allow_pickle=True)

In [None]:
import numpy as np

df_numpy = np.load("./drive/MyDrive/to_test.npy", allow_pickle=True)

In [None]:
import pandas as pd

df = pd.DataFrame(df_numpy)

In [None]:
from transformers import BertTokenizer, BertModel

model = BertModel.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
import torch.nn as nn
import torch

initial_embeds = model.embeddings.word_embeddings.weight
embeddings_layer = nn.Embedding.from_pretrained(torch.tensor(initial_embeds))

  embeddings_layer = nn.Embedding.from_pretrained(torch.tensor(initial_embeds))


In [None]:
embeddings_layer.to('cuda')

Embedding(30522, 768)

In [None]:
import gc

def embed(x):
    tens = torch.tensor(x).to('cuda')
    embeddings = embeddings_layer(tens)

    del tens
    embeddings = embeddings.to('cpu')
    gc.collect()
    torch.cuda.empty_cache()

    return embeddings.numpy()

In [None]:
def average_token_embed(x):
    tens = torch.tensor(x).to('cuda')
    embeddings = embeddings_layer(tens)

    del tens
    embeddings = embeddings.to('cpu')
    gc.collect()
    torch.cuda.empty_cache()

    return embeddings.numpy().mean(axis=1)

In [None]:
def average_tweet_token_embed(x):
    tens = torch.tensor(x).to('cuda')
    embeddings = embeddings_layer(tens)

    del tens
    embeddings = embeddings.to('cpu')
    gc.collect()
    torch.cuda.empty_cache()

    return embeddings.numpy().mean(axis=1).mean(axis=0)

In [None]:
x = embed(df[2][0])

In [None]:
x.shape

(2330, 32, 768)

In [None]:
x = average_token_embed(df[2][0])

In [None]:
x.shape

(2330, 768)

In [None]:
x = average_tweet_token_embed(df[2][0])

In [None]:
df['Tweet_token_embed'] = df[2].apply(average_token_embed).apply(lambda x: x[:])

In [None]:
df['Tweet_tweet_embed'] = df[2].apply(lambda x: x[:300]).apply(average_token_embed)

Exception ignored in: <function _xla_gc_callback at 0x7b5e83b0c310>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/jax/_src/lib/__init__.py", line 96, in _xla_gc_callback
    def _xla_gc_callback(*args):
KeyboardInterrupt: 
Exception ignored in: <function _xla_gc_callback at 0x7b5e83b0c310>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/jax/_src/lib/__init__.py", line 96, in _xla_gc_callback
    def _xla_gc_callback(*args):
KeyboardInterrupt: 
Exception ignored in: <function _xla_gc_callback at 0x7b5e83b0c310>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/jax/_src/lib/__init__.py", line 96, in _xla_gc_callback
    def _xla_gc_callback(*args):
KeyboardInterrupt: 
Exception ignored in: <function _xla_gc_callback at 0x7b5e83b0c310>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/jax/_src/lib/__init__.py", line 96, in _xla_gc_callback
    def _xla_

In [None]:
df.head()

Unnamed: 0,0,1,2,3,4
0,0,0,"[[19387, 24185, 14615, 15569, 5619, 1058, 5706...","[-0.019213643, -0.046984125, -0.024342172, -0....","[[-0.022777032, -0.05549614, -0.025558233, -0...."
1,0,1,"[[1035, 8230, 10270, 2278, 2272, 5706, 0, 0, 0...","[-0.018859906, -0.04761093, -0.024497103, -0.0...","[[-0.012318136, -0.055603355, -0.025524922, -0..."
2,0,2,"[[4532, 3449, 10354, 12031, 22542, 2100, 6433,...","[-0.017854355, -0.048522584, -0.024735624, -0....","[[-0.013745017, -0.046950784, -0.024718862, -0..."
3,0,3,"[[12098, 2290, 1058, 19337, 18168, 2290, 2066,...","[-0.018232048, -0.048054412, -0.024763383, -0....","[[-0.012938696, -0.05019502, -0.026628945, -0...."
4,0,4,"[[5619, 19337, 25698, 2213, 6069, 27571, 9541,...","[-0.018128952, -0.048253, -0.024844358, -0.039...","[[-0.012660683, -0.05150302, -0.029625246, -0...."


In [None]:
np.save("to_test.npy", df.to_numpy())

In [None]:
import itertools

df_g['Tweet'] = df_g['Tweet'].apply(lambda x: list(itertools.chain.from_iterable(x)))

In [None]:
df_g

Unnamed: 0,PeriodID,Tweet
0,0,"[19387, 24185, 14615, 15569, 14063, 2386, 1035..."
1,1,"[1035, 8230, 10270, 2278, 19387, 6752, 11921, ..."
2,2,"[4532, 3449, 10354, 12031, 19387, 4715, 27364,..."
3,3,"[12098, 2290, 1058, 19337, 19387, 8484, 10085,..."
4,4,"[5619, 19337, 25698, 2213, 4642, 2239, 5706, 2..."


In [None]:
X = df['Tweet_embed'].to_list()
y = df[0].to_list()

In [None]:
X = df['Tweet_tweet_embed'].to_numpy()
y = df[0].to_list()

In [None]:
df

Unnamed: 0,0,1,2,3,4
0,0,0,"[[19387, 24185, 14615, 15569, 5619, 1058, 5706...","[-0.019213643, -0.046984125, -0.024342172, -0....","[[-0.022777032, -0.05549614, -0.025558233, -0...."
1,0,1,"[[1035, 8230, 10270, 2278, 2272, 5706, 0, 0, 0...","[-0.018859906, -0.04761093, -0.024497103, -0.0...","[[-0.012318136, -0.055603355, -0.025524922, -0..."
2,0,2,"[[4532, 3449, 10354, 12031, 22542, 2100, 6433,...","[-0.017854355, -0.048522584, -0.024735624, -0....","[[-0.013745017, -0.046950784, -0.024718862, -0..."
3,0,3,"[[12098, 2290, 1058, 19337, 18168, 2290, 2066,...","[-0.018232048, -0.048054412, -0.024763383, -0....","[[-0.012938696, -0.05019502, -0.026628945, -0...."
4,0,4,"[[5619, 19337, 25698, 2213, 6069, 27571, 9541,...","[-0.018128952, -0.048253, -0.024844358, -0.039...","[[-0.012660683, -0.05150302, -0.029625246, -0...."
...,...,...,...,...,...
385,1,120,"[[19387, 1996, 12155, 14615, 15569, 4380, 1337...","[-0.016891535, -0.047930527, -0.024227334, -0....","[[-0.015976857, -0.049933832, -0.021148255, -0..."
386,1,121,"[[19387, 6300, 5937, 2953, 2854, 29573, 16693,...","[-0.017256346, -0.047548335, -0.024222465, -0....","[[-0.020891888, -0.055903964, -0.02755709, -0...."
387,1,123,"[[1051, 11631, 2242, 11655, 10856, 26527, 2098...","[-0.01785853, -0.04707737, -0.024351297, -0.03...","[[-0.014975686, -0.056602668, -0.02412894, -0...."
388,1,125,"[[2012, 19738, 3367, 3915, 2439, 2762, 3053, 1...","[-0.018380594, -0.04664928, -0.024481935, -0.0...","[[-0.022475827, -0.03995397, -0.013322602, -0...."


In [172]:
import itertools

# X = df['Tweet'].apply(lambda x: list(itertools.chain.from_iterable(x))[:4000]).to_list()
X = df['Tweet'].apply(lambda x: list(itertools.chain.from_iterable(x))).to_list()
X_eval = df_eval['Tweet'].apply(lambda x: list(itertools.chain.from_iterable(x))).to_list()
y = df['EventType'].to_list()

In [175]:
import itertools
import random

X = df['Tweet'].apply(lambda x: random.sample(list(itertools.chain.from_iterable(x)), min(1800, len(list(itertools.chain.from_iterable(x)))))).to_list()
X_eval = df_eval['Tweet'].apply(lambda x: random.sample(list(itertools.chain.from_iterable(x)), min(1800, len(list(itertools.chain.from_iterable(x)))))).to_list()
y = df['EventType'].to_list()

In [173]:
l = 100000
for x in X:
    if len(x) < l:
      l = len(x)
      print(l)

9333
7805
6608
5845
5607
5286
5196
4020
3772
3740
2920
2607
2046
1886
1845


In [174]:
l = 100000
for x in X_eval:
    if len(x) < l:
      l = len(x)
      print(l)

44415
39403
38085
34183
28572
26518
26447
26007
24743
22656
22373
21922
21373
20248
19681
5294
4651
4409
3473
3231
2885
2352


In [15]:
df[2].apply(len).min()

1641

In [16]:
df_eval[1].apply(len).min()

5494

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [24]:
from sklearn.model_selection import train_test_split

def prepare_dataset(df, text_column, label_column, test_size=0.2, random_state=42):
        """
        Prepare dataset for training

        :param df: Pandas DataFrame containing tweets
        :param text_column: Name of column with tweet text
        :param label_column: Name of column with labels
        :param test_size: Proportion of dataset to use for testing
        :param random_state: Random seed for reproducibility
        :return: Tuple of train and test datasets
        """

        # Tokenization function
        def tokenize_function(text):
            return tokenizer(
                text,
                padding='max_length',
                truncation=True,
                max_length=len(text)*3
            )

        # Split the data
        train_df, test_df = train_test_split(
            df,
            test_size=test_size,
            random_state=random_state,
        )

        # Tokenize datasets
        train_df['tokenized'] = train_df[text_column].apply(tokenize_function)
        test_df['tokenized'] = test_df[text_column].apply(tokenize_function)

        return train_df, test_df

In [25]:
train_df, _ = prepare_dataset(df, 2, 1, 0.01)

In [26]:
eval_df, _ = prepare_dataset(df_eval, 1, 1, 0.01)

In [32]:
train_df['tokens'] = train_df['tokenized'].apply(lambda x: x['input_ids'][:4900])

In [33]:
eval_df['tokens'] = eval_df['tokenized'].apply(lambda x: x['input_ids'][:4900])

In [34]:
train_df['tokens'].apply(len).min()

4900

In [35]:
eval_df['tokens'].apply(len).min()

4900

In [2]:
import numpy as np
import pandas as pd

df_n = np.load('drive/MyDrive/cleaned_train_dataset_megafinal_processed.npy', allow_pickle=True)
df = pd.DataFrame(df_n)

In [3]:
df_eval_n = np.load('drive/MyDrive/cleaned_eval_dataset_training.npy', allow_pickle=True)
df_eval = pd.DataFrame(df_eval_n)

In [43]:
type(train_df['tokens'].to_list())

list

In [None]:
df = df[]

In [36]:
import itertools

# X = train_df[2].apply(lambda x: list(itertools.chain.from_iterable(x))[:4000]).to_list()
X = np.array(train_df['tokens'].to_list(), dtype=np.int32)
y = train_df[1].to_list()

In [6]:
X = df[3].apply(lambda x: x.tolist()).to_list()
y = df[0].to_list()

KeyError: 3

In [None]:
X = df[4].apply(lambda x: x.tolist()).to_list()
y = df[0].to_list()

In [None]:
(X.shape)

(390,)

In [None]:
type(X[0][0])

list

In [40]:
from sklearn.model_selection import train_test_split

X_train, X_, y_train, y_ = train_test_split(X, y, test_size=0.15)
X_valid, X_test, y_valid, y_test = train_test_split(X_, y_, test_size=0.5)

In [42]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report


boosting_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    use_label_encoder=False,
    max_depth=10,
    learning_rate=0.1,
    n_estimators=100
)

In [None]:
boosting_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [None]:
final_accuracy = accuracy_score(y_test, boosting_model.predict(X_test))
print("Final Accuracy:", final_accuracy)

Final Accuracy: 0.6333333333333333


In [37]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [43]:
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


def objective(trial):
    param = {
        'verbosity': 0,
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'booster': trial.suggest_categorical('booster', ['gbtree']),
        'early_stopping_rounds' : 100,
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 1e-2, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }

    model = xgb.XGBClassifier(**param)
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        verbose=False
    )

    preds = model.predict(X_test)
    final_accuracy = accuracy_score(y_test, preds)
    return 1 - final_accuracy


In [44]:
study = optuna.create_study(direction='minimize')  # For minimizing accuracy check
study.optimize(objective, n_trials=10)  # Adjust the number of trials as needed

# Print the best parameters and score
print("Best trial:")
print(study.best_trial.params)
print(f"Best: { 1 - study.best_value}")

[I 2024-12-12 13:44:10,965] A new study created in memory with name: no-name-2444bdd4-4f9b-4062-9211-72371e9399ca
[I 2024-12-12 13:45:00,666] Trial 0 finished with value: 0.4591194968553459 and parameters: {'booster': 'gbtree', 'lambda': 8.752401635452266, 'alpha': 3.6431634534695245, 'colsample_bytree': 0.7495234655279837, 'subsample': 0.9713774950201797, 'learning_rate': 0.02922934919590256, 'max_depth': 9, 'n_estimators': 160, 'min_child_weight': 7}. Best is trial 0 with value: 0.4591194968553459.
[I 2024-12-12 13:45:37,287] Trial 1 finished with value: 0.4591194968553459 and parameters: {'booster': 'gbtree', 'lambda': 1.6375478988212764, 'alpha': 4.395207613357456, 'colsample_bytree': 0.7718150306001039, 'subsample': 0.974997510845747, 'learning_rate': 0.02176613305378948, 'max_depth': 5, 'n_estimators': 153, 'min_child_weight': 3}. Best is trial 0 with value: 0.4591194968553459.
[I 2024-12-12 13:46:05,398] Trial 2 finished with value: 0.4591194968553459 and parameters: {'booster':

KeyboardInterrupt: 

In [None]:
boosting_model = XGBClassifier(**study.best_trial.params)

In [None]:
boosting_model.fit(X_train, y_train)

In [None]:
final_accuracy = accuracy_score(y_test, boosting_model.predict(X_test))
print("Final Accuracy:", final_accuracy)

Final Accuracy: 0.7


In [None]:
df_eval["EventType"] = boosting_model.predict(X_eval)

In [None]:
df_eval

Unnamed: 0,ID,Tweet,EventType
0,6_0,"[[2633, 2131, 2156, 2762, 2377, 16216, 2099, 0...",1
1,6_1,"[[19387, 5490, 6692, 26291, 2050, 2762, 3195, ...",1
2,6_10,"[[2051, 16216, 2099, 1058, 1043, 3270, 0, 0, 0...",1
3,6_100,"[[2208, 9701, 15088, 4590, 2386, 2100, 2088, 1...",1
4,6_101,"[[4380, 16526, 1047, 10483, 2063, 5020, 8923, ...",1
...,...,...,...
511,16_95,"[[2028, 14163, 22592, 2446, 2136, 2180, 2102, ...",1
512,16_96,"[[2088, 15569, 2444, 5034, 2497, 2689, 3672, 2...",1
513,16_97,"[[16216, 2099, 3335, 6531, 26252, 4686, 2123, ...",1
514,16_98,"[[1053, 12273, 2102, 3501, 26952, 16545, 13910...",1


In [145]:
df_eval[['ID', 'EventType']].to_csv('eval_csv_boosting_test_poor_low_data.csv', index=False)

In [143]:
df_eval[['ID', 'EventType']]

Unnamed: 0,ID,EventType
0,6_0,1
1,6_1,1
2,6_10,1
3,6_100,1
4,6_101,1
...,...,...
511,16_95,1
512,16_96,1
513,16_97,1
514,16_98,1


In [None]:
df_eval.to_csv('eval_csv_boosting_test_poor_low_data.csv')

In [None]:
!cp

In [None]:
boosting_model.predict(X_train)

array([0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,

In [None]:
np.array(X_train).shape

(331, 10000)

In [None]:
np.array(X_eval).shape

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (516,) + inhomogeneous part.

In [None]:
type(X_eval)

numpy.ndarray

In [None]:
import numpy as np

df_eval_numpy = np.load("./drive/MyDrive/eval.npy", allow_pickle=True)

In [None]:
import pandas as pd

df_eval_d = pd.DataFrame(df_eval_numpy)

In [None]:
df_eval_d

Unnamed: 0,0,1
0,6_0,"[[2633, 2131, 2156, 2762, 2377, 16216, 2099, 0..."
1,6_1,"[[19387, 5490, 6692, 26291, 2050, 2762, 3195, ..."
2,6_10,"[[2051, 16216, 2099, 1058, 1043, 3270, 0, 0, 0..."
3,6_100,"[[2208, 9701, 15088, 4590, 2386, 2100, 2088, 1..."
4,6_101,"[[4380, 16526, 1047, 10483, 2063, 5020, 8923, ..."
...,...,...
511,16_95,"[[2028, 14163, 22592, 2446, 2136, 2180, 2102, ..."
512,16_96,"[[2088, 15569, 2444, 5034, 2497, 2689, 3672, 2..."
513,16_97,"[[16216, 2099, 3335, 6531, 26252, 4686, 2123, ..."
514,16_98,"[[1053, 12273, 2102, 3501, 26952, 16545, 13910..."


In [None]:
X_eval = df_eval['Tweet'].apply(lambda x: list(itertools.chain.from_iterable(x))[:4000]).to_list()

In [None]:
l = 100000
for x in X_eval:
    if len(x) < l:
      l = len(x)
      print(l)


10000
9600
8832
6624
6208
6112
4224


list

In [None]:
df_eval_d

Unnamed: 0,0,1
0,6_0,"[[2633, 2131, 2156, 2762, 2377, 16216, 2099, 0..."
1,6_1,"[[19387, 5490, 6692, 26291, 2050, 2762, 3195, ..."
2,6_10,"[[2051, 16216, 2099, 1058, 1043, 3270, 0, 0, 0..."
3,6_100,"[[2208, 9701, 15088, 4590, 2386, 2100, 2088, 1..."
4,6_101,"[[4380, 16526, 1047, 10483, 2063, 5020, 8923, ..."
...,...,...
511,16_95,"[[2028, 14163, 22592, 2446, 2136, 2180, 2102, ..."
512,16_96,"[[2088, 15569, 2444, 5034, 2497, 2689, 3672, 2..."
513,16_97,"[[16216, 2099, 3335, 6531, 26252, 4686, 2123, ..."
514,16_98,"[[1053, 12273, 2102, 3501, 26952, 16545, 13910..."


In [None]:
boosting_model.predict(X_eval)

ValueError: setting an array element with a sequence.

In [38]:
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


def objective(trial):
    param = {
        'verbosity': 0,
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'booster': trial.suggest_categorical('booster', ['gbtree']),
        'early_stopping_rounds' : 100,
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }

    model = xgb.XGBClassifier(**param)
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        verbose=False
    )

    preds = model.predict(X_test)
    final_accuracy = accuracy_score(y_test, preds)
    return 1 - final_accuracy


In [39]:
study = optuna.create_study(direction='minimize')  # For minimizing 1 - accuracy check
study.optimize(objective, n_trials=50)  # Adjust the number of trials as needed

# Print the best parameters and score
print("Best trial:")
print(study.best_trial.params)
print(f"Best: { 1 - study.best_value}")

[I 2024-12-12 09:20:49,575] A new study created in memory with name: no-name-1501d36f-08d5-4105-b42b-0d153bf31a53
[W 2024-12-12 09:20:49,582] Trial 0 failed with parameters: {'booster': 'gbtree', 'lambda': 0.967434228966315, 'alpha': 3.1636188517695487, 'colsample_bytree': 0.7393666893428399, 'subsample': 0.8426256367688243, 'learning_rate': 0.09978596991385225, 'max_depth': 3, 'n_estimators': 1000, 'min_child_weight': 4} because of the following error: ValueError('DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:tokens: object').
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-38-d3c98ed8898b>", line 25, in objective
    model.fit(
  File "/usr/local/lib/python3.10/dist-packages/xgboost/core.py", line

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:tokens: object

In [12]:
study = optuna.create_study(direction='minimize')  # For minimizing accuracy check
study.optimize(objective, n_trials=10)  # Adjust the number of trials as needed

# Print the best parameters and score
print("Best trial:")
print(study.best_trial.params)
print(f"Best: { 1 - study.best_value}")

[I 2024-12-12 08:55:46,656] A new study created in memory with name: no-name-cf55dc65-2bdd-42f4-9b2e-c6fa1cb693e7
[W 2024-12-12 08:55:46,679] Trial 0 failed with parameters: {'booster': 'gbtree', 'lambda': 0.03462338500364572, 'alpha': 9.881528778712438, 'colsample_bytree': 0.600851032331498, 'subsample': 0.6595661447498142, 'learning_rate': 0.010679450526988299, 'max_depth': 3, 'n_estimators': 142, 'min_child_weight': 9} because of the following error: ValueError("Invalid classes inferred from unique values of `y`.  Expected: [   0    1    2 ... 1813 1814 1815], got ['0_0' '0_1' '0_10' ... '8_96' '8_97' '8_99']").
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-11-f345738a1fb8>", line 25, in objective
    model.fit(
  File "/usr/local/lib/python3.10/dist-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
  File "/usr/loc

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [   0    1    2 ... 1813 1814 1815], got ['0_0' '0_1' '0_10' ... '8_96' '8_97' '8_99']

In [46]:
import requests
import zipfile
import os

# Define the URL for the 400D GloVe embeddings
url = 'http://nlp.stanford.edu/data/glove.6B.zip'
download_dir = 'glove_embeddings'

# Create the directory to store embeddings if it doesn't exist
if not os.path.exists(download_dir):
    os.makedirs(download_dir)

# Download the file
response = requests.get(url, stream=True)
zip_file_path = os.path.join(download_dir, 'glove.6B.zip')

with open(zip_file_path, 'wb') as f:
    for chunk in response.iter_content(chunk_size=128):
        f.write(chunk)

# Extract the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(download_dir)

# Path to the 400D embedding file
embedding_file = os.path.join(download_dir, 'glove.6B.400d.txt')

print(f"GloVe 400D embeddings downloaded and extracted to {embedding_file}")


GloVe 400D embeddings downloaded and extracted to glove_embeddings/glove.6B.400d.txt


In [48]:
import numpy as np
import nltk
from nltk.tokenize import word_tokenize

# Download the punkt tokenizer if not already installed
# nltk.download('punkt')

# Load the GloVe embeddings (adjust path to where your embeddings are saved)
embedding_file = 'glove_embeddings/glove.6B.300d.txt'  # Change this if using different embeddings

# Create a dictionary to store the word embeddings
embeddings_index = {}

# Load the GloVe embeddings into the dictionary
with open(embedding_file, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = vector

# Tokenize the input text
def tokenize_text(text):
    return word_tokenize(text.lower())  # Convert to lowercase for consistency

# Function to compute the average embedding of a text
def get_average_embedding(text, embeddings_index):
    tokens = tokenize_text(text)
    embeddings = []

    for word in tokens:
        if word in embeddings_index:
            embeddings.append(embeddings_index[word])

    if embeddings:
        # Average embedding of the text
        return np.mean(embeddings, axis=0)
    else:
        return None  # In case no words were found in the embeddings

# Example usage
text = "Football is a great sport to enjoy with friends."

average_embedding = get_average_embedding(text, embeddings_index)

if average_embedding is not None:
    print("Average embedding shape:", average_embedding.shape)
    print(average_embedding)  # Display the average embedding vector
else:
    print("No words found in the embeddings.")


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
