In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [30]:
from transformers import AutoTokenizer
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
import re
import nltk
import os
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [31]:
def new_preprocess_text(text):
    """
    Preprocess text by lowercasing, removing punctuation, numbers, stopwords, and lemmatizing.

    Args:
        text (str): Text to be preprocessed.

    Returns:
        str: Preprocessed text.
    """

    text = re.sub(r'@\w+', '', text)
    # Remove URLs
    text = re.sub(r'http[s]?://\S+', '', text)
    # Remove hashtags but keep the text
    text = re.sub(r'#', '', text)
    # Remove RTs
    text = re.sub(r'\bRT\b', '', text)
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    text = re.sub(r'\b(worldcup|world|cup|brazil|brasil|mex|neymar|camaroes|bra|cam|cmr)\b', '', text)


    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Lowercasing
    text = text.lower()

    # Tokenization
    words = text.split()

    # Remove stopwords
    stop_words = set(nltk.corpus.stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

In [26]:
!cp -r /content/drive/MyDrive/challenge_data/train_tweets/ ./
!cp -r /content/drive/MyDrive/challenge_data/eval_tweets/ ./

In [32]:
def process_csv_eval(file_path):
    """
    Process a CSV file to extract and tokenize data.

    Args:
        file_path (str): Path to the CSV file.
        l (int): Desired length of token arrays for the 'Tweet' column.
        with_period_id (bool): Whether to include the 'PeriodID' in the tweet text.

    Returns:
        pd.DataFrame: Processed DataFrame with columns 'PeriodID', 'EventType', and 'Tweet'.
    """
    # Read the CSV file
    df = pd.read_csv(file_path)
    df = df[['ID', 'Tweet']]

    df['Tweet'] = df['Tweet'].apply(new_preprocess_text)

    return df

In [33]:
def read_csv(folder_path, process):
    """
    Read all CSV files in a folder and process them.

    Args:
        folder_path (str): Path to the folder containing the CSV files.
        l (int): Desired length of token arrays for the 'Tweet' column.
        with_period_id (bool): Whether to include the 'PeriodID' in the tweet text.

    Returns:
        pd.DataFrame: Processed DataFrame with columns 'PeriodID', 'EventType' (optional), and 'Tweet' (tokenized).
    """
    li = []
    for filename in os.listdir(folder_path):
        df = process(folder_path + filename)
        li.append(df)
    df = pd.concat(li, ignore_index=True)
    return df

In [34]:
df_eval = read_csv("./eval_tweets/", process_csv_eval)

In [35]:
df_eval

Unnamed: 0,ID,Tweet
0,6_0,finally get see germany play ger
1,6_0,boateng brother score today well give away pai...
2,6_0,fascinated gervsgha match tell u lot chance us...
3,6_0,ger gha
4,6_0,boateng grudge match jermaine score ger kevinp...
...,...,...
1072923,16_129,let go usa
1072924,16_129,another upset wc srb beat ger
1072925,16_129,ger srb final whistle sound
1072926,16_129,dukung yg menang fra arg hbu


In [36]:
df_eval = df_eval.drop_duplicates(subset='Tweet', keep='first').reset_index(drop=True)

In [41]:
df_eval_new = df_eval.groupby(['ID'])['Tweet'].apply(lambda x: ' '.join(x)).reset_index()

In [47]:
df_eval_new['Tweet'].apply(len).argmax()

29

In [52]:
np.save('drive/MyDrive/cleaned_eval_dataset_training.npy', df_eval_new.to_numpy())

In [7]:
import numpy as np
import pandas as pd

df_n = np.load('drive/MyDrive/cleaned_train_dataset_megafinal_processed.npy', allow_pickle=True)

In [8]:
df = pd.DataFrame(df_n)

In [9]:
df

Unnamed: 0,0,1,2
0,0_0,0,pick side retweet hon favorite sui honvssui gr...
1,0_1,0,first dilemma ecuador v france honduras v swit...
2,0_10,1,right france v ecuador main match tonight look...
3,0_100,1,hattrick shaqiri honduras switzerland safe tri...
4,0_101,1,hon valladares need move overcant support garb...
...,...,...,...
2132,8_95,0,great shape top group stop goal assist game ha...
2133,8_96,1,pk cro game joke pk cro game nowtheyreeven dea...
2134,8_97,1,one best game tournament take account whole ga...
2135,8_98,0,great choice referee congrats cro guess lost p...


In [10]:
df[1].apply(len).max()

138

In [13]:
df[1] = df[1].apply(new_preprocess_text)

In [16]:
np.save('drive/MyDrive/cleaned_train_dataset_megafinal.npy', df.to_numpy())

In [7]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset

def clean_bad_events(df, threshold=0.1):
    # Define the words to filter by
    words = ['full time  goal half time kick off owngoal penalty red card yellow card']
    # Load a pre-trained sentence transformer model
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Encode the words and tweets
    word_embeddings = model.encode(words)

    # Filter rows where EventType = 1
    filtered_df = df[df[2] == 1].reset_index()

    # Encode the filtered tweets
    filtered_tweet_embeddings = model.encode(filtered_df[1])

    # Calculate cosine similarity for filtered tweets
    filtered_similarity_matrix = cosine_similarity(filtered_tweet_embeddings, word_embeddings)

    # Filter based on similarity threshold
    filtered_indices = np.where(np.max(filtered_similarity_matrix, axis=1) >= threshold)[0]

    # Get the final filtered tweets
    final_filtered_df = filtered_df.iloc[filtered_indices]


    return final_filtered_df

  from tqdm.autonotebook import tqdm, trange


In [9]:
final_filtered_df = clean_bad_events(df)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [12]:
len(final_filtered_df)/len(df[df[2] == 1])

0.9032246751521826

In [14]:
# merged_df = pd.merge(df[df[2] == 0], final_filtered_df, on=1, how='inner')
concatenated_df = pd.concat([df[df[2] == 0], final_filtered_df], ignore_index=True).reset_index()
concatenated_df

Unnamed: 0,0,1,2,index
0,8_0,v oscar score first well give away shirt choic...,0,
1,8_0,score v well award pair hypervenom one lucky w...,0,
2,8_0,winner v match well give away jersey enter fol...,0,
3,8_0,score st v give away different color magistas ...,0,
4,8_0,got pretty sharp lady,0,
...,...,...,...,...
2205257,10_179,congrats germanywe still war ger,1,2334952.0
2205258,10_179,bet argentina player get nightmare hear happy ...,1,2334953.0
2205259,10_179,u idea turn germany,1,2334954.0
2205260,10_179,zee argies beat u soo cry argentina gro like,1,2334955.0


In [17]:
df_new = concatenated_df.groupby([0, 2])[1].apply(lambda x: ' '.join(x)).reset_index()

In [18]:
df_new

Unnamed: 0,0,2,1
0,0_0,0,pick side retweet hon favorite sui honvssui gr...
1,0_1,0,first dilemma ecuador v france honduras v swit...
2,0_10,1,right france v ecuador main match tonight look...
3,0_100,1,hattrick shaqiri honduras switzerland safe tri...
4,0_101,1,hon valladares need move overcant support garb...
...,...,...,...
2132,8_95,0,great shape top group stop goal assist game ha...
2133,8_96,1,pk cro game joke pk cro game nowtheyreeven dea...
2134,8_97,1,one best game tournament take account whole ga...
2135,8_98,0,great choice referee congrats cro guess lost p...


In [20]:
np.save('drive/MyDrive/cleaned_train_dataset_megafinal_processed.npy', df_new.to_numpy())

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from tqdm.auto import tqdm  # Added tqdm import

import torch.optim as optim
import torch.nn as nn
import random

class TweetSentenceClassifier:
    def __init__(self, model_name='distilbert-base-uncased', num_labels=None):
        """
        Initialize the Tweet Sentence Classifier

        :param model_name: Pretrained model to use as base
        :param num_labels: Number of classification categories
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # If num_labels is not specified, it will be inferred from the dataset
        self.num_labels = num_labels
        self.model = None

    def prepare_dataset(self, df, text_column, label_column, test_size=0.2, random_state=42):
        """
        Prepare dataset for training

        :param df: Pandas DataFrame containing tweets
        :param text_column: Name of column with tweet text
        :param label_column: Name of column with labels
        :param test_size: Proportion of dataset to use for testing
        :param random_state: Random seed for reproducibility
        :return: Tuple of train and test datasets
        """
        # Infer number of labels if not specified
        if self.num_labels is None:
            self.num_labels = len(df[label_column].unique())

        # Initialize model with correct number of labels
        self.model = AutoModelForSequenceClassification.from_pretrained(
            'distilbert-base-uncased',
            num_labels=self.num_labels
        ).to(self.device)

        # Tokenization function
        def tokenize_function(text):
            return self.tokenizer(
                text,
                padding='max_length',
                truncation=True,
                max_length=512
            )

        # Split the data
        train_df, test_df = train_test_split(
            df,
            test_size=test_size,
            random_state=random_state,
        )

        # Tokenize datasets
        train_df['tokenized'] = train_df[text_column].apply(tokenize_function)
        test_df['tokenized'] = test_df[text_column].apply(tokenize_function)

        return train_df, test_df

    def prepare_general_dataset(self, df, text_column, label_column, test_size=0.2, random_state=42):
        """
        Prepare general dataset for training

        :param df: Pandas DataFrame containing tweets
        :param text_column: Name of column with tweet text
        :param label_column: Name of column with labels
        :param test_size: Proportion of dataset to use for testing
        :param random_state: Random seed for reproducibility
        :return: Tuple of train and test datasets
        """
        # Infer number of labels if not specified
        if self.num_labels is None:
            self.num_labels = len(df[label_column].unique())

        # Initialize model with correct number of labels
        self.model = AutoModelForSequenceClassification.from_pretrained(
            'distilbert-base-uncased',
            num_labels=self.num_labels
        ).to(self.device)

        # Tokenization function
        def tokenize_function(text):
            return self.tokenizer(
                text,
                truncation=True,
                max_length=len(text)*10
            )

        # Split the data
        train_df, test_df = train_test_split(
            df,
            test_size=test_size,
            random_state=random_state,
        )

        # Tokenize datasets
        train_df['tokenized'] = train_df[text_column].apply(tokenize_function)
        test_df['tokenized'] = test_df[text_column].apply(tokenize_function)

        return train_df, test_df


    def prepare_loaders_from_generals(self, train_df, test_df, label_column, max_length=512):
        def process_tokens(tokenized_items):
            processed_input_ids = []
            processed_attention_masks = []

            for item in tokenized_items:
                input_ids = item['input_ids']
                attention_mask = item['attention_mask']

                # If sequence is shorter than 512, pad
                if len(input_ids) <= max_length:
                    pad_length = max_length - len(input_ids)
                    padded_input_ids = input_ids + [self.tokenizer.pad_token_id] * pad_length
                    padded_attention_mask = attention_mask + [0] * pad_length
                    processed_input_ids.append(padded_input_ids)
                    processed_attention_masks.append(padded_attention_mask)

                # If sequence is longer than 512, choose random start
                else:
                    # Calculate the maximum possible start index
                    max_start_index = len(input_ids) - max_length

                    # Choose a random start index
                    start_index = random.randint(0, max_start_index)

                    # Extract 512 consecutive tokens
                    windowed_input_ids = input_ids[start_index:start_index + max_length]
                    windowed_attention_mask = attention_mask[start_index:start_index + max_length]

                    processed_input_ids.append(windowed_input_ids)
                    processed_attention_masks.append(windowed_attention_mask)

            return processed_input_ids, processed_attention_masks

        # Ensure random seed for reproducibility
        # random.seed(42)

        # Process train and test data
        train_input_ids, train_attention_mask = process_tokens(train_df['tokenized'])
        test_input_ids, test_attention_mask = process_tokens(test_df['tokenized'])

        # Convert to tensors
        train_input_ids = torch.tensor(train_input_ids)
        train_attention_mask = torch.tensor(train_attention_mask)
        # train_labels = torch.tensor(train_df[label_column].values)

        test_input_ids = torch.tensor(test_input_ids)
        test_attention_mask = torch.tensor(test_attention_mask)
        # test_labels = torch.tensor(test_df[label_column].values)

        train_labels = torch.tensor(np.array(train_df[label_column].to_list(), dtype=np.int64), dtype=torch.long)
        test_labels = torch.tensor(np.array(test_df[label_column].to_list(), dtype=np.int64), dtype=torch.long)

        # Create data loaders
        train_dataset = torch.utils.data.TensorDataset(
            train_input_ids,
            train_attention_mask,
            train_labels
        )
        test_dataset = torch.utils.data.TensorDataset(
            test_input_ids,
            test_attention_mask,
            test_labels
        )

        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=32,
            shuffle=False
        )
        test_loader = torch.utils.data.DataLoader(
            test_dataset,
            batch_size=32,
            shuffle=False
        )

        return train_loader, test_loader


    def compute_metrics(self, pred):
        """
        Compute evaluation metrics

        :param pred: Predictions from the model
        :return: Dictionary of metrics
        """
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)

        # Compute classification report
        report = classification_report(labels, preds, output_dict=True)

        return {
            'accuracy': report['accuracy'],
            'macro_f1': report['macro avg']['f1-score'],
            'weighted_f1': report['weighted avg']['f1-score']
        }

    def custom_loader_train(self, train_df, test_df, label_column, epochs=1, learning_rate=2e-5):
        """
        Custom training method with tqdm progress bars

        :param train_df: Training dataframe with tokenized column
        :param test_df: Test dataframe with tokenized column
        :param label_column: Name of the label column
        :param epochs: Number of training epochs
        :param learning_rate: Learning rate for optimizer
        """

        # test_input_ids = torch.tensor([x['input_ids'] for x in test_df['tokenized']])
        # test_attention_mask = torch.tensor([x['attention_mask'] for x in test_df['tokenized']])
        # test_labels = torch.tensor(test_df[label_column].values)

        # # Create data loaders

        # test_dataset = torch.utils.data.TensorDataset(
        #     test_input_ids,
        #     test_attention_mask,
        #     test_labels
        # )

        # test_loader = torch.utils.data.DataLoader(
        #     test_dataset,
        #     batch_size=32,
        #     shuffle=False
        # )



        # Prepare optimizer and loss
        optimizer = optim.AdamW(
            self.model.parameters(),
            lr=learning_rate
        )
        criterion = nn.CrossEntropyLoss()

        # Training loop with tqdm progress bars
        for epoch in range(epochs):
            # Training phase
            self.model.train()
            train_loss = 0
            train_correct = 0
            train_total = 0

            train_loader, test_loader = self.prepare_loaders_from_generals(train_df, test_df, label_column)


            # Wrap train_loader with tqdm for progress bar
            train_progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} (Training)", leave=False)
            for batch in train_progress_bar:
                # Unpack batch
                input_ids, attention_mask, labels = [b.to(self.device) for b in batch]

                # Zero gradients
                optimizer.zero_grad()

                # Forward pass
                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                loss = outputs.loss
                logits = outputs.logits

                # Backward pass
                loss.backward()
                optimizer.step()

                # Compute accuracy
                train_loss += loss.item()
                _, predicted = torch.max(logits, 1)
                train_total += labels.size(0)
                train_correct += (predicted == labels).sum().item()

                # Update progress bar
                train_progress_bar.set_postfix({
                    'Loss': f'{loss.item():.4f}',
                    'Accuracy': f'{100 * train_correct / train_total:.2f}%'
                })

            # Validation phase
            self.model.eval()
            val_loss = 0
            val_correct = 0
            val_total = 0
            all_preds = []
            all_labels = []

            # Wrap test_loader with tqdm for progress bar
            val_progress_bar = tqdm(test_loader, desc=f"Epoch {epoch+1}/{epochs} (Validation)", leave=False)
            with torch.no_grad():
                for batch in val_progress_bar:
                    input_ids, attention_mask, labels = [b.to(self.device) for b in batch]

                    outputs = self.model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=labels
                    )
                    loss = outputs.loss
                    logits = outputs.logits

                    val_loss += loss.item()
                    _, predicted = torch.max(logits, 1)
                    val_total += labels.size(0)
                    val_correct += (predicted == labels).sum().item()

                    all_preds.extend(predicted.cpu().numpy())
                    all_labels.extend(labels.cpu().numpy())

                    # Update progress bar
                    val_progress_bar.set_postfix({
                        'Loss': f'{loss.item():.4f}',
                        'Accuracy': f'{100 * val_correct / val_total:.2f}%'
                    })

            # Print epoch statistics
            # print(f"Epoch {epoch+1}/{epochs}")
            # print(f"Train Loss: {train_loss/len(train_loader):.4f}")
            # print(f"Train Accuracy: {100 * train_correct / train_total:.2f}%")
            # print(f"Val Loss: {val_loss/len(test_loader):.4f}")
            # print(f"Val Accuracy: {100 * val_correct / val_total:.2f}%")

            # Compute detailed classification report
            # report = classification_report(all_labels, all_preds)
            # print("Classification Report:")
            # print(report)

        return self.model

    def custom_train(self, train_df, test_df, label_column, epochs=1, learning_rate=2e-5):
        """
        Custom training method with tqdm progress bars

        :param train_df: Training dataframe with tokenized column
        :param test_df: Test dataframe with tokenized column
        :param label_column: Name of the label column
        :param epochs: Number of training epochs
        :param learning_rate: Learning rate for optimizer
        """
        # Prepare data tensors
        train_input_ids = torch.tensor([x['input_ids'] for x in train_df['tokenized']])
        train_attention_mask = torch.tensor([x['attention_mask'] for x in train_df['tokenized']])
        train_labels = torch.tensor(train_df[label_column].values)

        test_input_ids = torch.tensor([x['input_ids'] for x in test_df['tokenized']])
        test_attention_mask = torch.tensor([x['attention_mask'] for x in test_df['tokenized']])
        test_labels = torch.tensor(test_df[label_column].values)

        # Create data loaders
        train_dataset = torch.utils.data.TensorDataset(
            train_input_ids,
            train_attention_mask,
            train_labels
        )
        test_dataset = torch.utils.data.TensorDataset(
            test_input_ids,
            test_attention_mask,
            test_labels
        )

        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=32,
            shuffle=True
        )
        test_loader = torch.utils.data.DataLoader(
            test_dataset,
            batch_size=32,
            shuffle=False
        )

        # Prepare optimizer and loss
        optimizer = optim.AdamW(
            self.model.parameters(),
            lr=learning_rate
        )
        criterion = nn.CrossEntropyLoss()

        # Training loop with tqdm progress bars
        for epoch in range(epochs):
            # Training phase
            self.model.train()
            train_loss = 0
            train_correct = 0
            train_total = 0

            # Wrap train_loader with tqdm for progress bar
            train_progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} (Training)", leave=False)
            for batch in train_progress_bar:
                # Unpack batch
                input_ids, attention_mask, labels = [b.to(self.device) for b in batch]

                # Zero gradients
                optimizer.zero_grad()

                # Forward pass
                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                loss = outputs.loss
                logits = outputs.logits

                # Backward pass
                loss.backward()
                optimizer.step()

                # Compute accuracy
                train_loss += loss.item()
                _, predicted = torch.max(logits, 1)
                train_total += labels.size(0)
                train_correct += (predicted == labels).sum().item()

                # Update progress bar
                train_progress_bar.set_postfix({
                    'Loss': f'{loss.item():.4f}',
                    'Accuracy': f'{100 * train_correct / train_total:.2f}%'
                })

            # Validation phase
            self.model.eval()
            val_loss = 0
            val_correct = 0
            val_total = 0
            all_preds = []
            all_labels = []

            # Wrap test_loader with tqdm for progress bar
            val_progress_bar = tqdm(test_loader, desc=f"Epoch {epoch+1}/{epochs} (Validation)", leave=False)
            with torch.no_grad():
                for batch in val_progress_bar:
                    input_ids, attention_mask, labels = [b.to(self.device) for b in batch]

                    outputs = self.model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=labels
                    )
                    loss = outputs.loss
                    logits = outputs.logits

                    val_loss += loss.item()
                    _, predicted = torch.max(logits, 1)
                    val_total += labels.size(0)
                    val_correct += (predicted == labels).sum().item()

                    all_preds.extend(predicted.cpu().numpy())
                    all_labels.extend(labels.cpu().numpy())

                    # Update progress bar
                    val_progress_bar.set_postfix({
                        'Loss': f'{loss.item():.4f}',
                        'Accuracy': f'{100 * val_correct / val_total:.2f}%'
                    })

            # Print epoch statistics
            print(f"Epoch {epoch+1}/{epochs}")
            print(f"Train Loss: {train_loss/len(train_loader):.4f}")
            print(f"Train Accuracy: {100 * train_correct / train_total:.2f}%")
            print(f"Val Loss: {val_loss/len(test_loader):.4f}")
            print(f"Val Accuracy: {100 * val_correct / val_total:.2f}%")

            # Compute detailed classification report
            # report = classification_report(all_labels, all_preds)
            print("Classification Report:")
            # print(report)

        return self.model

    def predict(self, texts, start=0):
        """
        Make predictions on new texts

        :param texts: List of text strings to classify
        :return: Predictions and their probabilities
        """
        # Tokenize inputs
        inputs = self.tokenizer(
            texts[start:],
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt'
        ).to(self.device)

        # Get model predictions
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(**inputs)
            probs = torch.softmax(outputs.logits, dim=1)
            predictions = torch.argmax(probs, dim=1)

        return predictions.cpu().numpy(), probs.cpu().numpy()

    def predict_iterate(self, texts):
        """
        Make predictions on new texts

        :param texts: List of text strings to classify
        :return: Predictions and their probabilities
        """
        # Tokenize inputs
        inputs = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt'
        ).to(self.device)

        # Get model predictions
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(**inputs)
            probs = torch.softmax(outputs.logits, dim=1)
            predictions = torch.argmax(probs, dim=1)

        return predictions.cpu().numpy(), probs.cpu().numpy()

In [15]:
# Initialize classifier
classifier = TweetSentenceClassifier(num_labels=2)  # Specify number of labels

# Prepare dataset (adjust column names as needed)
train_dataset, test_dataset = classifier.prepare_general_dataset(
    df,
    text_column=2,
    label_column=1
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Train the model
model = classifier.custom_loader_train(train_dataset, test_dataset, 1, )

Epoch 1/20 (Training):   0%|          | 0/54 [00:00<?, ?it/s]

Epoch 1/20 (Validation):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 2/20 (Training):   0%|          | 0/54 [00:00<?, ?it/s]

Epoch 2/20 (Validation):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 3/20 (Training):   0%|          | 0/54 [00:00<?, ?it/s]

Epoch 3/20 (Validation):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 4/20 (Training):   0%|          | 0/54 [00:00<?, ?it/s]

Epoch 4/20 (Validation):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 5/20 (Training):   0%|          | 0/54 [00:00<?, ?it/s]

Epoch 5/20 (Validation):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 6/20 (Training):   0%|          | 0/54 [00:00<?, ?it/s]

Epoch 6/20 (Validation):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 7/20 (Training):   0%|          | 0/54 [00:00<?, ?it/s]

Epoch 7/20 (Validation):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 8/20 (Training):   0%|          | 0/54 [00:00<?, ?it/s]

Epoch 8/20 (Validation):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 9/20 (Training):   0%|          | 0/54 [00:00<?, ?it/s]

Epoch 9/20 (Validation):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 10/20 (Training):   0%|          | 0/54 [00:00<?, ?it/s]

Epoch 10/20 (Validation):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 11/20 (Training):   0%|          | 0/54 [00:00<?, ?it/s]

Epoch 11/20 (Validation):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 12/20 (Training):   0%|          | 0/54 [00:00<?, ?it/s]

Epoch 12/20 (Validation):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 13/20 (Training):   0%|          | 0/54 [00:00<?, ?it/s]

Epoch 13/20 (Validation):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 14/20 (Training):   0%|          | 0/54 [00:00<?, ?it/s]

Epoch 14/20 (Validation):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 15/20 (Training):   0%|          | 0/54 [00:00<?, ?it/s]

Epoch 15/20 (Validation):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 16/20 (Training):   0%|          | 0/54 [00:00<?, ?it/s]

Epoch 16/20 (Validation):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 17/20 (Training):   0%|          | 0/54 [00:00<?, ?it/s]

Epoch 17/20 (Validation):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 18/20 (Training):   0%|          | 0/54 [00:00<?, ?it/s]

Epoch 18/20 (Validation):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 19/20 (Training):   0%|          | 0/54 [00:00<?, ?it/s]

Epoch 19/20 (Validation):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 20/20 (Training):   0%|          | 0/54 [00:00<?, ?it/s]

Epoch 20/20 (Validation):   0%|          | 0/14 [00:00<?, ?it/s]

In [54]:
predictions, probabilities = classifier.predict(df_eval_new['Tweet'][:20].to_list())

In [19]:
df_eval['EventType'] = classifier.predict(df_eval['Tweet'].to_list())[0]

NameError: name 'df_eval' is not defined

In [59]:
from collections import Counter

Counter(df_eval_new['EventType'])

Counter({0: 424, 1: 92})

In [60]:
predictions, probabilities

(array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]),
 array([[0.97576356, 0.0242365 ],
        [0.23313764, 0.7668624 ],
        [0.886561  , 0.11343905],
        [0.65682214, 0.34317783],
        [0.8869003 , 0.11309972],
        [0.9765999 , 0.02340011],
        [0.91666776, 0.08333227],
        [0.97117203, 0.02882802],
        [0.64504737, 0.35495263],
        [0.7895807 , 0.21041925],
        [0.75946593, 0.2405341 ],
        [0.95754486, 0.04245519],
        [0.8116191 , 0.18838088],
        [0.96794355, 0.03205652],
        [0.58525306, 0.4147469 ],
        [0.65829104, 0.34170893],
        [0.58240956, 0.41759047],
        [0.20356722, 0.79643273],
        [0.30265182, 0.69734824],
        [0.27001643, 0.7299836 ]], dtype=float32))

In [74]:
def predict(classifier, texts):
    """
    Make predictions on new texts

    :param texts: List of text strings to classify
    :return: Predictions and their probabilities
    """
    # Tokenize inputs
    inputs = classifier.tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt'
    ).to(classifier.device)

    # Get model predictions
    classifier.model.eval()
    with torch.no_grad():
        outputs = classifier.model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        predictions = torch.argmax(probs, dim=1)

    return predictions.cpu().numpy(), probs.cpu().numpy()

In [79]:
Counter(predict(classifier, df_eval_new['Tweet'].apply(lambda x: x[4000:]).to_list())[0])

Counter({0: 420, 1: 96})

In [84]:
df_eval_new[['ID','EventType']].to_csv('data_bert_20min.csv', index=False)

In [21]:
torch.save(classifier.model.state_dict(), 'drive/MyDrive/poor_bert_v2.pth')