In [3]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [77]:
!cp -r /content/drive/MyDrive/challenge_data/train_tweets/ ./
!cp -r /content/drive/MyDrive/challenge_data/eval_tweets/ ./

In [2]:
from transformers import AutoTokenizer
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
import re
import nltk
import os
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [10]:
def new_preprocess_text(text):
    """
    Preprocess text by lowercasing, removing punctuation, numbers, stopwords, and lemmatizing.

    Args:
        text (str): Text to be preprocessed.

    Returns:
        str: Preprocessed text.
    """

    text = re.sub(r'@\w+', '', text)
    # Remove URLs
    text = re.sub(r'http[s]?://\S+', '', text)
    # Remove hashtags but keep the text
    text = re.sub(r'#', '', text)
    # Remove RTs
    text = re.sub(r'\bRT\b', '', text)
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    text = re.sub(r'\b(worldcup|world|cup|brazil|brasil|mex|neymar|camaroes|bra|cam|cmr)\b', '', text)


    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Lowercasing
    text = text.lower()

    # Tokenization
    words = text.split()

    # Remove stopwords
    stop_words = set(nltk.corpus.stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

In [4]:
def process_csv_train(file_path):
    """
    Process a CSV file to extract and tokenize data.

    Args:
        file_path (str): Path to the CSV file.
        l (int): Desired length of token arrays for the 'Tweet' column.
        with_period_id (bool): Whether to include the 'PeriodID' in the tweet text.

    Returns:
        pd.DataFrame: Processed DataFrame with columns 'PeriodID', 'EventType', and 'Tweet'.
    """
    # Read the CSV file
    df = pd.read_csv(file_path)
    df = df[['ID', 'Tweet', 'EventType']]

    df = df.drop_duplicates(subset='Tweet', keep='first').reset_index(drop=True)
    df['Tweet'] = df['Tweet'].apply(new_preprocess_text)
    df = df.drop_duplicates(subset='Tweet', keep='first').reset_index(drop=True)

    return df

In [75]:
def process_csv_eval(file_path):
    """
    Process a CSV file to extract and tokenize data.

    Args:
        file_path (str): Path to the CSV file.
        l (int): Desired length of token arrays for the 'Tweet' column.
        with_period_id (bool): Whether to include the 'PeriodID' in the tweet text.

    Returns:
        pd.DataFrame: Processed DataFrame with columns 'PeriodID', 'EventType', and 'Tweet'.
    """
    # Read the CSV file
    df = pd.read_csv(file_path)
    df = df[['ID', 'Tweet']]
    df = df.drop_duplicates(subset='Tweet', keep='first').reset_index(drop=True)
    df['Tweet'] = df['Tweet'].apply(new_preprocess_text)
    df = df.drop_duplicates(subset='Tweet', keep='first').reset_index(drop=True)

    return df

In [5]:
def read_csv(folder_path, process):
    """
    Read all CSV files in a folder and process them.

    Args:
        folder_path (str): Path to the folder containing the CSV files.
        l (int): Desired length of token arrays for the 'Tweet' column.
        with_period_id (bool): Whether to include the 'PeriodID' in the tweet text.

    Returns:
        pd.DataFrame: Processed DataFrame with columns 'PeriodID', 'EventType' (optional), and 'Tweet' (tokenized).
    """
    li = []
    for filename in os.listdir(folder_path)[:1]:
        df = process(folder_path + filename)
        li.append(df)
    df = pd.concat(li, ignore_index=True)
    return df

In [11]:
df = read_csv("./train_tweets/", process_csv_train)

In [13]:
df

Unnamed: 0,ID,Tweet,EventType
0,2_0,esp beat au well give away spain jersey one lu...,0
1,2_0,visit sitep official web site spanis real stat...,0
2,2_0,winner au v esp match well give away jersey en...,0
3,2_0,au beat esp well give away australia jersey on...,0
4,2_0,record iniesta spain game w l fcblive esp goal...,0
...,...,...,...
18569,2_129,stats australia spain ausesp worldcup groupb a...,1
18570,2_129,spain bahar ho gai na whatstill esp supporter,1
18571,2_129,mata scored goal doesnt mata esp,1
18572,2_129,two brilliant game football glad watched first...,1


In [14]:
df_new = df.groupby(['ID', 'EventType'])['Tweet'].apply(lambda x: ' '.join(x)).reset_index()

In [78]:
df_eval = read_csv("./eval_tweets/", process_csv_eval)

In [84]:
df_eval_new = df_eval.groupby(['ID'])['Tweet'].apply(lambda x: ' '.join(x)).reset_index()

In [279]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from tqdm.auto import tqdm  # Added tqdm import

import torch.optim as optim
import torch.nn as nn
import random

class TweetSentenceClassifier:
    def __init__(self, model_name='distilbert-base-uncased', num_labels=None):
        """
        Initialize the Tweet Sentence Classifier

        :param model_name: Pretrained model to use as base
        :param num_labels: Number of classification categories
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # If num_labels is not specified, it will be inferred from the dataset
        self.num_labels = num_labels
        self.model = None

    def prepare_dataset(self, df, text_column, label_column, test_size=0.2, random_state=42):
        """
        Prepare dataset for training

        :param df: Pandas DataFrame containing tweets
        :param text_column: Name of column with tweet text
        :param label_column: Name of column with labels
        :param test_size: Proportion of dataset to use for testing
        :param random_state: Random seed for reproducibility
        :return: Tuple of train and test datasets
        """
        # Infer number of labels if not specified
        if self.num_labels is None:
            self.num_labels = len(df[label_column].unique())

        # Initialize model with correct number of labels
        self.model = AutoModelForSequenceClassification.from_pretrained(
            'distilbert-base-uncased',
            num_labels=self.num_labels
        ).to(self.device)

        # Tokenization function
        def tokenize_function(text):
            return self.tokenizer(
                text,
                padding='max_length',
                truncation=True,
                max_length=512
            )

        # Split the data
        train_df, test_df = train_test_split(
            df,
            test_size=test_size,
            random_state=random_state,
        )

        # Tokenize datasets
        train_df['tokenized'] = train_df[text_column].apply(tokenize_function)
        test_df['tokenized'] = test_df[text_column].apply(tokenize_function)

        return train_df, test_df

    def prepare_general_dataset(self, df, text_column, label_column, test_size=0.2, random_state=42):
        """
        Prepare general dataset for training

        :param df: Pandas DataFrame containing tweets
        :param text_column: Name of column with tweet text
        :param label_column: Name of column with labels
        :param test_size: Proportion of dataset to use for testing
        :param random_state: Random seed for reproducibility
        :return: Tuple of train and test datasets
        """
        # Infer number of labels if not specified
        if self.num_labels is None:
            self.num_labels = len(df[label_column].unique())

        # Initialize model with correct number of labels
        self.model = AutoModelForSequenceClassification.from_pretrained(
            'distilbert-base-uncased',
            num_labels=self.num_labels
        ).to(self.device)

        # Tokenization function
        def tokenize_function(text):
            return self.tokenizer(
                text,
                truncation=True,
                max_length=len(text)*10
            )

        # Split the data
        train_df, test_df = train_test_split(
            df,
            test_size=test_size,
            random_state=random_state,
        )

        # Tokenize datasets
        train_df['tokenized'] = train_df[text_column].apply(tokenize_function)
        test_df['tokenized'] = test_df[text_column].apply(tokenize_function)

        return train_df, test_df


    def prepare_loaders_from_generals(self, train_df, test_df, label_column, max_length=512):
        def process_tokens(tokenized_items):
            processed_input_ids = []
            processed_attention_masks = []

            for item in tokenized_items:
                input_ids = item['input_ids']
                attention_mask = item['attention_mask']

                # If sequence is shorter than 512, pad
                if len(input_ids) <= max_length:
                    pad_length = max_length - len(input_ids)
                    padded_input_ids = input_ids + [self.tokenizer.pad_token_id] * pad_length
                    padded_attention_mask = attention_mask + [0] * pad_length
                    processed_input_ids.append(padded_input_ids)
                    processed_attention_masks.append(padded_attention_mask)

                # If sequence is longer than 512, choose random start
                else:
                    # Calculate the maximum possible start index
                    max_start_index = len(input_ids) - max_length

                    # Choose a random start index
                    start_index = random.randint(0, max_start_index)

                    # Extract 512 consecutive tokens
                    windowed_input_ids = input_ids[start_index:start_index + max_length]
                    windowed_attention_mask = attention_mask[start_index:start_index + max_length]

                    processed_input_ids.append(windowed_input_ids)
                    processed_attention_masks.append(windowed_attention_mask)

            return processed_input_ids, processed_attention_masks

        # Ensure random seed for reproducibility
        # random.seed(42)

        # Process train and test data
        train_input_ids, train_attention_mask = process_tokens(train_df['tokenized'])
        test_input_ids, test_attention_mask = process_tokens(test_df['tokenized'])

        # Convert to tensors
        train_input_ids = torch.tensor(train_input_ids)
        train_attention_mask = torch.tensor(train_attention_mask)
        train_labels = torch.tensor(train_df[label_column].values)

        test_input_ids = torch.tensor(test_input_ids)
        test_attention_mask = torch.tensor(test_attention_mask)
        test_labels = torch.tensor(test_df[label_column].values)

        # Create data loaders
        train_dataset = torch.utils.data.TensorDataset(
            train_input_ids,
            train_attention_mask,
            train_labels
        )
        test_dataset = torch.utils.data.TensorDataset(
            test_input_ids,
            test_attention_mask,
            test_labels
        )

        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=32,
            shuffle=False
        )
        test_loader = torch.utils.data.DataLoader(
            test_dataset,
            batch_size=32,
            shuffle=False
        )

        return train_loader, test_loader


    def compute_metrics(self, pred):
        """
        Compute evaluation metrics

        :param pred: Predictions from the model
        :return: Dictionary of metrics
        """
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)

        # Compute classification report
        report = classification_report(labels, preds, output_dict=True)

        return {
            'accuracy': report['accuracy'],
            'macro_f1': report['macro avg']['f1-score'],
            'weighted_f1': report['weighted avg']['f1-score']
        }

    def custom_loader_train(self, train_df, test_df, label_column, epochs=1, learning_rate=2e-5):
        """
        Custom training method with tqdm progress bars

        :param train_df: Training dataframe with tokenized column
        :param test_df: Test dataframe with tokenized column
        :param label_column: Name of the label column
        :param epochs: Number of training epochs
        :param learning_rate: Learning rate for optimizer
        """

        # test_input_ids = torch.tensor([x['input_ids'] for x in test_df['tokenized']])
        # test_attention_mask = torch.tensor([x['attention_mask'] for x in test_df['tokenized']])
        # test_labels = torch.tensor(test_df[label_column].values)

        # # Create data loaders

        # test_dataset = torch.utils.data.TensorDataset(
        #     test_input_ids,
        #     test_attention_mask,
        #     test_labels
        # )

        # test_loader = torch.utils.data.DataLoader(
        #     test_dataset,
        #     batch_size=32,
        #     shuffle=False
        # )



        # Prepare optimizer and loss
        optimizer = optim.AdamW(
            self.model.parameters(),
            lr=learning_rate
        )
        criterion = nn.CrossEntropyLoss()

        # Training loop with tqdm progress bars
        for epoch in range(epochs):
            # Training phase
            self.model.train()
            train_loss = 0
            train_correct = 0
            train_total = 0

            train_loader, test_loader = self.prepare_loaders_from_generals(train_df, test_df, label_column)


            # Wrap train_loader with tqdm for progress bar
            train_progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} (Training)", leave=False)
            for batch in train_progress_bar:
                # Unpack batch
                input_ids, attention_mask, labels = [b.to(self.device) for b in batch]

                # Zero gradients
                optimizer.zero_grad()

                # Forward pass
                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                loss = outputs.loss
                logits = outputs.logits

                # Backward pass
                loss.backward()
                optimizer.step()

                # Compute accuracy
                train_loss += loss.item()
                _, predicted = torch.max(logits, 1)
                train_total += labels.size(0)
                train_correct += (predicted == labels).sum().item()

                # Update progress bar
                train_progress_bar.set_postfix({
                    'Loss': f'{loss.item():.4f}',
                    'Accuracy': f'{100 * train_correct / train_total:.2f}%'
                })

            # Validation phase
            self.model.eval()
            val_loss = 0
            val_correct = 0
            val_total = 0
            all_preds = []
            all_labels = []

            # Wrap test_loader with tqdm for progress bar
            val_progress_bar = tqdm(test_loader, desc=f"Epoch {epoch+1}/{epochs} (Validation)", leave=False)
            with torch.no_grad():
                for batch in val_progress_bar:
                    input_ids, attention_mask, labels = [b.to(self.device) for b in batch]

                    outputs = self.model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=labels
                    )
                    loss = outputs.loss
                    logits = outputs.logits

                    val_loss += loss.item()
                    _, predicted = torch.max(logits, 1)
                    val_total += labels.size(0)
                    val_correct += (predicted == labels).sum().item()

                    all_preds.extend(predicted.cpu().numpy())
                    all_labels.extend(labels.cpu().numpy())

                    # Update progress bar
                    val_progress_bar.set_postfix({
                        'Loss': f'{loss.item():.4f}',
                        'Accuracy': f'{100 * val_correct / val_total:.2f}%'
                    })

            # Print epoch statistics
            # print(f"Epoch {epoch+1}/{epochs}")
            # print(f"Train Loss: {train_loss/len(train_loader):.4f}")
            # print(f"Train Accuracy: {100 * train_correct / train_total:.2f}%")
            # print(f"Val Loss: {val_loss/len(test_loader):.4f}")
            # print(f"Val Accuracy: {100 * val_correct / val_total:.2f}%")

            # Compute detailed classification report
            # report = classification_report(all_labels, all_preds)
            # print("Classification Report:")
            # print(report)

        return self.model

    def custom_train(self, train_df, test_df, label_column, epochs=1, learning_rate=2e-5):
        """
        Custom training method with tqdm progress bars

        :param train_df: Training dataframe with tokenized column
        :param test_df: Test dataframe with tokenized column
        :param label_column: Name of the label column
        :param epochs: Number of training epochs
        :param learning_rate: Learning rate for optimizer
        """
        # Prepare data tensors
        train_input_ids = torch.tensor([x['input_ids'] for x in train_df['tokenized']])
        train_attention_mask = torch.tensor([x['attention_mask'] for x in train_df['tokenized']])
        train_labels = torch.tensor(train_df[label_column].values)

        test_input_ids = torch.tensor([x['input_ids'] for x in test_df['tokenized']])
        test_attention_mask = torch.tensor([x['attention_mask'] for x in test_df['tokenized']])
        test_labels = torch.tensor(test_df[label_column].values)

        # Create data loaders
        train_dataset = torch.utils.data.TensorDataset(
            train_input_ids,
            train_attention_mask,
            train_labels
        )
        test_dataset = torch.utils.data.TensorDataset(
            test_input_ids,
            test_attention_mask,
            test_labels
        )

        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=32,
            shuffle=True
        )
        test_loader = torch.utils.data.DataLoader(
            test_dataset,
            batch_size=32,
            shuffle=False
        )

        # Prepare optimizer and loss
        optimizer = optim.AdamW(
            self.model.parameters(),
            lr=learning_rate
        )
        criterion = nn.CrossEntropyLoss()

        # Training loop with tqdm progress bars
        for epoch in range(epochs):
            # Training phase
            self.model.train()
            train_loss = 0
            train_correct = 0
            train_total = 0

            # Wrap train_loader with tqdm for progress bar
            train_progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} (Training)", leave=False)
            for batch in train_progress_bar:
                # Unpack batch
                input_ids, attention_mask, labels = [b.to(self.device) for b in batch]

                # Zero gradients
                optimizer.zero_grad()

                # Forward pass
                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                loss = outputs.loss
                logits = outputs.logits

                # Backward pass
                loss.backward()
                optimizer.step()

                # Compute accuracy
                train_loss += loss.item()
                _, predicted = torch.max(logits, 1)
                train_total += labels.size(0)
                train_correct += (predicted == labels).sum().item()

                # Update progress bar
                train_progress_bar.set_postfix({
                    'Loss': f'{loss.item():.4f}',
                    'Accuracy': f'{100 * train_correct / train_total:.2f}%'
                })

            # Validation phase
            self.model.eval()
            val_loss = 0
            val_correct = 0
            val_total = 0
            all_preds = []
            all_labels = []

            # Wrap test_loader with tqdm for progress bar
            val_progress_bar = tqdm(test_loader, desc=f"Epoch {epoch+1}/{epochs} (Validation)", leave=False)
            with torch.no_grad():
                for batch in val_progress_bar:
                    input_ids, attention_mask, labels = [b.to(self.device) for b in batch]

                    outputs = self.model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=labels
                    )
                    loss = outputs.loss
                    logits = outputs.logits

                    val_loss += loss.item()
                    _, predicted = torch.max(logits, 1)
                    val_total += labels.size(0)
                    val_correct += (predicted == labels).sum().item()

                    all_preds.extend(predicted.cpu().numpy())
                    all_labels.extend(labels.cpu().numpy())

                    # Update progress bar
                    val_progress_bar.set_postfix({
                        'Loss': f'{loss.item():.4f}',
                        'Accuracy': f'{100 * val_correct / val_total:.2f}%'
                    })

            # Print epoch statistics
            print(f"Epoch {epoch+1}/{epochs}")
            print(f"Train Loss: {train_loss/len(train_loader):.4f}")
            print(f"Train Accuracy: {100 * train_correct / train_total:.2f}%")
            print(f"Val Loss: {val_loss/len(test_loader):.4f}")
            print(f"Val Accuracy: {100 * val_correct / val_total:.2f}%")

            # Compute detailed classification report
            # report = classification_report(all_labels, all_preds)
            print("Classification Report:")
            # print(report)

        return self.model

    def predict(self, texts):
        """
        Make predictions on new texts

        :param texts: List of text strings to classify
        :return: Predictions and their probabilities
        """
        # Tokenize inputs
        inputs = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt'
        ).to(self.device)

        # Get model predictions
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(**inputs)
            probs = torch.softmax(outputs.logits, dim=1)
            predictions = torch.argmax(probs, dim=1)

        return predictions.cpu().numpy(), probs.cpu().numpy()

    def predict_iterate(self, texts):
        """
        Make predictions on new texts

        :param texts: List of text strings to classify
        :return: Predictions and their probabilities
        """
        # Tokenize inputs
        inputs = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt'
        ).to(self.device)

        # Get model predictions
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(**inputs)
            probs = torch.softmax(outputs.logits, dim=1)
            predictions = torch.argmax(probs, dim=1)

        return predictions.cpu().numpy(), probs.cpu().numpy()

In [280]:
# Initialize classifier
classifier = TweetSentenceClassifier(num_labels=2)  # Specify number of labels

# Prepare dataset (adjust column names as needed)
train_dataset, test_dataset = classifier.prepare_general_dataset(
    df_new,
    text_column='Tweet',
    label_column='EventType'
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [212]:
train_dataset['tokenized'].apply(lambda x: len(x['input_ids'])).max()

5502

In [281]:
# Train the model
model = classifier.custom_loader_train(train_dataset, test_dataset, 'EventType', 100)

Epoch 1/100 (Training):   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 1/100 (Validation):   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 2/100 (Training):   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 2/100 (Validation):   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 3/100 (Training):   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 3/100 (Validation):   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 4/100 (Training):   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 4/100 (Validation):   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 5/100 (Training):   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 5/100 (Validation):   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 6/100 (Training):   0%|          | 0/4 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [214]:
new_tweets = [
    "goal",
    "mex waj oawjd oawdwd wd wefe wfaw efawf aw daw d awd"
]

predictions, probabilities = classifier.predict(test_dataset['Tweet'][:20].to_list())

In [215]:
predictions, probabilities

(array([0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1]),
 array([[0.9780553 , 0.02194476],
        [0.96877444, 0.03122558],
        [0.8743157 , 0.12568432],
        [0.09169652, 0.9083035 ],
        [0.0523311 , 0.9476689 ],
        [0.9814751 , 0.01852493],
        [0.96742713, 0.03257286],
        [0.9553782 , 0.04462187],
        [0.3227036 , 0.6772964 ],
        [0.3083719 , 0.6916281 ],
        [0.0275135 , 0.9724865 ],
        [0.72343665, 0.27656338],
        [0.97234356, 0.02765646],
        [0.01550411, 0.9844959 ],
        [0.94575834, 0.05424168],
        [0.03905738, 0.9609426 ],
        [0.9782516 , 0.02174848],
        [0.07622375, 0.92377627],
        [0.92557085, 0.07442907],
        [0.03492583, 0.9650742 ]], dtype=float32))

In [73]:
test_dataset['Tweet'][:20].to_list(), test_dataset['EventType'][:20].to_list(),

(['fu vicinte del bosque playing david villa first two game esp worldcup espvsaus',
  'waited long esp lead cheeky backheel david villa',
  'day worldcup schedule groupa groupb ned v chi au v esp cro v mex cmr v bra',
  'thank david villa legend esp',
  'david villa emotional make last walk vivaspain vivalaroja esp',
  'esp finally winning aye',
  'au xl v esp ryan davidson spiranovic leckie taggart oar bozanic jedinak mckay mcgowan wilkinson',
  'villa still got simply great player esp',
  'surprised au playing less inspired esp',
  'thanks memory villa take bow esp',
  'congrats holland best group b net chi esp au',
  'esp v au may last game others red enjoy game',
  'still cheering esp regardless',
  'great sending absolutely legendary team esp',
  'esp doesnt even care',
  'trying watch au v esp ned v chi time go au go chi',
  'im watching au v esp game love watching australia play',
  'esp au nd half spain get play back underway wo',
  'villa better torres costadel bosque fail p e

In [217]:
classifier.predict(df_eval_new['Tweet'].to_list())

(array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([[0.9636142 , 0.03638574],
        [0.9755602 , 0.02443982],
        [0.9670141 , 0.032986  ],
        [0.6347902 , 0.36520982],
        [0.17263636, 0.82736367],
        [0.85785216, 0.14214784],
        [0.9308661 , 0.06913389],
        [0.971106  , 0.02889403],
        [0.9749442 , 0.02505585],
        [0.975608  , 0.02439205],
        [0.9704283 , 0.02957177],
        [0.96245944, 0.03754052],
        [0.9684733 , 0.03152668],
        [0.97529626, 0.0247037 ],
        [0.97431654, 0.02568352],
        [0.9677022 , 0.03229772],
        [0.97458

In [86]:
df_eval_new['EventType'] = classifier.predict(df_eval_new['Tweet'].to_list())[0]

In [218]:
df

Unnamed: 0,ID,Tweet,EventType
0,2_0,esp beat au well give away spain jersey one lu...,0
1,2_0,visit sitep official web site spanis real stat...,0
2,2_0,winner au v esp match well give away jersey en...,0
3,2_0,au beat esp well give away australia jersey on...,0
4,2_0,record iniesta spain game w l fcblive esp goal...,0
...,...,...,...
18569,2_129,stats australia spain ausesp worldcup groupb a...,1
18570,2_129,spain bahar ho gai na whatstill esp supporter,1
18571,2_129,mata scored goal doesnt mata esp,1
18572,2_129,two brilliant game football glad watched first...,1


In [261]:
new_df = df[df['EventType']==1]

In [265]:
new_df = new_df.reset_index()

In [269]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset

# Define the words to filter by
words = ['full time  goal half time kick off owngoal penalty red card yellow card']

# Load a pre-trained sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the words and tweets
word_embeddings = model.encode(words)
tweet_embeddings = model.encode(new_df['Tweet'])

# Calculate cosine similarity between each tweet and word
similarity_matrix = cosine_similarity(tweet_embeddings, word_embeddings)

# Filter tweets based on a similarity threshold
threshold = 0.1
filtered_indices = np.where(np.max(similarity_matrix, axis=1) >= threshold)[0]

filtered_tweets = new_df.iloc[filtered_indices]

Unnamed: 0,level_0,index,ID,Tweet,EventType
0,0,243,2_2,live world cup match au v esp itv net v chi it...,1
1,1,244,2_2,torres starting today oh esp,1
2,2,245,2_2,come au let give spain wooden spoon,1
3,3,246,2_2,today worldcup match au esp ned chi cmr bra cr...,1
4,4,247,2_2,represented attack today adam taggart au david...,1
...,...,...,...,...,...
13032,13032,18569,2_129,stats australia spain ausesp worldcup groupb a...,1
13033,13033,18570,2_129,spain bahar ho gai na whatstill esp supporter,1
13034,13034,18571,2_129,mata scored goal doesnt mata esp,1
13035,13035,18572,2_129,two brilliant game football glad watched first...,1


In [270]:
similarity_matrix

array([[0.26053518],
       [0.26729992],
       [0.12990883],
       ...,
       [0.30404204],
       [0.35741645],
       [0.37590963]], dtype=float32)

In [276]:
filtered_indices = np.where(np.max(similarity_matrix, axis=1) >= threshold)[0]

In [277]:
len(filtered_indices)

12363

In [278]:
filtered_tweets = new_df.iloc[filtered_indices]

print(filtered_tweets['Tweet'])

0        live world cup match au v esp itv net v chi it...
1                             torres starting today oh esp
2                      come au let give spain wooden spoon
3        today worldcup match au esp ned chi cmr bra cr...
4        represented attack today adam taggart au david...
                               ...                        
13032    stats australia spain ausesp worldcup groupb a...
13033        spain bahar ho gai na whatstill esp supporter
13034                     mata scored goal doesnt mata esp
13035    two brilliant game football glad watched first...
13036    end road au esp catching early flight back hom...
Name: Tweet, Length: 12363, dtype: object
