In [26]:
from transformers import AutoTokenizer
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
import re
import nltk
import os
# Make sure the NLTK data used later in this notebook is available locally:
# the English stop-word list (read in preprocess_text) and the WordNet corpus
# that backs WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alena_khg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/alena_khg/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [27]:
_STOP_WORDS = None   # lazily-built frozenset of English stop words, shared by all calls
_LEMMATIZER = None   # lazily-built WordNetLemmatizer, shared by all calls


def _get_text_resources():
    """Return the shared (stop-word set, lemmatizer) pair, building each on first use."""
    global _STOP_WORDS, _LEMMATIZER
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(nltk.corpus.stopwords.words('english'))
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()
    return _STOP_WORDS, _LEMMATIZER


def preprocess_text(text):
    """
    Preprocess text by lowercasing, removing punctuation, numbers, stopwords, and lemmatizing.

    The stop-word set and the lemmatizer are created once and reused, because
    this function is applied to every row of the tweet data.

    Args:
        text (str): Text to be preprocessed. Any non-string value (e.g. None
            or a NaN produced by pandas for an empty cell) is treated as
            missing text.

    Returns:
        str: Preprocessed text: the surviving words joined by single spaces.
            An empty string is returned for missing input or when no word
            survives the cleaning steps.
    """
    # Missing values arrive as None / float NaN; there is nothing to clean.
    if not isinstance(text, str):
        return ''

    # Lowercasing
    text = text.lower()

    # Remove punctuation: drop every character that is neither a word
    # character nor whitespace (note: \w covers '_', so underscores are kept).
    text = re.sub(r'[^\w\s]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Tokenization (whitespace split)
    words = text.split()
    if not words:
        # Nothing left to filter or lemmatize; skip loading the NLTK resources.
        return ''

    # Remove stopwords, then lemmatize what is left
    stop_words, lemmatizer = _get_text_resources()
    return ' '.join(
        lemmatizer.lemmatize(word) for word in words if word not in stop_words
    )


In [28]:
def format_number_to_string(number):
    """
    Formats a number into a 3-character string, adding leading zeros if necessary.

    Whole-number floats such as 5.0 are accepted and formatted like the
    corresponding integer; pandas produces such values when an integer
    column is read as float.

    Args:
        number (int): A whole number between 0 and 999 (inclusive).

    Returns:
        str: A string of length 3.

    Raises:
        ValueError: If the number is outside 0..999 (this includes NaN) or
            is not a whole number.
    """
    # The range check comes first so NaN / infinity are rejected before int().
    if not (0 <= number <= 999):
        raise ValueError("The number must be in the range 0 to 999.")

    whole = int(number)
    if whole != number:
        raise ValueError("The number must be a whole number.")

    return f"{whole:03d}"

In [29]:
_TOKENIZER_CACHE = {}  # model name -> loaded tokenizer, reused across process_csv calls


def _get_tokenizer(model_name):
    """Load the tokenizer for `model_name` once and return the cached instance afterwards."""
    if model_name not in _TOKENIZER_CACHE:
        _TOKENIZER_CACHE[model_name] = AutoTokenizer.from_pretrained(model_name)
    return _TOKENIZER_CACHE[model_name]


def process_csv(file_path, l, with_period_id, with_event_type, model_name="bert-base-uncased"):
    """
    Process a CSV file to extract, preprocess and tokenize its tweets.

    Args:
        file_path (str): Path to the CSV file. It must contain the columns
            'PeriodID' and 'Tweet', plus 'EventType' when `with_event_type`
            is True.
        l (int): Desired length of token arrays for the 'Tweet' column
            (sequences are padded / truncated to exactly this length).
        with_period_id (bool): Whether to prepend the zero-padded 'PeriodID'
            to the preprocessed tweet text before tokenizing.
        with_event_type (bool): Whether the file has an 'EventType' column
            that should be kept in the result.
        model_name (str): Name of the pretrained tokenizer to load.
            Defaults to "bert-base-uncased".

    Returns:
        pd.DataFrame: Columns 'EventType' and 'Tweet' when `with_event_type`
            is True, otherwise only 'Tweet'. Each 'Tweet' value is a list of
            `l` token ids.
    """
    # Reuse one tokenizer instance instead of reloading it for every file.
    tokenizer = _get_tokenizer(model_name)

    # Read the CSV file and keep only the required columns. The .copy() makes
    # the selection an independent frame, so the column assignments below do
    # not trigger pandas' SettingWithCopyWarning.
    if with_event_type:
        required_columns = ['PeriodID', 'EventType', 'Tweet']
    else:
        required_columns = ['PeriodID', 'Tweet']
    df = pd.read_csv(file_path)[required_columns].copy()

    # Empty cells are read as NaN; treat them as empty text so preprocessing
    # always receives a string.
    raw_tweets = df['Tweet'].fillna('').astype(str)

    if with_period_id:
        # Preprocess text and concatenate with formatted PeriodID. Iterating
        # the two columns side by side avoids building a Series per row
        # (as DataFrame.apply(axis=1) does) and also works for an empty frame.
        df['Tweet'] = [
            f"{format_number_to_string(period_id)} {preprocess_text(tweet)}"
            for period_id, tweet in zip(df['PeriodID'], raw_tweets)
        ]
    else:
        # Apply preprocessing to each tweet
        df['Tweet'] = raw_tweets.apply(preprocess_text)

    # Tokenize the 'Tweet' column and pad/truncate to length l
    def tokenize_tweet(tweet):
        return tokenizer.encode(
            tweet,
            truncation=True,
            padding="max_length",
            max_length=l,
            add_special_tokens=True,
        )

    df['Tweet'] = df['Tweet'].apply(tokenize_tweet)

    # 'PeriodID' is only an input to the text; it is not part of the result.
    if with_event_type:
        return df[['EventType', 'Tweet']]
    return df[['Tweet']]

In [31]:
# Read all training files and concatenate them into one dataframe with Period Id in tweet tokens
train_dir = "../train_tweets"
li = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order reproducible.
for filename in sorted(os.listdir(train_dir)):
    file_path = os.path.join(train_dir, filename)
    # Skip hidden entries (e.g. .DS_Store, .ipynb_checkpoints) and anything that is not a regular file.
    if filename.startswith(".") or not os.path.isfile(file_path):
        continue
    li.append(process_csv(file_path, 128, True, True))
df_train_with_period_id = pd.concat(li, ignore_index=True)

In [32]:
# Display the concatenated training frame (pandas shows only the first and last rows).
df_train_with_period_id

Unnamed: 0,EventType,Tweet
0,0,"[101, 2199, 19387, 4715, 27364, 9006, 9686, 23..."
1,0,"[101, 2199, 3942, 2609, 2361, 2880, 4773, 2609..."
2,0,"[101, 2199, 19387, 4715, 27364, 9006, 9686, 23..."
3,0,"[101, 2199, 19387, 8484, 10085, 17119, 22231, ..."
4,0,"[101, 2199, 19387, 4715, 27364, 9006, 8740, 37..."
...,...,...
5056045,1,"[101, 14378, 19387, 4035, 20205, 5978, 2959, 2..."
5056046,1,"[101, 14378, 19387, 6788, 20205, 2015, 3915, 2..."
5056047,1,"[101, 14378, 8923, 2080, 2071, 4089, 3195, 312..."
5056048,1,"[101, 14378, 19387, 2122, 20844, 4263, 11097, ..."


In [33]:
# Read all training files and concatenate them into one dataframe without Period Id in tweet tokens
train_dir = "../train_tweets"
li = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order reproducible.
for filename in sorted(os.listdir(train_dir)):
    file_path = os.path.join(train_dir, filename)
    # Skip hidden entries (e.g. .DS_Store, .ipynb_checkpoints) and anything that is not a regular file.
    if filename.startswith(".") or not os.path.isfile(file_path):
        continue
    li.append(process_csv(file_path, 128, False, True))
df_train_without_period_id = pd.concat(li, ignore_index=True)

KeyboardInterrupt: 

In [34]:
# Read all evaluation files and concatenate them into one dataframe with Period Id in tweet tokens
eval_dir = "../eval_tweets"
li = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order reproducible.
for filename in sorted(os.listdir(eval_dir)):
    file_path = os.path.join(eval_dir, filename)
    # Skip hidden entries (e.g. .DS_Store, .ipynb_checkpoints) and anything that is not a regular file.
    if filename.startswith(".") or not os.path.isfile(file_path):
        continue
    li.append(process_csv(file_path, 128, True, False))
df_eval_with_period_id = pd.concat(li, ignore_index=True)

KeyboardInterrupt: 

In [35]:
# Read all evaluation files and concatenate them into one dataframe without Period Id in tweet tokens
eval_dir = "../eval_tweets"
li = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order reproducible.
for filename in sorted(os.listdir(eval_dir)):
    file_path = os.path.join(eval_dir, filename)
    # Skip hidden entries (e.g. .DS_Store, .ipynb_checkpoints) and anything that is not a regular file.
    if filename.startswith(".") or not os.path.isfile(file_path):
        continue
    li.append(process_csv(file_path, 128, False, False))
# Stored under its own name: the original assigned this result to
# df_eval_with_period_id, overwriting the frame that does include the Period Id.
df_eval_without_period_id = pd.concat(li, ignore_index=True)

KeyboardInterrupt: 