In [None]:
from transformers import AutoTokenizer
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
import re
import nltk
# Fetch the NLTK data used by preprocess_text: the 'stopwords' corpus for
# stopword filtering and 'wordnet' for WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Stopword set and lemmatizer shared by every preprocess_text call. They are
# built lazily on first use and then reused, so the stopword list is not
# reloaded for each individual tweet.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached (stopword set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        # Build both objects before touching the cache so a failure (e.g. a
        # missing NLTK corpus) never leaves a half-filled cache behind.
        stop_words = frozenset(nltk.corpus.stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _PREPROCESS_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """
    Preprocess text by lowercasing, removing punctuation, numbers, stopwords, and lemmatizing.

    Steps, in order: lowercase, drop every character that is neither a word
    character nor whitespace, drop digit runs, split on whitespace, remove
    English stopwords, lemmatize each remaining word with WordNetLemmatizer.

    Note: the punctuation pattern keeps word characters, and ``\\w`` includes
    the underscore, so underscores survive (e.g. in user handles).

    Args:
        text (str): Text to be preprocessed. ``None`` and float NaN (the value
            pandas uses for empty CSV cells) are treated as empty text.

    Returns:
        str: Preprocessed text, words joined by single spaces; '' when nothing remains.
    """
    # Missing value guard: NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Lowercasing
    text = text.lower()

    # Remove punctuation (anything that is not a word character or whitespace)
    text = re.sub(r'[^\w\s]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Tokenization
    words = text.split()
    if not words:
        # Nothing left to filter or lemmatize; skip loading the NLTK resources.
        return ''

    # Remove stopwords, then lemmatize the surviving words
    stop_words, lemmatizer = _get_preprocess_resources()
    return ' '.join(
        lemmatizer.lemmatize(word) for word in words if word not in stop_words
    )


In [None]:
def format_number_to_string(number):
    """
    Render a number as a zero-padded, three-character decimal string.

    Args:
        number (int): Value to format; must lie in the inclusive range 0..999.

    Returns:
        str: The number left-padded with zeros to width 3 (e.g. 7 -> "007").

    Raises:
        ValueError: If the number is outside the range 0 to 999.
    """
    # Happy path first: in-range values are formatted and returned directly.
    if 0 <= number <= 999:
        return format(number, "03d")

    raise ValueError("The number must be in the range 0 to 999.")

In [None]:
def process_csv(file_path, l, with_period_id, tokenizer=None):
    """
    Process a CSV file to extract and tokenize data.

    The tweets are cleaned with preprocess_text, optionally prefixed with the
    zero-padded PeriodID, and then encoded to token ids.

    Args:
        file_path (str): Path to the CSV file. It must contain the columns
            'PeriodID', 'EventType' and 'Tweet'.
        l (int): Desired length of token arrays for the 'Tweet' column; passed
            to the tokenizer as max_length with truncation and max-length padding.
        with_period_id (bool): Whether to include the 'PeriodID' in the tweet text.
        tokenizer: Optional pre-loaded tokenizer exposing ``encode``. When
            omitted, the "bert-base-uncased" tokenizer is loaded. Passing one
            avoids reloading it on every call.

    Returns:
        pd.DataFrame: Processed DataFrame with columns 'PeriodID', 'EventType', and 'Tweet',
        where 'Tweet' holds the token ids returned by the tokenizer.

    Raises:
        ValueError: If with_period_id is set and a PeriodID is outside 0..999
            (raised by format_number_to_string).
    """
    # Load the default tokenizer only when the caller did not supply one
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # Read the CSV file and keep only the required columns; the explicit copy
    # makes the result an independent frame before 'Tweet' is overwritten below
    df = pd.read_csv(file_path)[['PeriodID', 'EventType', 'Tweet']].copy()

    if with_period_id:
        # Preprocess text and concatenate with formatted PeriodID.
        # Iterating the two columns directly avoids building a Series per row.
        df['Tweet'] = [
            f"{format_number_to_string(period_id)} {preprocess_text(tweet)}"
            for period_id, tweet in zip(df['PeriodID'], df['Tweet'])
        ]
    else:
        # Apply preprocessing to each tweet
        df['Tweet'] = df['Tweet'].apply(preprocess_text)

    # Tokenize the 'Tweet' column and pad/truncate to length l
    def tokenize_tweet(tweet):
        return tokenizer.encode(
            tweet,
            truncation=True,
            padding="max_length",
            max_length=l,
            add_special_tokens=True,
        )

    df['Tweet'] = df['Tweet'].apply(tokenize_tweet)

    return df

In [None]:
# Load the raw match CSV as-is and preview its first rows to inspect the columns.
path = 'ArgentinaBelgium72.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,ID,MatchID,PeriodID,EventType,Timestamp,Tweet
0,11_0,11,0,0,1404575400000,RT @2014WorIdCup: Argentina vs Belgium\n\nWho ...
1,11_0,11,0,0,1404575400000,@elijahman_ time to focus on Belgium winning t...
2,11_0,11,0,0,1404575400000,RT @FIFAWorldCup: GLOBAL STADIUM: #Joinin with...
3,11_0,11,0,0,1404575400000,RT @CatholicNewsSvc: #PopeFrancis. Uh-oh. Arge...
4,11_0,11,0,0,1404575400000,RT @soccerdotcom: If he scores vs #BEL we'll a...


In [None]:
# Tokenize to length 128 with each tweet prefixed by its zero-padded PeriodID.
df_with_period_id = process_csv(path, 128, True)

In [None]:
# Display the tokenized frame (PeriodID-prefixed variant).
df_with_period_id

Unnamed: 0,PeriodID,EventType,Tweet
0,0,0,"[101, 2199, 19387, 24185, 14615, 15569, 5619, ..."
1,0,0,"[101, 2199, 14063, 2386, 1035, 2051, 3579, 570..."
2,0,0,"[101, 2199, 19387, 5713, 11108, 15569, 3795, 3..."
3,0,0,"[101, 2199, 19387, 3234, 2638, 9333, 2015, 254..."
4,0,0,"[101, 2199, 19387, 4715, 27364, 9006, 3556, 10..."
...,...,...,...
313798,129,0,"[101, 14378, 19387, 15868, 1035, 4380, 2034, 2..."
313799,129,0,"[101, 14378, 19387, 3329, 2100, 1035, 1035, 10..."
313800,129,0,"[101, 14378, 2066, 5223, 2406, 2505, 13599, 56..."
313801,129,0,"[101, 14378, 19387, 21183, 8718, 5280, 2229, 2..."


In [None]:
# Tokenize to length 128 using the cleaned tweet text only (no PeriodID prefix).
df_without_period_id = process_csv(path, 128, False)

In [None]:
# Display the tokenized frame (variant without the PeriodID prefix).
df_without_period_id

Unnamed: 0,PeriodID,EventType,Tweet
0,0,0,"[101, 19387, 24185, 14615, 15569, 5619, 1058, ..."
1,0,0,"[101, 14063, 2386, 1035, 2051, 3579, 5706, 304..."
2,0,0,"[101, 19387, 5713, 11108, 15569, 3795, 3346, 3..."
3,0,0,"[101, 19387, 3234, 2638, 9333, 2015, 25465, 48..."
4,0,0,"[101, 19387, 4715, 27364, 9006, 3556, 1058, 19..."
...,...,...,...
313798,129,0,"[101, 19387, 15868, 1035, 4380, 2034, 2051, 20..."
313799,129,0,"[101, 19387, 3329, 2100, 1035, 1035, 1035, 305..."
313800,129,0,"[101, 2066, 5223, 2406, 2505, 13599, 5619, 308..."
313801,129,0,"[101, 19387, 21183, 8718, 5280, 2229, 2401, 24..."
