## Preprocessing

In [31]:
import pandas as pd
import re, pathlib, string
from datetime import datetime

In [32]:
def segment_hashtag(hashtag):
    if hashtag.startswith('#'):
        hashtag = hashtag[1:]
    
    words = []
    current_word = hashtag[0].lower()
    
    for char in hashtag[1:]:
        if char.isupper():
            words.append(current_word)
            current_word = char.lower()
        else:
            current_word += char.lower()
    
    words.append(current_word)
    return words

def segment_hashtags_in_tweet(tweet):
    def replace_hashtag(match):
        hashtag = match.group(0)
        segmented = segment_hashtag(hashtag)
        return ' '.join(segmented)
    
    segmented_tweet = re.sub(r'#\w+', replace_hashtag, tweet)
    
    return segmented_tweet

In [33]:
def remove_offset(timestamp: str) -> str:
    # Convert to datetime object and remove the offset
    dt = datetime.fromisoformat(timestamp).replace(tzinfo=None)
    
    # Convert back to string without offset
    return dt.strftime("%Y-%m-%d %H:%M:%S")

def convert_tweet_date(date_string):
    date_string = date_string.strip()
    
    # Try different format strings
    formats = [
        "%I:%M:%S %p · %b %d, %Y",
        "%I:%M %p · %b %d, %Y",
        "%I:%M:%S %p · %b %d, %Y",
        "%I:%M %p · %B %d, %Y",  # Full month name
        "%H:%M:%S · %b %d, %Y",  # 24-hour format
        "%H:%M · %b %d, %Y",     # 24-hour format without seconds
    ]
    
    for fmt in formats:
        try:
            dt = datetime.strptime(date_string, fmt)
            print(f"Successful format: {fmt}")
            return dt
        except ValueError as e:
            print(f"Failed format: {fmt}")
            print(f"Error: {e}")
    
    raise ValueError(f"Unable to parse date string: {date_string}")


In [34]:
def clean_tweet(tweet): # If that fails, try parsing without seconds
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)
    
    tweet = tweet.encode('ascii', 'ignore').decode('ascii')

    tweet = re.sub(r'@\w+', '', tweet)
    
    emoticon_pattern = r'[:;=][oO\-]?[D\)\]\(\]/\\OpP]'
    tweet = re.sub(emoticon_pattern, '', tweet)
    
    tweet = segment_hashtags_in_tweet(tweet)

    return tweet

In [35]:
def trim_spaces_in_middle(text):
    trimmed_text = ' '.join(text.split())
    return trimmed_text

In [36]:

  
language = input('Language: ')
if language == 'en':
    dataset_path_input = "../dataset/english_gold_standard.csv"
    dataset_path_output = "../dataset/cleaned_english_tweets.csv"
elif language == 'it':
    dataset_path_input =  "../dataset/italian_gold_standard.csv"
    dataset_path_output = "../dataset/cleaned_italian_tweets.csv"
else:
    print("Language doesn't recognize")


df = pd.read_csv(dataset_path_input, sep=';')
df['cleaned_tweets'] = df['text'].apply(clean_tweet)
df['cleaned_tweets'] = df['cleaned_tweets'].apply(trim_spaces_in_middle)

    
df.drop(['text','Unnamed: 0'], axis=1, inplace=True)
if language == 'it':
    df['date'] = df['date'].apply(convert_tweet_date)c
elif language == 'en':
    df['date'] = df['date'].apply(remove_offset)
    
        
df.to_csv(dataset_path_output, sep=';')

Failed format: %I:%M:%S %p · %b %d, %Y
Error: time data '7:49 PM · Sep 9, 2024' does not match format '%I:%M:%S %p · %b %d, %Y'
Successful format: %I:%M %p · %b %d, %Y
Failed format: %I:%M:%S %p · %b %d, %Y
Error: time data '11:47 AM · Sep 2, 2024' does not match format '%I:%M:%S %p · %b %d, %Y'
Successful format: %I:%M %p · %b %d, %Y
Failed format: %I:%M:%S %p · %b %d, %Y
Error: time data '11:03 AM · Sep 3, 2024' does not match format '%I:%M:%S %p · %b %d, %Y'
Successful format: %I:%M %p · %b %d, %Y
Failed format: %I:%M:%S %p · %b %d, %Y
Error: time data '9:12 AM · Sep 10, 2024' does not match format '%I:%M:%S %p · %b %d, %Y'
Successful format: %I:%M %p · %b %d, %Y
Failed format: %I:%M:%S %p · %b %d, %Y
Error: time data '3:10 PM · Sep 1, 2024' does not match format '%I:%M:%S %p · %b %d, %Y'
Successful format: %I:%M %p · %b %d, %Y
Failed format: %I:%M:%S %p · %b %d, %Y
Error: time data '11:24 AM · Sep 3, 2024' does not match format '%I:%M:%S %p · %b %d, %Y'
Successful format: %I:%M %p 