In [4]:
from collections import Counter
from itertools import combinations

import numpy as np
import pandas as pd
from main import get_data
# Unpack the seven values returned by get_data(); later cells in this
# notebook read data_path, presidents, cities and countries.
root, data_path, presidents, cities, countries, years, colors = get_data()

In [5]:
def calculate_bigram_similarity(bigrams1, bigrams2):
    """Return the overlap ratio of two precomputed bigram count mappings.

    Both arguments must support ``&`` and ``.values()``; they are expected to
    be ``collections.Counter`` objects, for which ``&`` keeps the minimum
    count of every shared key.

    The result is ``shared / (size1 + size2 - shared)``, where each size is
    the sum of the counts. When that denominator is zero (neither side holds
    any bigram) the function returns ``0``.

    NOTE(review): a later cell defines ``calculate_bigram_similarity(text1,
    text2, letter_bigrams=False)`` under this same name, which replaces this
    version once that cell has run.
    """
    shared = sum((bigrams1 & bigrams2).values())
    union_size = sum(bigrams1.values()) + sum(bigrams2.values()) - shared
    if not union_size:
        return 0
    return shared / union_size


In [6]:
from collections import Counter
def generate_word_bigrams(text):
    """Return the list of adjacent word pairs in ``text``.

    The text is split on whitespace and every pair of consecutive words is
    emitted as a 2-tuple, in order. Texts with fewer than two words yield an
    empty list.
    """
    tokens = text.split()
    # Pairing the token list with itself shifted by one gives each
    # (word, next_word) combination exactly once.
    return list(zip(tokens, tokens[1:]))

def calculate_bigram_similarity(text1, text2, letter_bigrams=False):
    """Return the bigram overlap ratio of two strings.

    Args:
        text1: First string.
        text2: Second string.
        letter_bigrams: When true, compare overlapping two-character slices
            of each string; otherwise compare adjacent word pairs produced by
            ``generate_word_bigrams``.

    Returns:
        ``shared / (n1 + n2 - shared)``, where ``n1`` and ``n2`` are the
        bigram counts of each string and ``shared`` sums, per bigram, the
        smaller of its two occurrence counts. Returns ``0`` when neither
        string produces any bigram.
    """
    if letter_bigrams:
        def to_bigrams(text):
            # Every two-character window, including repeats.
            return [text[pos:pos + 2] for pos in range(len(text) - 1)]
    else:
        to_bigrams = generate_word_bigrams

    counts_a = Counter(to_bigrams(text1))
    counts_b = Counter(to_bigrams(text2))

    # Counter intersection keeps the minimum count of each shared bigram.
    overlap = sum((counts_a & counts_b).values())
    combined = sum(counts_a.values()) + sum(counts_b.values()) - overlap

    if not combined:
        return 0
    return overlap / combined

In [7]:
def preprocess_text(text):
    """Return ``text`` converted to lowercase."""
    return text.lower()


def _word_bigram_counts(text):
    """Count the adjacent word pairs of ``text`` (split on whitespace)."""
    words = text.split()
    return Counter(zip(words, words[1:]))


def _bigram_overlap(counts_a, counts_b):
    """Return shared / (total_a + total_b - shared) for two bigram Counters, or 0 if both are empty."""
    shared = sum((counts_a & counts_b).values())
    total = sum(counts_a.values()) + sum(counts_b.values()) - shared
    return shared / total if total else 0


def filter_spam_tweets(data, threshold=0.8):
    """Drop every ``item_number`` group that contains near-duplicate texts.

    Rows are grouped by the ``item_number`` column. Within a group, each
    ``text`` value is lowercased and reduced to its word-bigram counts. If any
    two texts in the group have a bigram overlap ratio strictly greater than
    ``threshold``, all rows of that group are removed from the result.

    Texts with fewer than two words produce no bigrams, so they have an
    overlap of 0 with everything and can never trigger removal.

    Args:
        data: DataFrame with at least the columns ``item_number`` and
            ``text``. ``text`` values must be strings (``.lower()`` is
            called on each).
            NOTE(review): the original code names the group key ``user_id``,
            so ``item_number`` presumably identifies the author; confirm.
        threshold: Overlap ratio above which two texts count as duplicates.
            Defaults to ``0.8``.

    Returns:
        The rows of ``data`` whose ``item_number`` was not flagged, with the
        original index and column layout.
    """
    flagged = set()

    for user_id, group in data.groupby('item_number'):
        # Tokenise and count each text once, instead of once per pair.
        profiles = [
            _word_bigram_counts(preprocess_text(text))
            for text in group['text'].tolist()
        ]
        # any() stops at the first pair over the threshold; combinations()
        # visits pairs in the same (i, j > i) order as a nested index loop.
        if any(
            _bigram_overlap(first, second) > threshold
            for first, second in combinations(profiles, 2)
        ):
            flagged.add(user_id)

    return data[~data['item_number'].isin(flagged)]


In [14]:
%%time
# For each president, filter the president-level pickle and then one pickle
# per location (countries followed by cities). Each filtered frame is written
# under the same data_path prefix with a "-filtered" suffix, and the row
# counts before and after filtering are printed.
# NOTE(review): pd.read_pickle can execute arbitrary code while loading, so
# only run this against pickle files from a trusted source.
for president in presidents:
    # President-level dataset: <data_path><president>.pkl
    data = pd.read_pickle(f'{data_path}{president}.pkl')
    data_filtered = filter_spam_tweets(data)
    data_filtered.to_pickle(f'{data_path}{president}-filtered.pkl')
    print(f"{president.capitalize()}: {data.shape[0]} -> {data_filtered.shape[0]} | Filtered: {data.shape[0] - data_filtered.shape[0]} Tweets")
    # Per-location subsets: <data_path><president>-<location>.pkl
    # `data` and `data_filtered` are rebound on every iteration.
    for location in countries + cities:
        data = pd.read_pickle(f'{data_path}{president}-{location}.pkl')
        data_filtered = filter_spam_tweets(data)
        data_filtered.to_pickle(f'{data_path}{president}-{location}-filtered.pkl')
        print(f"{president.capitalize()} - {location}: {data.shape[0]} -> {data_filtered.shape[0]} | Filtered: {data.shape[0] - data_filtered.shape[0]} Tweets")