In [1]:
import pandas as pd
import os
from config import presidents, cities, countries
# Prefix for every pickle this notebook reads or writes, taken from the
# DATAPATH environment variable (os.environ[...] raises KeyError if it is unset).
# Later cells build file names by plain concatenation, e.g.
# f'{data_path}{president}.pkl', so include the trailing path separator when
# the value names a directory.
data_path = os.environ['DATAPATH']

### Bigrams

In [5]:
def calculate_bigram_similarity(bigrams1, bigrams2):
    """Similarity of two precomputed bigram multisets.

    Both arguments are used as ``collections.Counter``-like objects: they
    must support ``&`` (multiset intersection) and ``.values()``.

    Returns ``shared / (size1 + size2 - shared)`` where ``shared`` is the size
    of the multiset intersection, or ``0`` when that denominator is zero
    (both inputs empty).

    NOTE: a later cell in this notebook defines another function with the
    same name that takes raw text; once that cell runs it replaces this one.
    """
    shared = sum((bigrams1 & bigrams2).values())
    union_size = sum(bigrams1.values()) + sum(bigrams2.values()) - shared
    if not union_size:
        return 0
    return shared / union_size


In [6]:
from collections import Counter
def generate_word_bigrams(text):
    """Return the consecutive word pairs of ``text`` as a list of 2-tuples.

    Words are obtained with ``str.split()`` (any run of whitespace is a
    separator). Text with fewer than two words yields an empty list.
    """
    tokens = text.split()
    # Pair each token with its successor; zip stops at the shorter sequence,
    # so the final token never starts a pair.
    return list(zip(tokens, tokens[1:]))

def calculate_bigram_similarity(text1, text2, letter_bigrams=False):
    """Bigram-overlap similarity of two strings.

    Parameters
    ----------
    text1, text2 : str
        The strings to compare.
    letter_bigrams : bool, default False
        When true, compare overlapping two-character slices; otherwise compare
        consecutive word pairs produced by ``generate_word_bigrams``.

    Returns
    -------
    ``shared / (n1 + n2 - shared)`` where ``n1``/``n2`` are the bigram counts
    of each string (duplicates included) and ``shared`` is the size of their
    multiset intersection; ``0`` when neither string yields any bigram.
    """
    def _letter_pairs(s):
        # Every two-character window of the string.
        return [s[k:k + 2] for k in range(len(s) - 1)]

    tokenize = _letter_pairs if letter_bigrams else generate_word_bigrams

    counts_a = Counter(tokenize(text1))
    counts_b = Counter(tokenize(text2))

    # Counter & Counter keeps the minimum count per bigram.
    overlap = sum((counts_a & counts_b).values())
    union_size = sum(counts_a.values()) + sum(counts_b.values()) - overlap

    if union_size:
        return overlap / union_size
    return 0

In [7]:
def preprocess_text(text):
    """Normalise a piece of text for comparison by lower-casing it."""
    lowered = text.lower()
    return lowered

def _word_bigram_counts(text):
    """Return a Counter of the lower-cased consecutive word pairs of ``text``."""
    words = text.lower().split()
    return Counter(zip(words, words[1:]))


def _has_near_duplicate(bigram_counts, threshold):
    """Return True if any two bigram Counters have similarity above ``threshold``."""
    sizes = [sum(counts.values()) for counts in bigram_counts]
    for i in range(len(bigram_counts)):
        for j in range(i + 1, len(bigram_counts)):
            shared = sum((bigram_counts[i] & bigram_counts[j]).values())
            total = sizes[i] + sizes[j] - shared
            # Two tweets without any bigram (fewer than two words) score 0.
            similarity = shared / total if total else 0
            if similarity > threshold:
                return True
    return False


def filter_spam_tweets(data, threshold=0.8):
    """Drop every tweet of users who posted two near-identical tweets.

    Rows are grouped by ``'item_number'``. Within a group, tweets (column
    ``'text'``) are lower-cased and compared pairwise on their word bigrams;
    the score is ``shared / (n1 + n2 - shared)`` over the bigram multisets.
    If any pair scores strictly above ``threshold`` the whole group is
    removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'item_number'`` and ``'text'``.
    threshold : float, default 0.8
        Similarity above which two tweets count as duplicates.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` whose ``'item_number'`` was not flagged, with the
        original index and column order.
    """
    users_to_drop = set()

    for user_id, group in data.groupby('item_number'):
        # Tokenise and count each tweet once instead of once per compared pair.
        bigram_counts = [_word_bigram_counts(text) for text in group['text'].tolist()]
        if _has_near_duplicate(bigram_counts, threshold):
            users_to_drop.add(user_id)

    return data[~data['item_number'].isin(users_to_drop)]


Execution was manually stopped after 689m 39.8s because the runtime was too long.

In [None]:
# Run the spam filter over each president-wide pickle and each
# president/location pickle. Every result is written next to its input with a
# "-filtered" suffix, and the number of removed tweets is printed.
for president in presidents:
    # (report label, file stem) pairs: the president-wide file first, then one
    # entry per location. All of them share the same load -> filter -> save ->
    # report steps, so they are handled by a single inner loop.
    targets = [(president.capitalize(), president)]
    targets += [
        (f"{president.capitalize()} - {location}", f"{president}-{location}")
        for location in countries + cities
    ]
    for label, stem in targets:
        data = pd.read_pickle(f'{data_path}{stem}.pkl')
        data_filtered = filter_spam_tweets(data)
        data_filtered.to_pickle(f'{data_path}{stem}-filtered.pkl')
        removed = data.shape[0] - data_filtered.shape[0]
        print(f"{label}: {data.shape[0]} -> {data_filtered.shape[0]} | Filtered: {removed} Tweets")

###  User Tweet Count

In [2]:
def user_tweet_counts(president, user_col='item_number'):
    """Count tweets per user for one president's dataset.

    Reads ``f'{data_path}{president}.pkl'`` and counts rows per value of
    ``user_col``.

    Parameters
    ----------
    president : str
        File stem of the pickle to load.
    user_col : str, default 'item_number'
        Column identifying the user. The default matches the column used by
        the plotting helper and the high-activity filters in this notebook;
        pass ``user_col='user_id'`` to group by that column instead.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_col`` and ``'tweet_count'``, sorted by
        ``'tweet_count'`` in descending order.
    """
    data = pd.read_pickle(f'{data_path}{president}.pkl')
    return (
        data.groupby(user_col)
        .size()
        .reset_index(name='tweet_count')
        .sort_values('tweet_count', ascending=False)
    )

In [3]:
# Per-user tweet totals for the first two entries of `presidents`
# (presumably Trump and Johnson, going by the variable names - confirm in config).
trump_user_tweet_counts, johnson_user_tweet_counts = (
    user_tweet_counts(presidents[idx]) for idx in (0, 1)
)

In [10]:
# Show only the users with more than 10 tweets in total.
trump_user_tweet_counts.loc[trump_user_tweet_counts['tweet_count'].gt(10)]

Unnamed: 0,item_number,tweet_count
184303,226628,11


In [None]:
import matplotlib.pyplot as plt
def visualize_high_frequency_tweeters(tweet_counts):
    """Show a bar chart with one bar per user, bar height = tweet count.

    ``tweet_counts`` must provide the columns ``'item_number'`` (used as the
    x-axis category after conversion to ``str``) and ``'tweet_count'``.
    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    user_labels = tweet_counts['item_number'].astype(str)
    ax.bar(user_labels, tweet_counts['tweet_count'], color='skyblue')

    ax.set_xlabel('User ID')
    ax.set_ylabel('Tweet Count')
    ax.set_title('Tweet Counts per User')

    # Vertical, small tick labels so that many user ids stay legible.
    plt.setp(ax.get_xticklabels(), rotation=90, fontsize=8)
    # Re-fit the layout so the rotated labels are not clipped.
    fig.tight_layout()
    plt.show()

### High Frequency of Posts in a Day

In [None]:
def users_with_high_activity(president, daily_limit=10):
    """Return the users who tweeted more than ``daily_limit`` times on one day.

    Reads ``f'{data_path}{president}.pkl'``, buckets tweets by
    (``'item_number'``, calendar day of ``'date'``) and collects the
    ``'item_number'`` values of every bucket holding more than
    ``daily_limit`` rows.

    Parameters
    ----------
    president : str
        File stem of the pickle to load.
    daily_limit : int, default 10
        A user is reported when any single day exceeds this many tweets.

    Returns
    -------
    numpy.ndarray
        Unique ``'item_number'`` values, in order of first appearance among
        the rows belonging to an over-limit day.
    """
    data = pd.read_pickle(f'{data_path}{president}.pkl')
    # Reduce each timestamp to its calendar day (.dt.date drops the time part)
    # so that tweets are bucketed per user per day. 'item_number' is the user
    # identifier in this dataset.
    data['date'] = pd.to_datetime(data['date']).dt.date
    # Size of the (user, day) bucket each row belongs to, computed in one
    # vectorised pass instead of calling a Python lambda per bucket.
    tweets_that_day = data.groupby(['item_number', 'date'])['item_number'].transform('size')
    return data.loc[tweets_that_day > daily_limit, 'item_number'].unique()

In [None]:
# Collect the high-activity user ids for the first two entries of `presidents`.
trump_high_activity_users, johnson_high_activity_users = (
    users_with_high_activity(presidents[idx]) for idx in (0, 1)
)

In [None]:
def remove_high_activity_users_tweets(president, daily_limit=10):
    """Load one president's tweets and drop all tweets of high-activity users.

    A user is high-activity when any single calendar day holds more than
    ``daily_limit`` of their tweets. Every tweet of such a user is removed,
    not only those from the busy day.

    Parameters
    ----------
    president : str
        File stem; the data is read from ``f'{data_path}{president}.pkl'``.
    daily_limit : int, default 10
        Maximum number of tweets per user per day that is still tolerated.

    Returns
    -------
    pandas.DataFrame
        The remaining rows. The ``'date'`` column of the returned frame holds
        calendar dates only, because it is overwritten below.
    """
    data = pd.read_pickle(f'{data_path}{president}.pkl')
    # Reduce each timestamp to its calendar day. NOTE(review): this overwrites
    # the 'date' column, so the time of day is absent from the returned frame -
    # confirm that downstream steps do not need it.
    data['date'] = pd.to_datetime(data['date']).dt.date
    # Size of the (user, day) bucket each row belongs to, computed in one
    # vectorised pass instead of calling a Python lambda per bucket.
    tweets_that_day = data.groupby(['item_number', 'date'])['item_number'].transform('size')
    high_activity_users = data.loc[tweets_that_day > daily_limit, 'item_number'].unique()
    # Keep only the tweets of users that never exceeded the daily limit.
    return data[~data['item_number'].isin(high_activity_users)]

In [None]:
# Remove the tweets of high-activity users for the first two entries of
# `presidents`; both frames are computed before anything is written.
trump_filtered, johnson_filtered = (
    remove_high_activity_users_tweets(presidents[idx]) for idx in (0, 1)
)
# Persist the filtered frames.
# NOTE(review): if presidents[0] / presidents[1] are 'trump' / 'johnson', these
# paths are the same files the filter just read, so the unfiltered inputs are
# overwritten and re-running the cell filters already-filtered data - confirm
# this is intended.
trump_filtered.to_pickle(f'{data_path}trump.pkl')
johnson_filtered.to_pickle(f'{data_path}johnson.pkl')