# Notebook to organize training data for authorship analysis of tweets

In [27]:
import pickle
import pandas as pd

In [None]:
# Read the pickled training set from disk into `twitter_train`.
# NOTE(review): presumably a pandas DataFrame with 'author' and 'content'
# columns, since later cells index it that way — confirm against the data file.

with open('./data/twitter_train.pkl', 'rb') as train_file:
    twitter_train = pickle.load(train_file)


In [28]:
# Build one long training text per author: every tweet by that author,
# cast to str and glued together with a separator line between tweets.

marker = "\n---TWEET---\n"

author_texts = {
    name: marker.join(
        twitter_train.loc[twitter_train['author'] == name, 'content'].astype(str)
    )
    # unique() yields authors in order of first appearance, which fixes the dict order.
    for name in twitter_train['author'].unique()
}

In [15]:
# Eyeball the start of each concatenated corpus: the first 200 characters
# for at most the first five authors.
preview_items = list(author_texts.items())[:5]

for name, corpus in preview_items:
    snippet = corpus[:200]
    print(f"Author: {name}")
    print(snippet)
    print("\n")



Author: rihanna
“@RihrihFreak: I knew Rih would RT that..didn't she get slap for doing that to her late gran? Lol” first thing that popped in my head bruh
---TWEET---
Bitchez be like...."#nomakeup " http://t.co/6juBO


Author: katyperry
Amazon prime and slime or nah?
---TWEET---
@iammelissalynn happy birthday qewtie!!! 😘😘😘
---TWEET---
U also made Kitty Purry happy. “@essesfadaperry: @katyperry Ready! downloaded! I am now happy and ro




In [16]:
# Measure how large each author's full concatenated corpus is, in tokens.
import tiktoken

# Tokenizer that tiktoken maps to the "gpt-4o" model name; kept in `encoding`
# because the sampled-corpus token check further down reuses it.
encoding = tiktoken.encoding_for_model("gpt-4o")

for name in author_texts:
    token_count = len(encoding.encode(author_texts[name]))
    print(f"Author: {name}, Number of tokens: {token_count}")


Author: rihanna, Number of tokens: 69587
Author: katyperry, Number of tokens: 75595


In [29]:
# Tally how many training tweets belong to each author.
author_column = twitter_train['author']
tweet_counts = author_column.value_counts()
print(tweet_counts)


author
katyperry    2339
rihanna      2301
Name: count, dtype: int64


In [30]:
# Build a smaller per-author corpus for the prompt by drawing a random
# third (rounded up) of each author's tweets and joining them with the marker.

import numpy as np

marker = "\n---TWEET---\n"
sampled_author_texts = {}

for name in twitter_train['author'].unique():
    is_author = twitter_train['author'] == name
    pool = twitter_train.loc[is_author, 'content'].astype(str)
    # One third of the pool, rounded up; never request fewer than one tweet.
    one_third = int(np.ceil(len(pool) / 3))
    picked = pool.sample(n=max(1, one_third), random_state=42)  # fixed seed keeps the draw repeatable
    sampled_author_texts[name] = marker.join(picked)


In [31]:
# Measure the token size of each author's sampled corpus, using the
# `encoding` tokenizer created in the earlier token-count cell.
for name in sampled_author_texts:
    token_count = len(encoding.encode(sampled_author_texts[name]))
    print(f"Author: {name}, Number of tokens: {token_count}")


Author: rihanna, Number of tokens: 23499
Author: katyperry, Number of tokens: 24853


In [32]:
# Persist both corpora (full and sampled) as pickle files, full corpus first.
for target_path, payload in (
    ('./data/author_texts.pkl', author_texts),
    ('./data/sampled_author_texts.pkl', sampled_author_texts),
):
    with open(target_path, 'wb') as out_file:
        pickle.dump(payload, out_file)