# Analyzing authorship of tweets from Rihanna and Katy Perry using Claude

In [19]:
!pip install tqdm



In [1]:
import os 
import sys
sys.path.append(os.path.abspath('..'))

In [3]:
import pickle
import pandas as pd
import numpy as np

In [2]:
from anthropic import Anthropic
from credentials import get_credentials_claude

API_KEY = get_credentials_claude()

client = Anthropic(api_key=API_KEY)


In [4]:
# Load data

# load training data
with open('./data/sampled_author_texts.pkl', 'rb') as f:
    training_data = pickle.load(f)

#load test data
with open('./data/twitter_test.pkl', 'rb') as f:
    test_data = pickle.load(f)

In [14]:
# Get a random tweet from the test set
random_tweet_text = test_data['content'].sample(n=1, random_state=42).values[0]
random_tweet_author = test_data.loc[test_data['content'] == random_tweet_text, 'author'].values[0]
print(random_tweet_text)
print(f"Author: {random_tweet_author}")


It's just beyond the vault. Discover room 7 of the #ANTIdiaRy at https://t.co/uJ0CLlhoaS https://t.co/povYOBn3Fm
Author: rihanna


In [12]:
# Create the template prompt

prompt = f"""
You are an expert in linguistic analysis and authorship attribution. Your task is to analyze authorship--i.e. idiolectal--markers that the \
texts reveal about their authors.


You will be provided with:
1. Training samples (tweets) from 2 different authors, Rihanna and Katy Perry
2. A new text whose author you need to identify between those two authors

Guideline for analysis:
- Just use state of the art techniques to attribute authorship

Rihanna's tweets:
{training_data['rihanna']}

Katy Perry's tweets:
{training_data['katyperry']}

New text to analyze:
{random_tweet_text}

Your answer should contain only the author's name and nothing else.

"""

In [10]:
# Create a completion function that will be used to query the model
def get_completion(prompt: str, max_tokens=5000) -> str:
    response = client.messages.create(
        model="claude-3-5-sonnet-20240620",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        )
    return response.content[0].text

In [13]:
first_prediction = get_completion(prompt)
print(first_prediction)

Rihanna


In [15]:
# Check if the prediction is accurate
# Get the actual author of the random tweet
actual_author = random_tweet_author.lower()

# Get the predicted author from the model's response
predicted_author = first_prediction.strip().lower()

# Check if the prediction matches the actual author
is_correct = predicted_author == actual_author

print(f"Actual author: {actual_author}")
print(f"Predicted author: {predicted_author}")
print(f"Prediction is correct: {is_correct}")

# Calculate accuracy (1 for correct, 0 for incorrect)
accuracy = 1 if is_correct else 0
print(f"Accuracy: {accuracy}")


Actual author: rihanna
Predicted author: rihanna
Prediction is correct: True
Accuracy: 1


In [21]:
from tqdm.notebook import tqdm

sampled_tweets = test_data.sample(n=50, random_state=42).reset_index(drop=True)

predicted_authors = []
actual_authors = []
invalid_outputs = []

for idx, row in tqdm(sampled_tweets.iterrows(), total=len(sampled_tweets), desc="Processing tweets"):
    tweet_text = row['content']
    actual_author = row['author']
    
    prompt = f"""
You are an expert in linguistic analysis and authorship attribution. Your task is to analyze authorship--i.e. idiolectal--markers that the \\
texts reveal about their authors.

You will be provided with:
1. Training samples (tweets) from 2 different authors, Rihanna and Katy Perry
2. A new text whose author you need to identify between those two authors

Guideline for analysis:
- Just use state of the art techniques to attribute authorship

Rihanna's tweets:
{training_data['rihanna']}

Katy Perry's tweets:
{training_data['katyperry']}

New text to analyze:
{tweet_text}

Your answer should contain only the author's name and nothing else.

"""
    prediction = get_completion(prompt).strip().lower()
    if prediction not in ['rihanna', 'katyperry']:
        invalid_outputs.append((idx, prediction, tweet_text))
    predicted_authors.append(prediction)
    actual_authors.append(actual_author.lower())

ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

In [None]:
# Calculate accuracy (only for valid outputs)
valid_indices = [i for i, pred in enumerate(predicted_authors) if pred in ['rihanna', 'katyperry']]
valid_preds = [predicted_authors[i] for i in valid_indices]
valid_actuals = [actual_authors[i] for i in valid_indices]

correct = sum(p == a for p, a in zip(valid_preds, valid_actuals))
total = len(valid_actuals)
accuracy = correct / total if total > 0 else 0

print(f"Accuracy (on valid outputs): {accuracy:.2%} ({correct}/{total})")
print(f"Number of invalid outputs: {len(invalid_outputs)}")

# Optionally, print invalid outputs
if invalid_outputs:
    print("\nInvalid outputs (index, prediction, tweet):")
    for idx, pred, tweet in invalid_outputs:
        print(f"Index {idx}: Prediction='{pred}' | Tweet='{tweet}'")