## Blog Authorship Corpus Exercise (Part 4)

This notebook will use the Kaggle blog dataset prepared in other notebook to assess whether Gemini can identify authorship

In [1]:
!pip install -q -U google-generativeai

In [1]:
import os
from credentials import get_credentials_gemini
import google.generativeai as genai
import pickle
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
API_KEY = get_credentials_gemini()
genai.configure(api_key=API_KEY)

In [3]:
def get_completion(prompt: str, model: str = "gemini-2.0-flash") -> str:
    '''
    Generate a completion for a given prompt using the Gemini API
    Args:
        prompt: The prompt to generate a completion for
        model: The model to use for the completion
    Returns:
        The completion for the prompt
    '''
    model = genai.GenerativeModel(model)
    response = model.generate_content(prompt)
    return response.text

In [4]:
#Load the data
with open('train_set.pkl', 'rb') as f:
    train_set = pickle.load(f)

with open('test_set.pkl', 'rb') as f:
    test_set = pickle.load(f)



In [5]:
test_set.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,text_count
0,589736,male,35,Technology,Aries,"05,August,2004","dude, i have no idea where that notebook is...",2294
1,589736,male,35,Technology,Aries,"05,August,2004","[ Mon Oct 06, 12:11:48 PM | hot chocolate ]...",2294
2,589736,male,35,Technology,Aries,"05,August,2004",Perhaps you can make notes to yourself duri...,2294
3,589736,male,35,Technology,Aries,"05,August,2004",very interesting but hard to keep in head a...,2294
4,589736,male,35,Technology,Aries,"05,August,2004",Small Picture - murder rap - lack of ide...,2294


In [6]:
# Get pairs or ids and texts to use them as variables in the prompt
train_pairs = train_set[['id', 'text']].to_dict(orient='records')

#get a list of author ids
author_ids = train_set['id'].unique()
len(author_ids)


10

In [7]:
# First select a random row from test_set
random_row = test_set.sample(n=1, random_state=42)  # random_state for reproducibility
random_text = random_row['text'].iloc[0]
random_id = random_row['id'].iloc[0]  # keep this to check if the model's prediction is correct

In [8]:
#Improved version (with LLM help)

prompt = f"""
You are an expert in linguistic analysis and authorship attribution. Your task is to analyze authorship--i.e. idiolectal--markers that the \
texts reveal about their authors.


You will be provided with:
1. Training samples from 10 different authors (2 texts per author)
2. A new text whose author you need to identify

Guideline for analysis:
- Just use state of the art techniques to attribute authorship


Here are the training samples:

<author_samples>
<author_id>
{train_pairs[0]['id']}
</author_id>
<texts>
{train_pairs[0]['text']}
{train_pairs[1]['text']}
</text>

<author_id>
{train_pairs[2]['id']}
</author_id>
<texts>
{train_pairs[2]['text']}
{train_pairs[3]['text']}
</texts>

<author_id>
{train_pairs[4]['id']}
</author_id>
<texts>
{train_pairs[4]['text']}
{train_pairs[5]['text']}
</texts>

<author_id>
{train_pairs[6]['id']}
</author_id>
<texts>
{train_pairs[6]['text']}
{train_pairs[7]['text']}
</texts>

<author_id>
{train_pairs[8]['id']}
</author_id>
<texts>
{train_pairs[8]['text']}
{train_pairs[9]['text']}
</texts>

<author_id>
{train_pairs[10]['id']}
</author_id>
<texts>
{train_pairs[10]['text']}
{train_pairs[11]['text']}
</texts>

<author_id>
{train_pairs[12]['id']}
</author_id>
<texts>
{train_pairs[12]['text']}
{train_pairs[13]['text']}
</texts>

<author_id>
{train_pairs[14]['id']}
</author_id>
<texts>
{train_pairs[14]['text']}
{train_pairs[15]['text']}
</texts>

<author_id>
{train_pairs[16]['id']}
</author_id>
<texts>
{train_pairs[16]['text']}
{train_pairs[17]['text']}
</texts>

<author_id>
{train_pairs[18]['id']}
</author_id>
<texts>
{train_pairs[18]['text']}
{train_pairs[19]['text']}
</texts>
</author_samples>

Now, analyze this new text and identify which author it most likely belongs to.

<new_text>
{random_text}
</new_text>


As an answer, give only the predicted id, no other text.
"""

In [9]:
# Testing
first_prediction = get_completion(prompt)
print(f'The predicted author id is: {first_prediction}')
print('--------------------------------')

# Check if the model's predicted author id exsists
author_ids = train_set['id'].unique()

# Try to convert the prediction directly to integer
try:
    predicted_id = int(first_prediction.strip())
except ValueError:
    print(f"Invalid prediction format: {first_prediction}")
    predicted_id = None

if predicted_id in author_ids:
    print(f"The predicted author id {predicted_id} exists in the training set.")
else:
    print(f"The predicted author id {predicted_id} does not exist in the training set.")

print('--------------------------------')
# Assess the accuracy of the model's prediction

if predicted_id == random_id:
    print("The model's prediction is correct.")
else:
    print("The model's prediction is incorrect.")


The predicted author id is: 1784456

--------------------------------
The predicted author id 1784456 exists in the training set.
--------------------------------
The model's prediction is incorrect.


In [None]:
# Fixed the quota problem using Gemini 2.0 flash

# Create an experiment with 100 predictions

# Initialize an empty list to store all predictions
all_predictions = []

# Loop through 100 iterations
for i in range(100):
    try:
        # Select a random row from test_set
        random_row = test_set.sample(n=1, random_state=i)  # Using i as random_state for reproducibility
        random_text = random_row['text'].iloc[0]
        random_id = random_row['id'].iloc[0]
        
        # Get the prediction from the model
        prediction = get_completion(prompt)
        
        # Check if prediction is empty
        if not prediction:
            print(f"Iteration {i}: Empty prediction received")
            continue
            
        # Try to convert the prediction to integer
        try:
            predicted_id = int(prediction)
        except ValueError:
            print(f"Iteration {i}: Invalid prediction format: {prediction}")
            continue
            
        # Store the results
        all_predictions.append({
            'actual_id': random_id,
            'predicted_id': predicted_id,
            'is_correct': random_id == predicted_id
        })
        
        # Print progress every 10 iterations
        if (i + 1) % 10 == 0:
            print(f"Completed {i + 1} iterations")
            
    except Exception as e:
        print(f"Iteration {i}: Error occurred - {str(e)}")
        continue



Completed 10 iterations
Completed 20 iterations
Completed 30 iterations
Completed 40 iterations
Completed 50 iterations
Completed 60 iterations
Completed 70 iterations
Completed 80 iterations
Completed 90 iterations
Completed 100 iterations


In [11]:
# Check if we have any predictions
if not all_predictions:
    print("No valid predictions were collected")
else:
    # Convert the list of predictions to a DataFrame
    predictions_df = pd.DataFrame(all_predictions)
    
    # Calculate and print overall accuracy
    accuracy = predictions_df['is_correct'].mean()
    print(f"\nOverall accuracy: {accuracy:.2%}")
    print(f"Total predictions: {len(predictions_df)}")
    print(f"Correct predictions: {predictions_df['is_correct'].sum()}")


Overall accuracy: 8.00%
Total predictions: 100
Correct predictions: 8


## Expected results from random guessing:
**Guessing any author**: 10.00%

**Guessing the modal author**: 17.95%
