# Blog Authorship Corpus Exercise

This notebook works with the Blog Authorship Corpus dataset from Kaggle to experiment with LLMs and their potential for authorship attribution.

In [1]:
!pip install kaggle



In [2]:
!pip install kagglehub

Collecting kagglehub
  Downloading kagglehub-0.3.10-py3-none-any.whl.metadata (31 kB)
Collecting pyyaml (from kagglehub)
  Downloading PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl.metadata (2.1 kB)
Downloading kagglehub-0.3.10-py3-none-any.whl (63 kB)
Downloading PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl (183 kB)
Installing collected packages: pyyaml, kagglehub
Successfully installed kagglehub-0.3.10 pyyaml-6.0.2


In [2]:
import kagglehub

# Fetch the latest version of the corpus; `path` is the local directory that
# the following cells read the dataset files from.
dataset_handle = "rtatman/blog-authorship-corpus"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /Users/joaopedropadua/.cache/kagglehub/datasets/rtatman/blog-authorship-corpus/versions/2


In [3]:
# Let's examine the contents of the downloaded dataset
import os
import pandas as pd

# Print a bulleted listing of everything inside the downloaded dataset directory
print("Files in the dataset:")
dataset_entries = os.listdir(path)
for entry in dataset_entries:
    print(f"- {entry}")



Files in the dataset:
- blogtext.csv


In [4]:
# Locate the CSV file(s) in the dataset directory. The listing is sorted so that
# "the first CSV file" is the same file on every run (os.listdir order is arbitrary).
csv_files = sorted(f for f in os.listdir(path) if f.endswith('.csv'))
if not csv_files:
    # Every later cell needs `df`; fail here with a clear message instead of
    # letting a downstream cell die with a NameError.
    raise FileNotFoundError(f"No CSV file found in dataset directory: {path}")

print("\nReading the first CSV file:")
df = pd.read_csv(os.path.join(path, csv_files[0]))
print("\nFirst few rows of the dataset:")
display(df.head())
print("\nDataset shape:", df.shape)


Reading the first CSV file:

First few rows of the dataset:


Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...



Dataset shape: (681284, 7)


In [5]:
# Count how many texts each author (`id`) wrote, then order the authors from
# most to fewest texts. Written as a single chain so no intermediate state leaks.
author_counts = (
    df.groupby('id')
    .size()
    .reset_index(name='text_count')
    .sort_values(by='text_count', ascending=False)
)



In [6]:
# Show the first ten rows of the per-author text counts
author_counts.iloc[:10]

Unnamed: 0,id,text_count
180,449628,4221
410,734562,2301
280,589736,2294
1872,1975546,2261
635,958176,2244
784,1107146,2237
145,303162,2114
618,942828,2068
971,1270648,1951
1601,1784456,1843


In [7]:
# Retrieve only the top 10 authors (by number of texts) in a new dataframe
top_authors = author_counts.head(10)

# Keep only the texts written by those authors. The inner merge drops every
# other author and attaches each author's `text_count`. pd.merge returns a new
# frame and leaves `df` untouched, so no defensive copy of `df` is needed first.
df_top_authors = pd.merge(df, top_authors, on='id', how='inner')

df_top_authors.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,text_count
0,589736,male,35,Technology,Aries,"05,August,2004",Much funny. 2 points. As mentioned in the...,2294
1,589736,male,35,Technology,Aries,"05,August,2004","Harpers, Harpers, everywhere. Harpers, Har...",2294
2,589736,male,35,Technology,Aries,"05,August,2004","In an earlier post, Johnathan said: 'And ...",2294
3,589736,male,35,Technology,Aries,"05,August,2004","I'd post this on the RTG Blog, but I can't...",2294
4,589736,male,35,Technology,Aries,"05,August,2004",The answer to the first question lies with ...,2294


In [8]:
# Build a train set with two texts from each top author, and a test set with
# all remaining texts from those authors.
TEXTS_PER_AUTHOR = 2
SAMPLE_SEED = 42  # fixed seed so the same texts are drawn on every run

# Randomly select the texts for each author. The sampled frames keep the row
# labels of df_top_authors, which are needed below to exclude exactly these
# rows from the test set.
author_samples = [
    df_top_authors[df_top_authors['id'] == author_id].sample(
        n=TEXTS_PER_AUTHOR, random_state=SAMPLE_SEED
    )
    for author_id in top_authors['id']
]

# Concatenate once instead of growing a DataFrame inside a loop
train_rows = pd.concat(author_samples)

# Reset the index only on the final train set; `train_rows` keeps the original labels
train_set = train_rows.reset_index(drop=True)

# Display the first few rows of the train set
print("Train set shape:", train_set.shape)
print("\nSample of the train set (first few rows):")
display(train_set.head())

# Create a test set with the remaining texts.
# The exclusion must use the ORIGINAL row labels (train_rows.index). Using the
# re-indexed train_set (labels 0..19) would drop the first 20 rows of
# df_top_authors instead and leave the sampled training texts in the test set.
test_set = df_top_authors[~df_top_authors.index.isin(train_rows.index)]
test_set = test_set.reset_index(drop=True)

# The two splits must partition df_top_authors
assert len(train_set) + len(test_set) == len(df_top_authors)

print("\nTest set shape:", test_set.shape)




Train set shape: (20, 8)

Sample of the train set (first few rows):


Unnamed: 0,id,gender,age,topic,sign,date,text,text_count
0,449628,male,34,indUnk,Aries,"05,June,2003",urlLink A Day in the Country 20...,4221
1,449628,male,34,indUnk,Aries,"20,February,2003",urlLink DE Japan : Resources : Career...,4221
2,734562,female,24,Arts,Libra,"03,August,2004",You ain't fat! You ain't nothin'! ...,2301
3,734562,female,24,Arts,Libra,"03,August,2004",so no one was amused by the old JLS...,2301
4,589736,male,35,Technology,Aries,"05,August,2004",i'm sorry that i didn't let the gro...,2294



Test set shape: (23514, 8)


In [9]:
# Persist both splits as pickle files, which preserves the data types
split_files = {'train_set.pkl': train_set, 'test_set.pkl': test_set}
for filename, frame in split_files.items():
    frame.to_pickle(filename)

print("Train set also saved to 'train_set.pkl'")
print("Test set also saved to 'test_set.pkl'")


Train set also saved to 'train_set.pkl'
Test set also saved to 'test_set.pkl'
