# Notebook to organize Twitter data from the most-followed accounts (Kaggle dataset)

## For testing authorship verification

In [1]:
!pip install kagglehub



In [2]:
import kagglehub

# Kaggle handle of the tweets dataset used throughout this notebook
DATASET_HANDLE = "mmmarchetti/tweets-dataset"

# Fetch the latest version; `path` is the local location reported by kagglehub
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/mmmarchetti/tweets-dataset?dataset_version_number=1...


100%|██████████| 2.82M/2.82M [00:01<00:00, 1.78MB/s]

Extracting files...
Path to dataset files: /Users/joaopedropadua/.cache/kagglehub/datasets/mmmarchetti/tweets-dataset/versions/1





In [3]:
# Inspect what the download step placed on disk
import os
import pandas as pd

# Print every entry of the dataset directory, one per line
print("Files in the dataset:")
entry_names = os.listdir(path)
for entry_name in entry_names:
    print("-", entry_name)



Files in the dataset:
- tweets.csv


In [4]:
# Load the dataset's CSV file into `df`.
# os.listdir returns entries in arbitrary order, so the matches are sorted to make
# "the first CSV file" the same file on every run.
csv_files = sorted(f for f in os.listdir(path) if f.endswith('.csv'))
if not csv_files:
    # Fail here with a clear message instead of leaving `df` undefined, which
    # would only surface later as a NameError in the cells that use it.
    raise FileNotFoundError(f"No CSV file found in {path}")

print("\nReading the first CSV file:")
df = pd.read_csv(os.path.join(path, csv_files[0]))
print("\nFirst few rows of the dataset:")
display(df.head())
print("\nDataset shape:", df.shape)


Reading the first CSV file:

First few rows of the dataset:


Unnamed: 0,author,content,country,date_time,id,language,latitude,longitude,number_of_likes,number_of_shares
0,katyperry,Is history repeating itself...?#DONTNORMALIZEH...,,12/01/2017 19:52,8.19633e+17,en,,,7900,3472
1,katyperry,@barackobama Thank you for your incredible gra...,,11/01/2017 08:38,8.19101e+17,en,,,3689,1380
2,katyperry,Life goals. https://t.co/XIn1qKMKQl,,11/01/2017 02:52,8.19014e+17,en,,,10341,2387
3,katyperry,Me right now 🙏🏻 https://t.co/gW55C1wrwd,,11/01/2017 02:44,8.19012e+17,en,,,10774,2458
4,katyperry,SISTERS ARE DOIN' IT FOR THEMSELVES! 🙌🏻💪🏻❤️ ht...,,10/01/2017 05:22,8.18689e+17,en,,,17620,4655



Dataset shape: (52542, 10)


In [5]:
# A single groupby object serves both the tweet counts and the engagement statistics
tweets_by_author = df.groupby('author')

# Tweets per author, largest count first
author_counts = tweets_by_author.size().sort_values(ascending=False)
print("Number of tweets per author:")
display(author_counts)

# Mean / median / max of likes and shares for each author
summary_functions = ('mean', 'median', 'max')
author_stats = tweets_by_author.agg({
    metric: list(summary_functions)
    for metric in ('number_of_likes', 'number_of_shares')
})
print("\nStatistics per author:")
display(author_stats)

# The count series has one entry per author, so its length is the number of authors
print(f"\nTotal number of unique authors: {len(author_counts)}")


Number of tweets per author:


author
TheEllenShow     3147
jimmyfallon      3123
ArianaGrande     3104
YouTube          3077
KimKardashian    2939
katyperry        2924
selenagomez      2913
rihanna          2877
BarackObama      2863
britneyspears    2776
instagram        2577
shakira          2530
Cristiano        2507
jtimberlake      2478
ladygaga         2329
Twitter          2290
ddlovato         2217
taylorswift13    2029
justinbieber     2000
cnnbrk           1842
dtype: int64


Statistics per author:


Unnamed: 0_level_0,number_of_likes,number_of_likes,number_of_likes,number_of_shares,number_of_shares,number_of_shares
Unnamed: 0_level_1,mean,median,max,mean,median,max
author,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
ArianaGrande,25722.893363,21462.5,328838,11557.259665,9183.0,152045
BarackObama,3489.64897,1830.0,316831,1732.78065,795.0,200747
Cristiano,6978.824093,1475.0,198375,3726.740726,1546.0,112738
KimKardashian,11067.93263,5551.0,377769,3172.130316,966.0,219062
TheEllenShow,7004.186845,4216.0,149534,1830.59517,743.0,114404
Twitter,1005.248472,356.5,184768,699.413537,257.0,114311
YouTube,1271.899253,1028.0,51945,283.280143,166.0,28430
britneyspears,3053.735591,1295.5,46706,1709.209294,1131.5,29983
cnnbrk,807.735071,498.0,19638,624.132465,428.0,14779
ddlovato,17461.692377,15174.0,109032,10068.756879,8681.0,84421



Total number of unique authors: 20


In [6]:
# Restrict the data to the two accounts of interest
selected_authors = ['katyperry', 'rihanna']
is_selected = df['author'].isin(selected_authors)
katy_rihanna_df = df[is_selected]

# Report the size of the filtered frame
print(f"Shape of the Katy Perry and Rihanna dataframe: {katy_rihanna_df.shape}")

# Preview the first rows
print("\nFirst few rows of the Katy Perry and Rihanna dataframe:")
display(katy_rihanna_df.head())

# Tweets per author after filtering
print("\nNumber of tweets per author in the filtered dataframe:")
tweets_per_selected_author = katy_rihanna_df['author'].value_counts()
display(tweets_per_selected_author)


Shape of the Katy Perry and Rihanna dataframe: (5801, 10)

First few rows of the Katy Perry and Rihanna dataframe:


Unnamed: 0,author,content,country,date_time,id,language,latitude,longitude,number_of_likes,number_of_shares
0,katyperry,Is history repeating itself...?#DONTNORMALIZEH...,,12/01/2017 19:52,8.19633e+17,en,,,7900,3472
1,katyperry,@barackobama Thank you for your incredible gra...,,11/01/2017 08:38,8.19101e+17,en,,,3689,1380
2,katyperry,Life goals. https://t.co/XIn1qKMKQl,,11/01/2017 02:52,8.19014e+17,en,,,10341,2387
3,katyperry,Me right now 🙏🏻 https://t.co/gW55C1wrwd,,11/01/2017 02:44,8.19012e+17,en,,,10774,2458
4,katyperry,SISTERS ARE DOIN' IT FOR THEMSELVES! 🙌🏻💪🏻❤️ ht...,,10/01/2017 05:22,8.18689e+17,en,,,17620,4655



Number of tweets per author in the filtered dataframe:


author
katyperry    2924
rihanna      2877
Name: count, dtype: int64

In [8]:
import numpy as np

# Seed NumPy's global generator for reproducibility. The sample() calls below
# pass no random_state, so they draw from this global state.
np.random.seed(42)

# Fraction of each author's tweets that goes to the training set
# (the remainder goes to the test set)
TRAIN_FRACTION = 0.8

# Per-author pieces are collected in lists and concatenated once at the end,
# instead of repeatedly concatenating onto an initially empty DataFrame.
train_parts = []
test_parts = []

# Get unique authors
unique_authors = katy_rihanna_df['author'].unique()

# Split every author separately so both sets keep the same author proportions
for author in unique_authors:
    # Get all tweets by this author
    author_tweets = katy_rihanna_df[katy_rihanna_df['author'] == author]

    # Shuffle the tweets so the split does not depend on the original row order
    author_tweets = author_tweets.sample(frac=1)

    # Index of the first row that belongs to the test set
    split_idx = int(len(author_tweets) * TRAIN_FRACTION)

    train_parts.append(author_tweets.iloc[:split_idx])
    test_parts.append(author_tweets.iloc[split_idx:])

if not train_parts:
    # pd.concat raises an opaque error on an empty list; report the real cause
    raise ValueError("katy_rihanna_df contains no authors to split")

# Combine the per-author pieces, then shuffle so the authors are interleaved
train_df = pd.concat(train_parts).sample(frac=1).reset_index(drop=True)
test_df = pd.concat(test_parts).sample(frac=1).reset_index(drop=True)

# Display the shapes of the training and test sets
print(f"Training set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")

# Verify the distribution of authors in both sets
print("\nAuthor distribution in training set:")
display(train_df['author'].value_counts())

print("\nAuthor distribution in test set:")
display(test_df['author'].value_counts())



Training set shape: (4640, 10)
Test set shape: (1161, 10)

Author distribution in training set:


author
katyperry    2339
rihanna      2301
Name: count, dtype: int64


Author distribution in test set:


author
katyperry    585
rihanna      576
Name: count, dtype: int64

In [14]:
# Check that, per author, training + test counts add up to the counts before the split
original_counts = katy_rihanna_df['author'].value_counts()

# Series.add aligns on the author label; fill_value=0 keeps an author that is
# missing from one split from becoming NaN. Reindexing to the original label
# order is needed because `==` between two Series requires identical labels in
# identical order, while value_counts() orders labels by frequency.
total_counts = (
    train_df['author'].value_counts()
    .add(test_df['author'].value_counts(), fill_value=0)
    .reindex(original_counts.index, fill_value=0)
    .astype(original_counts.dtype)
)
print("Total number of tweets in the data set:")
display(total_counts)

print("\nIs the number of tweets in the data set after splitting equal to before splitting?")
display(total_counts == original_counts)

Total number of tweets in the data set:


author
katyperry    2924
rihanna      2877
Name: count, dtype: int64


Is the number of tweets in the data set after splitting equal to before splitting?


author
katyperry    True
rihanna      True
Name: count, dtype: bool

In [15]:
# Make sure the output folder is present before writing into it
os.makedirs('data', exist_ok=True)

# Persist both splits as pickle files
TRAIN_PICKLE = 'data/twitter_train.pkl'
TEST_PICKLE = 'data/twitter_test.pkl'
for split_frame, pickle_target in ((train_df, TRAIN_PICKLE), (test_df, TEST_PICKLE)):
    split_frame.to_pickle(pickle_target)

# Report where each split was written
print("Saved training set to", TRAIN_PICKLE)
print("Saved test set to", TEST_PICKLE)


Saved training set to data/twitter_train.pkl
Saved test set to data/twitter_test.pkl
