In [1]:
# libraries for Twitter data collection
import pandas as pd
import tweepy
import time
from datetime import datetime, timedelta
import json
import os
import pickle
import sys

### First test: collecting recent tweets


In [2]:
# Twitter API configuration -- only the bearer token is loaded from the environment.
# The other credentials are kept for reference but are not read:
# API_KEY = os.getenv("TWITTER_API_KEY")
# API_SECRET = os.getenv("TWITTER_API_SECRET")
# ACCESS_TOKEN = os.getenv("TWITTER_ACCESS_TOKEN")
# ACCESS_TOKEN_SECRET = os.getenv("TWITTER_ACCESS_TOKEN_SECRET")
BEARER_TOKEN = os.getenv("TWITTER_BEARER_TOKEN")

# Report only whether a token is present; the value itself is never printed.
print(f"API credentials found: {bool(BEARER_TOKEN)}")

API credentials found: True


In [4]:
# Initialize Twitter API client
# Only the bearer token is supplied to tweepy.Client here.
client = tweepy.Client(bearer_token=BEARER_TOKEN)
# NOTE(review): this message is printed right after construction, before any
# request is sent, so it presumably does not prove the token is valid -- confirm.
print("Twitter client created successfully")

Twitter client created successfully


In [5]:
# Re-run the search with the query that worked well before.
search_query = 'Spotify "AI DJ" -#ai -#dj -is:retweet lang:en'

# Extra per-tweet fields requested alongside the default payload.
requested_tweet_fields = [
    "created_at",
    "public_metrics",
    "author_id",
    "lang",
    "context_annotations",
    "referenced_tweets",
]

tweets_available_new = client.search_recent_tweets(
    query=search_query,
    max_results=97,
    tweet_fields=requested_tweet_fields,
)


In [6]:
# Number of tweets in the search response
len(tweets_available_new.data)


97

In [7]:
# Preview the first five tweets, numbered from 1 and cut to 100 characters each.
for position, tweet in enumerate(tweets_available_new.data[:5], start=1):
    preview = tweet.text[:100]
    print(f"{position}. {preview}...")


1. @Spotify although it is in BETA your Ai DJ is F*#!&amp;ng awesome. Thank you. It's been a pleasure u...
2. We don’t talk more about @spotify AI DJ, it’s so good. https://t.co/YevBz2gur1...
3. @TomTalksCars have you hacked my spotify?!?!

The AI DJ chap just said "these songs will give you wi...
4. Spotify’s AI DJ now takes typed requests and speaks Spanish 

https://t.co/Z5cmyPePoa https://t.co/a...
5. @DiianaD_ @Omah_Lay according to Spotify Ai DJ 
 I’m a part of top 10% of your fans.. show love 💜...


### Exploratory data analysis (EDA)


In [8]:
# Inspect a single tweet object: its Python type and everything dir() lists on it.
first_tweet = tweets_available_new.data[0]
print(f"Tweet object type: {type(first_tweet)}")
print(f"Available attributes: {dir(first_tweet)}")


Tweet object type: <class 'tweepy.tweet.Tweet'>
Available attributes: ['__abstractmethods__', '__class__', '__class_getitem__', '__contains__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '_abc_impl', 'attachments', 'author_id', 'context_annotations', 'conversation_id', 'created_at', 'data', 'edit_controls', 'edit_history_tweet_ids', 'entities', 'geo', 'get', 'id', 'in_reply_to_user_id', 'items', 'keys', 'lang', 'non_public_metrics', 'organic_metrics', 'possibly_sensitive', 'promoted_metrics', 'public_metrics', 'referenced_tweets', 'reply_settings', 'source', 'text', 'values', 'withheld']


In [9]:
# Print the main fields of the first tweet, one "label: value" line per field.
# The text is cut to 160 characters and explicitly marked as truncated.
first_tweet_fields = [
    ("Tweet ID", first_tweet.id),
    ("Created at", first_tweet.created_at),
    ("Text", first_tweet.text[:160] + " [truncated]"),
    ("Author ID", first_tweet.author_id),
    ("Language", first_tweet.lang),
]
for label, value in first_tweet_fields:
    print(f"{label}: {value}")


Tweet ID: 1979608101427773832
Created at: 2025-10-18 17:58:45+00:00
Text: @Spotify although it is in BETA your Ai DJ is F*#!&amp;ng awesome. Thank you. It's been a pleasure using it. So much better then Smart Shuffle imo [truncated]
Author ID: 1172125489206611968
Language: en


In [10]:
# Show the full public-metrics mapping, then the like and retweet counts from it.
print(f"Public metrics: {first_tweet.public_metrics}")
print(f"Like count: {first_tweet.public_metrics['like_count']}")
print(f"Retweet count: {first_tweet.public_metrics['retweet_count']}")


Public metrics: {'retweet_count': 0, 'reply_count': 0, 'like_count': 0, 'quote_count': 0, 'bookmark_count': 0, 'impression_count': 4}
Like count: 0
Retweet count: 0


In [11]:
# Show the first two context annotations when the tweet carries any.
# A missing attribute and an empty value both take the "not available" path.
annotations = getattr(first_tweet, "context_annotations", None)
if not annotations:
    print("No context annotations available")
else:
    print("Context annotations:", annotations[:2])


Context annotations: [{'domain': {'id': '45', 'name': 'Brand Vertical', 'description': 'Top level entities that describe a Brands industry'}, 'entity': {'id': '781974597310615553', 'name': 'Entertainment'}}, {'domain': {'id': '30', 'name': 'Entities [Entity Service]', 'description': 'Entity Service top level domain, every item that is in Entity Service should be in this domain'}, 'entity': {'id': '781974597222535168', 'name': 'Online Services - Entertainment'}}]


In [12]:
# Check how many tweets we collected before converting them
len(tweets_available_new.data)


97

In [13]:
# Flatten each tweet into a plain dict: identifiers, text, timestamp and the
# three engagement counters pulled out of public_metrics.
tweets_list_new = [
    {
        "id": tweet.id,
        "text": tweet.text,
        "author_id": tweet.author_id,
        "created_at": tweet.created_at,
        "likes": tweet.public_metrics["like_count"],
        "retweets": tweet.public_metrics["retweet_count"],
        "replies": tweet.public_metrics["reply_count"],
    }
    for tweet in tweets_available_new.data
]


In [14]:
# Create DataFrame: one row per tweet dict, one column per dict key
df_new = pd.DataFrame(tweets_list_new)
# Display (rows, columns) as a quick sanity check
df_new.shape


(97, 7)

In [15]:
# Quick overview: first five rows of the freshly built frame
df_new.head()


Unnamed: 0,id,text,author_id,created_at,likes,retweets,replies
0,1979608101427773832,@Spotify although it is in BETA your Ai DJ is ...,1172125489206611968,2025-10-18 17:58:45+00:00,0,0,0
1,1979519452124426507,"We don’t talk more about @spotify AI DJ, it’s ...",1441064696967270437,2025-10-18 12:06:29+00:00,0,0,0
2,1979480353573089284,@TomTalksCars have you hacked my spotify?!?!\n...,380185975,2025-10-18 09:31:07+00:00,3,0,1
3,1979480074584764692,Spotify’s AI DJ now takes typed requests and s...,156568938,2025-10-18 09:30:01+00:00,0,0,0
4,1979407666834223276,@DiianaD_ @Omah_Lay according to Spotify Ai DJ...,890357228003356672,2025-10-18 04:42:18+00:00,0,0,0


In [None]:
# Save to CSV. The target folder is created first so that a fresh checkout
# (without data/extracted/) does not fail when the file is written.
extracted_csv_path = "data/extracted/spotify_ai_dj_new_tweets.csv"
os.makedirs(os.path.dirname(extracted_csv_path), exist_ok=True)
# index=False keeps the positional row index out of the file
df_new.to_csv(extracted_csv_path, index=False)


## Pre-processing for Model


In [None]:
# Test reading the CSV back
# NOTE(review): no parse_dates is passed, so created_at presumably comes back
# as plain strings rather than datetimes -- confirm before doing date arithmetic.
tweets_df = pd.read_csv("data/extracted/spotify_ai_dj_new_tweets.csv")
# Display (rows, columns) to compare against the frame that was saved
tweets_df.shape


(97, 7)

In [34]:
# Basic data exploration: frame shape, column names, and the smallest/largest
# created_at values as the covered date range.
created_at_values = tweets_df["created_at"]
print(f"Dataset shape: {tweets_df.shape}")
print(f"\nColumns: {tweets_df.columns.tolist()}")
print("\nDate range:")
print(f"From: {created_at_values.min()}")
print(f"To: {created_at_values.max()}")


Dataset shape: (97, 7)

Columns: ['id', 'text', 'author_id', 'created_at', 'likes', 'retweets', 'replies']

Date range:
From: 2025-10-13 13:44:28+00:00
To: 2025-10-18 17:58:45+00:00


In [35]:
# Engagement metrics overview: summary statistics for the three counters
tweets_df[["likes", "retweets", "replies"]].describe()


Unnamed: 0,likes,retweets,replies
count,97.0,97.0,97.0
mean,1.278351,0.195876,0.247423
std,3.944253,1.08627,0.59566
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,1.0,0.0,0.0
max,32.0,8.0,3.0


In [36]:
# Text length analysis
# Adds a text_length column (length of each tweet's text) to tweets_df in place.
tweets_df["text_length"] = tweets_df["text"].str.len()
# Summary statistics of the new column
tweets_df["text_length"].describe()


count     97.000000
mean     150.072165
std       84.444909
min       20.000000
25%       79.000000
50%      128.000000
75%      218.000000
max      320.000000
Name: text_length, dtype: float64

In [37]:
# Most engaging tweets: the five rows with the highest like counts,
# reduced to the text and the like/retweet counters.
most_liked = tweets_df.nlargest(n=5, columns="likes")
top_tweets = most_liked[["text", "likes", "retweets"]]
top_tweets


Unnamed: 0,text,likes,retweets
58,Premium members can now make requests in Spani...,32,8
67,I belong on my strange addiction for how much ...,17,0
95,YouTube Music is testing its answer to Spotify...,10,7
62,Spotify's AI DJ now takes text commands and ac...,9,1
27,I can't take Spotify's efforts here seriously ...,6,0


In [38]:
# Quick content check - full text of the first ten tweets as a plain list
tweets_df["text"].head(10).tolist()


["@Spotify although it is in BETA your Ai DJ is F*#!&amp;ng awesome. Thank you. It's been a pleasure using it. So much better then Smart Shuffle imo",
 'We don’t talk more about @spotify AI DJ, it’s so good. https://t.co/YevBz2gur1',
 '@TomTalksCars have you hacked my spotify?!?!\n\nThe AI DJ chap just said "these songs will give you wind in your hair vibes; of course, I don\'t know if you actually have hair"\n\nFML',
 'Spotify’s AI DJ now takes typed requests and speaks Spanish \n\nhttps://t.co/Z5cmyPePoa https://t.co/apJMbI8ILZ',
 '@DiianaD_ @Omah_Lay according to Spotify Ai DJ \n I’m a part of top 10% of your fans.. show love 💜',
 "I may unsubscribe from @Spotify, though it has taste truly superior to YouTube Music, because you can't turn the AI DJ voice off on the mix (which isn't a bad mix). Extremely annoying.",
 '👋 Hey folks, FooDriver here!\n🤖 Spotify is rocking the tech world with generative AI, enhancing user interaction with features like AI DJ and playlist creation based on

In [39]:
# Find tweets whose text contains "http" (case-insensitive substring match;
# rows with missing text count as "no URL").
contains_http = tweets_df["text"].str.contains("http", case=False, na=False)
url_tweets = tweets_df[contains_http]
print(f"Tweets with URLs: {len(url_tweets)} out of {len(tweets_df)}")


Tweets with URLs: 66 out of 97


In [40]:
# Sample tweets with URLs: text and like count of the first three matches
url_tweets[["text", "likes"]].head(3)


Unnamed: 0,text,likes
1,"We don’t talk more about @spotify AI DJ, it’s ...",0
3,Spotify’s AI DJ now takes typed requests and s...,0
6,"👋 Hey folks, FooDriver here!\n🤖 Spotify is roc...",1


In [41]:
# Function for cardiffnlp model preprocessing
def preprocess_for_sentiment(text):
    """Mask user mentions and links in ``text``.

    The text is split on single space characters only. Every piece that starts
    with "@" and is longer than one character becomes "@user"; every piece
    that starts with "http" becomes "http". All other pieces, and the spacing
    between them, are returned unchanged.

    Args:
        text: The raw tweet text (a ``str``).

    Returns:
        The text with mentions and links replaced by placeholders.
    """

    def _mask(piece):
        # Mentions first, then links -- the same order as the two checks run.
        if piece.startswith("@") and len(piece) > 1:
            piece = "@user"
        if piece.startswith("http"):
            piece = "http"
        return piece

    return " ".join(_mask(piece) for piece in text.split(" "))


In [None]:
# Preprocessing variant that keeps @spotify mentions, followed by a quick check on sample tweets
def preprocess_for_sentiment_v2(text):
    """Mask mentions and links in ``text`` while keeping @spotify recognisable.

    The text is split on runs of whitespace (spaces, tabs, newlines) and the
    whitespace itself is kept verbatim, so mentions and links that follow a
    line break are masked too. Each token is rewritten as follows:

    * a mention whose handle is exactly "spotify" (case-insensitive) becomes
      "@spotify"; other accounts that merely start with the same letters
      (for example "@SpotifyCares") are treated like any other user;
    * any other token that starts with "@" and is longer than one character
      becomes "@user";
    * a token that starts with "http" becomes "http";
    * everything else is left unchanged.

    As before, the whole token is replaced, so punctuation attached to a
    mention or link (e.g. "@spotify,") is dropped with it.

    Args:
        text: The raw tweet text (a ``str``).

    Returns:
        The text with mentions and links replaced by placeholders.
    """
    import re  # local import keeps this helper self-contained

    def _mask(token):
        if token.startswith("@") and len(token) > 1:
            # Read the handle itself (letters, digits, underscore) so that the
            # comparison is against the whole handle rather than a prefix.
            handle = re.match(r"@([A-Za-z0-9_]+)", token)
            if handle and handle.group(1).lower() == "spotify":
                return "@spotify"
            return "@user"
        if token.startswith("http"):
            return "http"
        return token

    # The capturing group makes re.split return the whitespace runs as well;
    # they pass through _mask untouched, so the original spacing is preserved.
    return "".join(_mask(piece) for piece in re.split(r"(\s+)", text))


# Show the first three tweets before and after preprocessing (80 characters each),
# with a blank line between examples.
sample_tweets = tweets_df["text"].head(3).tolist()
for raw_text in sample_tweets:
    processed_text = preprocess_for_sentiment_v2(raw_text)
    print(f"Original: {raw_text[:80]}...")
    print(f"Processed: {processed_text[:80]}...")
    print()


Original: @Spotify although it is in BETA your Ai DJ is F*#!&amp;ng awesome. Thank you. It...
Processed: @spotify although it is in BETA your Ai DJ is F*#!&amp;ng awesome. Thank you. It...

Original: We don’t talk more about @spotify AI DJ, it’s so good. https://t.co/YevBz2gur1...
Processed: We don’t talk more about @spotify AI DJ, it’s so good. http...

Original: @TomTalksCars have you hacked my spotify?!?!

The AI DJ chap just said "these so...
Processed: @user have you hacked my spotify?!?!

The AI DJ chap just said "these songs will...



In [43]:
# Apply preprocessing to all tweets
# The result goes into a new text_clean column; the original text column is kept as is.
tweets_df["text_clean"] = tweets_df["text"].apply(preprocess_for_sentiment_v2)


In [44]:
# Check preprocessing results: raw and cleaned text side by side for the first three rows
tweets_df[["text", "text_clean"]].head(3)


Unnamed: 0,text,text_clean
0,@Spotify although it is in BETA your Ai DJ is ...,@spotify although it is in BETA your Ai DJ is ...
1,"We don’t talk more about @spotify AI DJ, it’s ...","We don’t talk more about @spotify AI DJ, it’s ..."
2,@TomTalksCars have you hacked my spotify?!?!\n...,@user have you hacked my spotify?!?!\n\nThe AI...


In [None]:
# Save clean data for sentiment analysis. The target folder is created first
# so the write does not fail when data/cleaned/ is missing.
clean_csv_path = "data/cleaned/spotify_ai_dj_new_tweets_clean.csv"
os.makedirs(os.path.dirname(clean_csv_path), exist_ok=True)
# index=False keeps the positional row index out of the file
tweets_df.to_csv(clean_csv_path, index=False)
print("Saved clean dataset with", len(tweets_df), "tweets")


Saved clean dataset with 97 tweets


In [46]:
# Look at one full example (third row); repr() makes newlines and quotes visible.
example_row = tweets_df.iloc[2]
print(f"Original:\n{example_row['text']!r}")
print(f"\nCleaned:\n{example_row['text_clean']!r}")


Original:
'@TomTalksCars have you hacked my spotify?!?!\n\nThe AI DJ chap just said "these songs will give you wind in your hair vibes; of course, I don\'t know if you actually have hair"\n\nFML'

Cleaned:
'@user have you hacked my spotify?!?!\n\nThe AI DJ chap just said "these songs will give you wind in your hair vibes; of course, I don\'t know if you actually have hair"\n\nFML'


In [47]:
# Display the processed frame, including the added text_length and text_clean columns
tweets_df


Unnamed: 0,id,text,author_id,created_at,likes,retweets,replies,text_length,text_clean
0,1979608101427773832,@Spotify although it is in BETA your Ai DJ is ...,1172125489206611968,2025-10-18 17:58:45+00:00,0,0,0,146,@spotify although it is in BETA your Ai DJ is ...
1,1979519452124426507,"We don’t talk more about @spotify AI DJ, it’s ...",1441064696967270437,2025-10-18 12:06:29+00:00,0,0,0,78,"We don’t talk more about @spotify AI DJ, it’s ..."
2,1979480353573089284,@TomTalksCars have you hacked my spotify?!?!\n...,380185975,2025-10-18 09:31:07+00:00,3,0,1,178,@user have you hacked my spotify?!?!\n\nThe AI...
3,1979480074584764692,Spotify’s AI DJ now takes typed requests and s...,156568938,2025-10-18 09:30:01+00:00,0,0,0,109,Spotify’s AI DJ now takes typed requests and s...
4,1979407666834223276,@DiianaD_ @Omah_Lay according to Spotify Ai DJ...,890357228003356672,2025-10-18 04:42:18+00:00,0,0,0,97,@user @user according to Spotify Ai DJ \n I’m ...
...,...,...,...,...,...,...,...,...,...
92,1977851928999534748,who needs the spotify ai dj whej u could let m...,1365329219727802369,2025-10-13 21:40:21+00:00,4,0,2,63,who needs the spotify ai dj whej u could let m...
93,1977841187181433031,I have a fickle relationship with the Spotify ...,1976818041905115136,2025-10-13 20:57:40+00:00,0,0,0,52,I have a fickle relationship with the Spotify ...
94,1977759605934453059,YouTube Music is channeling Spotify’s AI DJ wi...,1688680889608265728,2025-10-13 15:33:29+00:00,0,0,0,146,YouTube Music is channeling Spotify’s AI DJ wi...
95,1977751212800737299,YouTube Music is testing its answer to Spotify...,35203319,2025-10-13 15:00:08+00:00,10,7,3,133,YouTube Music is testing its answer to Spotify...


In [None]:
# Load the previously cleaned tweets. This file is an input to the notebook:
# none of the cells shown here write data/cleaned/spotify_ai_dj_tweets_clean.csv.
old_tweets = pd.read_csv("data/cleaned/spotify_ai_dj_tweets_clean.csv")


In [54]:
# Fix column compatibility - add text_length to old_tweets if missing
if "text_length" not in old_tweets.columns:
    old_tweets["text_length"] = old_tweets["text"].str.len()

# Use pd.concat instead of deprecated append(), then keep a single row per
# tweet id so that collecting an overlapping time window again cannot count a
# tweet twice. keep="last" prefers the row coming from tweets_df (stacked last).
stacked_tweets = pd.concat([old_tweets, tweets_df], ignore_index=True)
all_tweets = stacked_tweets.drop_duplicates(subset="id", keep="last").reset_index(drop=True)

# Only mention duplicates when some were actually removed
duplicates_dropped = len(stacked_tweets) - len(all_tweets)
if duplicates_dropped:
    print(f"Dropped {duplicates_dropped} duplicate tweet id(s)")
print(f"Combined dataset shape: {all_tweets.shape}")
print(f"Old tweets: {len(old_tweets)}, New tweets: {len(tweets_df)}")
all_tweets.head()


Combined dataset shape: (144, 9)
Old tweets: 47, New tweets: 97


Unnamed: 0,id,text,author_id,created_at,likes,retweets,replies,text_clean,text_length
0,1963699578370220543,This playlist gives Spotify AI DJ💀💀,4913255457,2025-09-04 20:23:57+00:00,0,0,0,This playlist gives Spotify AI DJ💀💀,35
1,1963692075439849536,The Spotify AI DJ randomly played this today.....,60245577,2025-09-04 19:54:09+00:00,0,0,0,The Spotify AI DJ randomly played this today.....,126
2,1963646985556103628,@g0dbr34th3d Your playlist is amazing for real...,1495824184638906368,2025-09-04 16:54:58+00:00,0,0,0,@user Your playlist is amazing for real.\n\nWo...,317
3,1963624836846023034,Makes sense that this is why the Spotify AI DJ...,261881081,2025-09-04 15:26:58+00:00,6,1,1,Makes sense that this is why the Spotify AI DJ...,82
4,1963616655533093015,I’ve only just discovered Spotify AI Dj..,33827542,2025-09-04 14:54:27+00:00,1,0,0,I’ve only just discovered Spotify AI Dj..,41


In [57]:
# Order the combined tweets by created_at (ascending); rebinding the name
# instead of sorting in place keeps the cell safe to re-run.
all_tweets = all_tweets.sort_values(by="created_at")


In [58]:
# Write the combined, sorted tweets to a single CSV; index=False leaves the row index out.
all_tweets.to_csv("data/spotify_ai_dj_all_tweets.csv", index=False)
