In [None]:
# libraries for Twitter data collection
import pandas as pd
import tweepy
import time
from datetime import datetime, timedelta
import json
import os
import pickle
import sys

### First test run


In [None]:
# Twitter API configuration
# The bearer token is read from the environment so no secret lives in the notebook.
BEARER_TOKEN = os.environ.get("TWITTER_BEARER_TOKEN")

# Further credentials, currently disabled:
# API_KEY = os.getenv("TWITTER_API_KEY")
# API_SECRET = os.getenv("TWITTER_API_SECRET")
# ACCESS_TOKEN = os.getenv("TWITTER_ACCESS_TOKEN")
# ACCESS_TOKEN_SECRET = os.getenv("TWITTER_ACCESS_TOKEN_SECRET")

# Report only whether a token is present; the token itself is never printed.
print("API credentials found:", bool(BEARER_TOKEN))

In [None]:
# Initialize Twitter API client
# Stop here with a clear message when the token is missing or empty; otherwise
# the success message below would be printed for a client that cannot
# authenticate, and the problem would only show up at the first API request.
if not BEARER_TOKEN:
    raise RuntimeError(
        "TWITTER_BEARER_TOKEN is not set; export it before running this notebook."
    )
client = tweepy.Client(bearer_token=BEARER_TOKEN)
print("Twitter client created successfully")

In [None]:
# Re-run the query that worked well before.
# Query operators: the word Spotify plus the exact phrase "AI DJ", minus the
# #ai / #dj hashtags and retweets, English-language tweets only.
# Only a single page of results is requested here (no pagination).
tweets_available_new = client.search_recent_tweets(
    query='Spotify "AI DJ" -#ai -#dj -is:retweet lang:en',
    tweet_fields=[
        "created_at", "public_metrics", "author_id",
        "lang", "context_annotations", "referenced_tweets",
    ],
    max_results=97,
)


In [None]:
# Number of tweets returned by the search
len(tweets_available_new.data)


In [None]:
# Preview the first five tweets, numbered from 1, text cut to 100 characters
for position, tweet in enumerate(tweets_available_new.data[:5], start=1):
    print(f"{position}. {tweet.text[:100]}...")


### Exploratory data analysis (EDA)


In [None]:
# Look at the first returned tweet to learn what the tweet object offers
first_tweet = tweets_available_new.data[0]
for label, value in (
    ("Tweet object type:", type(first_tweet)),
    ("Available attributes:", dir(first_tweet)),
):
    print(label, value)


In [None]:
# Print the core fields of the first tweet, one "label value" line each.
# The text is cut to 160 characters and the " [truncated]" marker is always appended.
for label, value in (
    ("Tweet ID:", first_tweet.id),
    ("Created at:", first_tweet.created_at),
    ("Text:", first_tweet.text[:160] + " [truncated]"),
    ("Author ID:", first_tweet.author_id),
    ("Language:", first_tweet.lang),
):
    print(label, value)


In [None]:
# Inspect the engagement counters attached to the first tweet
metrics = first_tweet.public_metrics
print("Public metrics:", metrics)
print("Like count:", metrics["like_count"])
print("Retweet count:", metrics["retweet_count"])


In [None]:
# Show up to two context annotations when the tweet carries any.
# A missing attribute and an empty/None value are treated the same way.
annotations = getattr(first_tweet, "context_annotations", None)
if not annotations:
    print("No context annotations available")
else:
    print("Context annotations:", annotations[:2])


In [None]:
# Check data we collected: number of tweets in the search response
len(tweets_available_new.data)


In [None]:
# Flatten every tweet object into a plain dict (one record per tweet),
# pulling the three engagement counters out of public_metrics.
tweets_list_new = [
    {
        "id": tweet.id,
        "text": tweet.text,
        "author_id": tweet.author_id,
        "created_at": tweet.created_at,
        "likes": tweet.public_metrics["like_count"],
        "retweets": tweet.public_metrics["retweet_count"],
        "replies": tweet.public_metrics["reply_count"],
    }
    for tweet in tweets_available_new.data
]


In [None]:
# Create DataFrame: one row per collected tweet, one column per dict key
df_new = pd.DataFrame(tweets_list_new)
# (rows, columns) of the new frame
df_new.shape


In [None]:
# Quick overview: first rows of the freshly built frame
df_new.head()


In [None]:
# Save to CSV
# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists first (a fresh checkout may not have data/extracted yet).
os.makedirs("data/extracted", exist_ok=True)
df_new.to_csv("data/extracted/spotify_ai_dj_new_tweets.csv", index=False)


## Pre-processing for Model


In [None]:
# Test reading the CSV back: preprocessing starts from the file saved above.
# No parse_dates is passed, so created_at is not converted to datetimes here.
tweets_df = pd.read_csv("data/extracted/spotify_ai_dj_new_tweets.csv")
# (rows, columns) after the round trip
tweets_df.shape


In [None]:
# Basic data exploration: size, column names and the created_at span.
# NOTE(review): created_at was read from CSV without date parsing, so min/max
# compare the raw values - presumably ISO-formatted strings; confirm.
created_at_values = tweets_df["created_at"]
column_names = tweets_df.columns.tolist()
print("Dataset shape:", tweets_df.shape)
print("\nColumns:", column_names)
print("\nDate range:")
print("From:", created_at_values.min())
print("To:", created_at_values.max())


In [None]:
# Engagement metrics overview: summary statistics for likes, retweets and replies
tweets_df[["likes", "retweets", "replies"]].describe()


In [None]:
# Text length analysis
# Adds a text_length column (characters per tweet) to tweets_df itself, so the
# column is also present when tweets_df is written to CSV later on.
tweets_df["text_length"] = tweets_df["text"].str.len()
tweets_df["text_length"].describe()


In [None]:
# Most engaging tweets: the five rows with the highest like counts,
# reduced to the text and the two engagement columns of interest.
top_tweets = tweets_df[["text", "likes", "retweets"]].nlargest(n=5, columns="likes")
top_tweets


In [None]:
# Quick content check - text of the first 10 tweets as a plain Python list
tweets_df["text"].head(10).tolist()


In [None]:
# Check URLs in tweets
# A tweet counts as "has URL" when its text contains "http" anywhere
# (case-insensitive); rows with missing text count as no URL.
has_url = tweets_df["text"].str.contains("http", case=False, na=False)
url_tweets = tweets_df[has_url]
print(f"Tweets with URLs: {len(url_tweets)} out of {len(tweets_df)}")


In [None]:
# Sample tweets with URLs: text and like count of the first three matches
url_tweets[["text", "likes"]].head(3)


In [None]:
# Function for cardiffnlp model preprocessing
def preprocess_for_sentiment(text):
    """Mask user mentions and links in ``text``.

    The text is split on single spaces. A token that starts with "@" and is
    longer than one character becomes "@user"; a token that starts with
    "http" becomes "http"; every other token is kept as is. Tokens are
    rejoined with single spaces, so the original spacing is preserved.
    Tokens are only split on spaces, so a mention or link that follows a
    newline or tab is part of a larger token and stays unmasked.
    """

    def _mask(token):
        # A bare "@" is not a mention and stays untouched.
        if len(token) > 1 and token[0] == "@":
            return "@user"
        if token.startswith("http"):
            return "http"
        return token

    return " ".join(_mask(token) for token in text.split(" "))


In [None]:
# Test preprocessing on sample tweets
def _mention_handle(token):
    """Return the lowercase handle of an "@..." token, ignoring trailing text.

    Only ASCII letters, digits and "_" directly after the "@" count as part of
    the handle, so punctuation glued to a mention ("@Spotify," or
    "@spotify's") is not treated as part of the account name.
    """
    handle_chars = []
    for ch in token[1:]:
        if not ((ch.isascii() and ch.isalnum()) or ch == "_"):
            break
        handle_chars.append(ch)
    return "".join(handle_chars).lower()


def preprocess_for_sentiment_v2(text):
    """Mask mentions and links in ``text`` while keeping the @spotify mention.

    The text is split on single spaces and rejoined with single spaces, so the
    original spacing is preserved. Per token:

    * a mention of the Spotify account itself (handle equal to "spotify",
      case-insensitive, trailing punctuation ignored) becomes "@spotify";
    * any other token starting with "@" and longer than one character becomes
      "@user" - this includes different accounts whose handle merely starts
      with "spotify", such as "@SpotifyCares";
    * a token starting with "http" becomes "http";
    * everything else is kept unchanged.

    Args:
        text: The raw tweet text (a ``str``).

    Returns:
        The text with mentions and links replaced as described above.
    """
    new_text = []
    for t in text.split(" "):
        if t.startswith("@") and len(t) > 1:
            # Compare the complete handle rather than a prefix, so only the
            # real @spotify account escapes masking.
            is_spotify = _mention_handle(t) == "spotify"
            new_text.append("@spotify" if is_spotify else "@user")
        elif t.startswith("http"):
            new_text.append("http")
        else:
            new_text.append(t)
    return " ".join(new_text)


# Show the first three tweets before and after preprocessing (80 characters each)
sample_tweets = tweets_df["text"].head(3).tolist()
for original in sample_tweets:
    processed = preprocess_for_sentiment_v2(original)
    print(f"Original: {original[:80]}...")
    print(f"Processed: {processed[:80]}...")
    print()


In [None]:
# Apply preprocessing to all tweets: store the masked text in a new text_clean
# column, using the v2 preprocessor (the variant that keeps @spotify mentions)
tweets_df["text_clean"] = tweets_df["text"].apply(preprocess_for_sentiment_v2)


In [None]:
# Check preprocessing results: raw and cleaned text side by side, first three rows
tweets_df[["text", "text_clean"]].head(3)


In [None]:
# Save clean data for sentiment analysis
# DataFrame.to_csv does not create missing folders, so create the output
# directory first if it is not there yet.
os.makedirs("data/cleaned", exist_ok=True)
tweets_df.to_csv("data/cleaned/spotify_ai_dj_new_tweets_clean.csv", index=False)
print("Saved clean dataset with", len(tweets_df), "tweets")


In [None]:
# Inspect the third row in full; repr() makes newlines and other escapes visible
third_row = tweets_df.iloc[2]
print("Original:")
print(repr(third_row["text"]))
print("\nCleaned:")
print(repr(third_row["text_clean"]))


In [None]:
# Display the preprocessed frame, now including text_length and text_clean
tweets_df


In [None]:
# Load the previously cleaned tweet set that the new batch is combined with below
old_tweets = pd.read_csv("data/cleaned/spotify_ai_dj_tweets_clean.csv")


In [None]:
# Fix column compatibility - add text_length to old_tweets if missing
if "text_length" not in old_tweets.columns:
    old_tweets["text_length"] = old_tweets["text"].str.len()

# Use pd.concat instead of deprecated append()
all_tweets = pd.concat([old_tweets, tweets_df], ignore_index=True)

# The old and new collection windows can overlap, which would leave the same
# tweet in the combined set twice. Drop repeated ids and keep the last
# occurrence (rows from the new batch come last in the concat). Rows without
# an id are never treated as duplicates of each other.
if "id" in all_tweets.columns:
    is_repeat = all_tweets["id"].notna() & all_tweets.duplicated(
        subset="id", keep="last"
    )
    print(f"Duplicate tweets removed: {int(is_repeat.sum())}")
    all_tweets = all_tweets[~is_repeat].reset_index(drop=True)

print(f"Combined dataset shape: {all_tweets.shape}")
print(f"Old tweets: {len(old_tweets)}, New tweets: {len(tweets_df)}")
all_tweets.head()


In [None]:
# Order the combined tweets by their created_at value (ascending).
# NOTE(review): if created_at holds strings read from CSV, this is a lexical
# sort - confirm both sources use the same timestamp format.
all_tweets = all_tweets.sort_values(by="created_at")


In [None]:
# Write the combined dataset to one CSV (the DataFrame index is not written)
all_tweets.to_csv("data/spotify_ai_dj_all_tweets.csv", index=False)
