In [1]:
import datetime
from spotipy import Spotify
from spotipy.oauth2 import SpotifyClientCredentials
import json
import pandas as pd 
import spotipy 
import time
from pydantic import BaseModel, Field ,TypeAdapter
from typing import List, Optional, Dict
import os 

In [2]:
CLIENT_ID =  os.getenv("SPOTIFY_CLIENT_ID") # Replace with your Spotify Client ID
CLIENT_SECRET = os.getenv("SPOTIFY_CLIENT_SECRET") 

In [3]:
client_credentials_manager = SpotifyClientCredentials(
    client_id=CLIENT_ID, client_secret=CLIENT_SECRET
)
sp = Spotify(client_credentials_manager=client_credentials_manager)

In [4]:
search_query = "Harry Potter"  # Example search query
search_response = sp.search(q=search_query, type='audiobook', limit=1)

In [5]:
year = "2023"


offset = 0
all_audiobooks=list()
limit=50
while offset < 1000:
    search_response = sp.search(
        q=f'year:{year}',
        type='audiobook',
        limit=limit,
        offset=offset
    )
    
    # Get the audiobooks from the search response
    audiobooks = search_response.get('audiobooks', {}).get('items', [])
    
    if not audiobooks:
        # If no more audiobooks are found, stop the loop
        break
    
    # Append the audiobooks to the list
    all_audiobooks.extend(audiobooks)
    
    # Increment the offset for the next request
    offset += limit
    
    # Sleep to prevent rate-limiting issues (if necessary)
    time.sleep(1)

# Now all_audiobooks contains all the audiobooks from the search
print(f"Total audiobooks fetched: {len(all_audiobooks)}")


Total audiobooks fetched: 1000


In [21]:
class Author(BaseModel):
    name: str
    
class Copyrights(BaseModel):
    text: Optional[str] = None 
    type: Optional[str] = None 


class External_URL(BaseModel):
    spotify: str
    
class Images(BaseModel):
    url: Optional[str] = None 
    width: Optional[int] = None
    height: Optional[int] = None
    
class Narrators(BaseModel):
    name: Optional[str] = None     
    
    

class Audiobook(BaseModel):
    id: str
    description: Optional[str] = None
    explicit: Optional[bool] = None
    authors: List[Author]
    available_markets: Optional[List[str]] = None
    copyrights: Optional[List[Copyrights]]
    html_description: Optional[str] = None
    edition: Optional[str] = None
    external_urls: Optional[External_URL]
    href: Optional[str] = None
    images: List[Images]
    languages: Optional[List[str]] = None
    media_type: Optional[str] = None       
    name: Optional[str] = None
    narrators: Optional[List[Narrators]]
    publisher: Optional[str] = None
    type: Optional[str] = None
    uri: Optional[str] = None
    total_chapters: Optional[int] = None
    
class FlattenedAuthorResponse(BaseModel):
    id: str
    description: Optional[str] = None
    explicit: Optional[bool] = None
    author_name: str  # Flattened author name into a separate field  
    available_market: Optional[str] = None 
    copyright_text: Optional[str] = None
    copyright_type: Optional[str] = None
    html_description: Optional[str] = None
    edition: Optional[str] = None
    spotify_external_url: Optional[str] = None
    href: Optional[str] = None
    image_url: Optional[str] = None
    image_height: Optional[int] = None
    image_width: Optional[int] = None
    languages: Optional[str] = None
    media_type: Optional[str] = None
    book_name: Optional[str] = None
    narrator_name: Optional[str] = None
    book_type: Optional[str] = None
    uri: Optional[str] = None
    total_chapters: Optional[int] = None
    
class FlattenedAuthorResponse(BaseModel):
    id: str
    description: Optional[str] = None
    explicit: Optional[bool] = None
    author_name: str  # Flattened author name into a separate field  
    available_market: Optional[str] = None 
    copyright_text: Optional[str] = None
    copyright_type: Optional[str] = None
    html_description: Optional[str] = None
    edition: Optional[str] = None
    spotify_external_url: Optional[str] = None
    href: Optional[str] = None
    image_url: Optional[str] = None
    image_height: Optional[int] = None
    image_width: Optional[int] = None
    languages: Optional[str] = None
    media_type: Optional[str] = None
    book_name: Optional[str] = None
    narrator_name: Optional[str] = None
    publisher: Optional[str] = None
    book_type: Optional[str] = None
    book_uri: Optional[str] = None
    total_chapters: Optional[int] = None    
    
class AudiobooksResponse(BaseModel):
    audiobooks: List[Audiobook]
    
    def flatten(self) -> List[FlattenedAuthorResponse]:
        """Flatten the authors into separate rows."""
        flattened_data = []
        flattened_data = [
            FlattenedAuthorResponse(
                id=audiobook.id,
                author_name=author.name,
                available_market=market,
                copyright_text=copyright.text,
                copyright_type=copyright.type,
                description=audiobook.description,
                html_description=audiobook.html_description,
                explicit=audiobook.explicit,
                edition=audiobook.edition,
                spotify_external_url=external_url[1],
                href=audiobook.href,
                image_url=images.url,
                image_width=images.width,
                image_height=images.height,
                languages=languages,
                media_type=audiobook.media_type,
                book_name=audiobook.name,
                narrator_name=narrator_name.name,
                publisher=audiobook.publisher,
                book_type= audiobook.type,
                book_uri= audiobook.uri,
                total_chapters= audiobook.total_chapters
            )
        for audiobook in self.audiobooks
        for copyright in audiobook.copyrights
        for market in audiobook.available_markets
        for author in audiobook.authors
        for external_url in audiobook.external_urls
        for images in audiobook.images
        for languages in audiobook.languages
        for narrator_name in audiobook.narrators
        ]
        return flattened_data 
    

In [22]:
audiobook_list_adapter = TypeAdapter(List[Audiobook])
validated_audiobook = audiobook_list_adapter.validate_python(all_audiobooks)


In [23]:
type(validated_audiobook)

list

In [24]:
validated_audiobook[0]

Audiobook(id='4dPFQsOtZwaKNjuaMOtZiD', description='Author(s): Asako Yuzuki\nNarrator(s): Hanako Footman\n\n<p><strong>WINNER OF WATERSTONES BOOK OF THE YEAR 2024 </strong></p><p><strong>A BBC \'BETWEEN THE COVERS\' BOOK CLUB PICK</strong></p><p><strong>\'Compelling, delightfully weird, often uncomfortable\' </strong>PANDORA SYKES</p><p><strong>\'Unputdownable, breathtakingly original\' </strong>ERIN KELLY</p><p><strong>\'I have been glued to Asako Yuzuki\'s new novel Butter</strong>’ NIGEL SLATER</p><p><strong>‘A full-fat, Michelin-starred treat’ </strong><em>THE SUNDAY TIMES</em></p><p><strong>The cult Japanese bestseller about a female gourmet cook and serial killer and the journalist intent on cracking her case, inspired by a true story, and translated by Polly Barton.</strong></p><p><em>There are two things that I can simply not tolerate: feminists and margarine.</em></p><p>Gourmet cook Manako Kajii sits in Tokyo Detention Centre convicted of the serial murders of lonely businessm

In [25]:
df_audiobooks_dense = pd.DataFrame([audiobook.model_dump() for audiobook in validated_audiobook])
df_audiobooks_dense.head(n=3)

Unnamed: 0,id,description,explicit,authors,available_markets,copyrights,html_description,edition,external_urls,href,images,languages,media_type,name,narrators,publisher,type,uri,total_chapters
0,4dPFQsOtZwaKNjuaMOtZiD,Author(s): Asako Yuzuki\nNarrator(s): Hanako F...,False,[{'name': 'Asako Yuzuki'}],"[AU, BE, FR, GB, IE, LU, NL, NZ]",[],Author(s): Asako Yuzuki<br/>Narrator(s): Hanak...,Unabridged,{'spotify': 'https://open.spotify.com/show/4dP...,https://api.spotify.com/v1/audiobooks/4dPFQsOt...,[{'url': 'https://i.scdn.co/image/ab6766630000...,[en],audio,Butter: WINNER OF WATERSTONES BOOK OF THE YEAR...,[{'name': 'Hanako Footman'}],Asako Yuzuki,audiobook,spotify:show:4dPFQsOtZwaKNjuaMOtZiD,
1,4ez5NQopq7PCIFjdtCtLPu,Author(s): Rebecca Yarros\nNarrator(s): Rebecc...,False,[{'name': 'Rebecca Yarros'}],"[AU, CA, GB, IE, NZ, PR, US]","[{'text': 'Recorded Books 2023', 'type': 'C'}]",Author(s): Rebecca Yarros<br/>Narrator(s): Reb...,Unabridged,{'spotify': 'https://open.spotify.com/show/4ez...,https://api.spotify.com/v1/audiobooks/4ez5NQop...,[{'url': 'https://i.scdn.co/image/ab6766630000...,[en],audio,Fourth Wing,"[{'name': 'Rebecca Soler'}, {'name': 'Teddy Ha...",Rebecca Yarros,audiobook,spotify:show:4ez5NQopq7PCIFjdtCtLPu,
2,4QIrNxmBxnpYE68oAuAfM2,Author(s): Ashley Poston\nNarrator(s): Brittan...,False,[{'name': 'Ashley Poston'}],"[AU, BE, FR, GB, IE, LU, NL, NZ]",[],Author(s): Ashley Poston<br/>Narrator(s): Brit...,Unabridged,{'spotify': 'https://open.spotify.com/show/4QI...,https://api.spotify.com/v1/audiobooks/4QIrNxmB...,[{'url': 'https://i.scdn.co/image/ab6766630000...,[en],audio,The Seven Year Slip,[{'name': 'Brittany Pressley'}],Ashley Poston,audiobook,spotify:show:4QIrNxmBxnpYE68oAuAfM2,


In [26]:
response = AudiobooksResponse(audiobooks=validated_audiobook)

In [27]:
flattened_data = response.flatten()

In [28]:
# Print parsed objects
df = pd.DataFrame([audiobook.model_dump() for audiobook in flattened_data])

In [29]:
len(df)

6834

In [30]:
df.head()

Unnamed: 0,id,description,explicit,author_name,available_market,copyright_text,copyright_type,html_description,edition,spotify_external_url,...,image_height,image_width,languages,media_type,book_name,narrator_name,publisher,book_type,book_uri,total_chapters
0,4ez5NQopq7PCIFjdtCtLPu,Author(s): Rebecca Yarros\nNarrator(s): Rebecc...,False,Rebecca Yarros,AU,Recorded Books 2023,C,Author(s): Rebecca Yarros<br/>Narrator(s): Reb...,Unabridged,https://open.spotify.com/show/4ez5NQopq7PCIFjd...,...,640,640,en,audio,Fourth Wing,Rebecca Soler,Rebecca Yarros,audiobook,spotify:show:4ez5NQopq7PCIFjdtCtLPu,
1,4ez5NQopq7PCIFjdtCtLPu,Author(s): Rebecca Yarros\nNarrator(s): Rebecc...,False,Rebecca Yarros,AU,Recorded Books 2023,C,Author(s): Rebecca Yarros<br/>Narrator(s): Reb...,Unabridged,https://open.spotify.com/show/4ez5NQopq7PCIFjd...,...,640,640,en,audio,Fourth Wing,Teddy Hamilton,Rebecca Yarros,audiobook,spotify:show:4ez5NQopq7PCIFjdtCtLPu,
2,4ez5NQopq7PCIFjdtCtLPu,Author(s): Rebecca Yarros\nNarrator(s): Rebecc...,False,Rebecca Yarros,AU,Recorded Books 2023,C,Author(s): Rebecca Yarros<br/>Narrator(s): Reb...,Unabridged,https://open.spotify.com/show/4ez5NQopq7PCIFjd...,...,300,300,en,audio,Fourth Wing,Rebecca Soler,Rebecca Yarros,audiobook,spotify:show:4ez5NQopq7PCIFjdtCtLPu,
3,4ez5NQopq7PCIFjdtCtLPu,Author(s): Rebecca Yarros\nNarrator(s): Rebecc...,False,Rebecca Yarros,AU,Recorded Books 2023,C,Author(s): Rebecca Yarros<br/>Narrator(s): Reb...,Unabridged,https://open.spotify.com/show/4ez5NQopq7PCIFjd...,...,300,300,en,audio,Fourth Wing,Teddy Hamilton,Rebecca Yarros,audiobook,spotify:show:4ez5NQopq7PCIFjdtCtLPu,
4,4ez5NQopq7PCIFjdtCtLPu,Author(s): Rebecca Yarros\nNarrator(s): Rebecc...,False,Rebecca Yarros,AU,Recorded Books 2023,C,Author(s): Rebecca Yarros<br/>Narrator(s): Reb...,Unabridged,https://open.spotify.com/show/4ez5NQopq7PCIFjd...,...,64,64,en,audio,Fourth Wing,Rebecca Soler,Rebecca Yarros,audiobook,spotify:show:4ez5NQopq7PCIFjdtCtLPu,
