# Audible Data

## Import & Data Call

In [58]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import polars as pl
import re

In [59]:
# Load the raw Audible listings from Parquet, reading through pyarrow.
df = pl.read_parquet("./Data/Raw/audible_uncleaned.parquet", use_pyarrow=True)

In [60]:
# Preview the first five raw rows; every column is still a string here.
df.head(5)

name,author,narrator,time,releasedate,language,stars,price
str,str,str,str,str,str,str,str
"""Geronimo Stilt…","""Writtenby:Gero…","""Narratedby:Bil…","""2 hrs and 20 m…","""04-08-08""","""English""","""5 out of 5 sta…","""468.00"""
"""The Burning Ma…","""Writtenby:Rick…","""Narratedby:Rob…","""13 hrs and 8 m…","""01-05-18""","""English""","""4.5 out of 5 s…","""820.00"""
"""The Deep End""","""Writtenby:Jeff…","""Narratedby:Dan…","""2 hrs and 3 mi…","""06-11-20""","""English""","""4.5 out of 5 s…","""410.00"""
"""Daughter of th…","""Writtenby:Rick…","""Narratedby:Son…","""11 hrs and 16 …","""05-10-21""","""English""","""4.5 out of 5 s…","""615.00"""
"""The Lightning …","""Writtenby:Rick…","""Narratedby:Jes…","""10 hrs""","""13-01-10""","""English""","""4.5 out of 5 s…","""820.00"""


## Web Scraping the INR to USD Exchange Rate

In [68]:
# Scrape the value of 1 unit of `origin` expressed in `desired` from xe.com.
desired = 'USD'
origin = 'INR'
weblink = f'https://www.xe.com/currencyconverter/convert/?Amount=1&From={origin}&To={desired}'
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    driver.get(weblink)
    # Capture the rendered HTML while the browser session is still alive.
    page_source = driver.page_source
finally:
    # Always release the browser, even if loading the page failed.
    driver.quit()

soup = BeautifulSoup(page_source, 'html.parser')
# NOTE(review): this class name looks auto-generated and may change whenever
# the site is redeployed -- confirm it still matches before relying on it.
rate_element = soup.find('p', {'class': 'sc-423c2a5f-1 gPUWGS'})
if rate_element is None:
    raise ValueError(f'Could not locate the conversion result element on {weblink}')
raw_text = rate_element.text.strip()
rate_matches = re.findall(r'\d+\.\d+', raw_text)
if not rate_matches:
    raise ValueError(f'No decimal number found in conversion text: {raw_text!r}')
# The first decimal number in the result text is taken as the rate.
conversion_rate = float(rate_matches[0])

## Cleaning Data

In [62]:
def extract_time(time_str):
    """Split a runtime label into its hour and minute components.

    Args:
        time_str: Runtime text such as ``"2 hrs and 20 mins"`` or
            ``"10 hrs"``. Matching is case-insensitive.

    Returns:
        A ``(hours, minutes)`` tuple of floats. A component that does not
        appear in the text is ``0.0``. Labels containing ``"lethan1ute"``
        or ``"less than 1 minute"`` yield ``(0.0, 0.5)``.
    """
    text = time_str.lower()
    # Checked first: the "1 minute" inside the spelled-out label would
    # otherwise be picked up by the minutes pattern as a full minute.
    if 'lethan1ute' in text or 'less than 1 minute' in text:
        return 0.0, 0.5
    hours_match = re.search(r'(\d+)\s*hr', text)
    minutes_match = re.search(r'(\d+)\s*min', text)
    # Always return floats so callers see one consistent numeric type
    # regardless of which components are present.
    hours = float(hours_match.group(1)) if hours_match else 0.0
    minutes = float(minutes_match.group(1)) if minutes_match else 0.0
    return hours, minutes


In [63]:
def _runtime_minutes(time_str):
    """Return the total runtime in minutes for one raw ``time`` value."""
    hours, minutes = extract_time(time_str)
    # float() keeps the result type uniform even if both parts are integer zeros.
    return float(hours * 60 + minutes)


# Build the cleaned frame: parse every string column into a typed column,
# derive Runtime / Price USD / author and narrator counts, and rename the
# columns for presentation.
cleaned = (
    df
    .with_columns(
        # Strip the "Writtenby:" prefix, insert a space at each
        # lowercase->uppercase boundary, then split the comma-separated
        # names into a list.
        pl.col("author")
            .str.replace_all("Writtenby:", "")
            .str.replace_all(r"([a-z])([A-Z])", r"$1 $2")
            .str.split(","),

        # Same treatment for narrators, with the "Narratedby:" prefix.
        pl.col("narrator")
            .str.replace_all("Narratedby:", "")
            .str.replace_all(r"([a-z])([A-Z])", r"$1 $2")
            .str.split(","),

        # Parse each runtime label once. `extract_time` already maps the
        # "less than 1 minute" labels to 0.5 minutes, so no pre-filtering of
        # the column is needed; an explicit return dtype avoids relying on
        # dtype inference from the first row.
        pl.col("time")
            .map_elements(_runtime_minutes, return_dtype=pl.Float64)
            .alias("Runtime"),

        pl.col("releasedate").str.to_date("%d-%m-%y"),

        # Drop thousands separators, treat "Free" as missing, then cast.
        pl.col("price")
            .str.replace_all(",", "")
            .replace("Free", None)
            .cast(pl.Float64),

        pl.col("language")
            .str.to_titlecase(),

        # Reduce the rating text so that only "<rating>tar<reviews>" is left
        # (presumably from "<rating> out of 5 stars<reviews> ratings" -- the
        # raw values are truncated in the preview, TODO confirm), then split
        # on "tar" into a two-field struct. Unrated titles become null.
        pl.col("stars")
            .str.replace_all(r" out of 5 | rating|s", "")
            .replace("Not rated yet", None)
            .str.splitn("tar", 2)
            .struct.rename_fields(["Rating", "Reviews"])
            .alias("stars"),
    )
    # Promote the struct fields to top-level "Rating" and "Reviews" columns.
    .unnest("stars")
    .with_columns(
        (pl.col("price") * conversion_rate).round(2).alias("Price USD"))
    .drop("time")
    .with_columns(
        pl.col("author").list.len().alias("Authors Count"),
        pl.col("narrator").list.len().alias("Narrators Count"),
        pl.col("Rating").cast(pl.Float32),
        # replace_all: `str.replace` would remove only the first comma, so a
        # count with several thousands separators would fail the cast.
        pl.col("Reviews").str.replace_all(",", "").cast(pl.Int64),
    )
    .rename({"price": "Price INR",
             "name": "Title",
             "author": "Author",
             "narrator": "Narrator",
             "releasedate": "Release Date",
             "language": "Language"})
)

In [64]:
# Preview the first five cleaned rows to check the parsed dtypes and values.
cleaned.head(5)

Title,Author,Narrator,Release Date,Language,Rating,Reviews,Price INR,Runtime,Price USD,Authors Count,Narrators Count
str,list[str],list[str],date,str,f32,i64,f64,f64,f64,u32,u32
"""Geronimo Stilt…","[""Geronimo Stilton""]","[""Bill Lobely""]",2008-08-04,"""English""",5.0,34,468.0,140.0,5.58,1,1
"""The Burning Ma…","[""Rick Riordan""]","[""Robbie Daymond""]",2018-05-01,"""English""",4.5,41,820.0,788.0,9.77,1,1
"""The Deep End""","[""Jeff Kinney""]","[""Dan Russell""]",2020-11-06,"""English""",4.5,38,410.0,123.0,4.89,1,1
"""Daughter of th…","[""Rick Riordan""]","[""Soneela Nankani""]",2021-10-05,"""English""",4.5,12,615.0,676.0,7.33,1,1
"""The Lightning …","[""Rick Riordan""]","[""Jesse Bernstein""]",2010-01-13,"""English""",4.5,181,820.0,600.0,9.77,1,1


## Saving Data

In [65]:
# Persist the cleaned frame to Parquet, writing through pyarrow.
cleaned.write_parquet("./Data/Clean/audible_polars.parquet", use_pyarrow=True)

In [66]:
# Reload the file just written, rebinding `cleaned` to the round-tripped copy.
cleaned = pl.read_parquet("./Data/Clean/audible_polars.parquet", use_pyarrow=True)

In [67]:
# Display the reloaded frame.
cleaned

Title,Author,Narrator,Release Date,Language,Rating,Reviews,Price INR,Runtime,Price USD,Authors Count,Narrators Count
str,list[str],list[str],date,str,f32,i64,f64,f64,f64,u32,u32
"""Geronimo Stilt…","[""Geronimo Stilton""]","[""Bill Lobely""]",2008-08-04,"""English""",5.0,34,468.0,140.0,5.58,1,1
"""The Burning Ma…","[""Rick Riordan""]","[""Robbie Daymond""]",2018-05-01,"""English""",4.5,41,820.0,788.0,9.77,1,1
"""The Deep End""","[""Jeff Kinney""]","[""Dan Russell""]",2020-11-06,"""English""",4.5,38,410.0,123.0,4.89,1,1
"""Daughter of th…","[""Rick Riordan""]","[""Soneela Nankani""]",2021-10-05,"""English""",4.5,12,615.0,676.0,7.33,1,1
"""The Lightning …","[""Rick Riordan""]","[""Jesse Bernstein""]",2010-01-13,"""English""",4.5,181,820.0,600.0,9.77,1,1
"""The Hunger Gam…","[""Suzanne Collins""]","[""Tatiana Maslany""]",2018-10-30,"""English""",5.0,72,656.0,635.0,7.82,1,1
"""Quest for the …","[""Winter Morgan""]","[""Luke Daniels""]",2014-11-25,"""English""",5.0,11,233.0,143.0,2.78,1,1
"""The Dark Proph…","[""Rick Riordan""]","[""Robbie Daymond""]",2017-05-02,"""English""",5.0,50,820.0,752.0,9.77,1,1
"""Merlin Mission…","[""Mary Pope Osborne""]","[""Mary Pope Osborne""]",2017-05-02,"""English""",5.0,5,1256.0,656.0,14.97,1,1
"""The Tyrant’s T…","[""Rick Riordan""]","[""Robbie Daymond""]",2019-09-24,"""English""",5.0,58,820.0,802.0,9.77,1,1
