# Audible Data

## Import & Data Call

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import re
import numpy as np

In [2]:
# Read the uncleaned Audible dataset from the raw-data folder.
df = pd.read_parquet(path="./Data/Raw/audible_uncleaned.parquet")

## Webscraping INR to USD Exchange Rate

In [3]:
# Currency pair for the price conversion: the query string below asks the
# converter page for Amount=1 going from `origin` to `desired`.
desired = 'USD'
origin = 'INR'
weblink = f'https://www.xe.com/currencyconverter/convert/?Amount=1&From={origin}&To={desired}'

# Load the page in Chrome and keep its HTML. The browser is closed in
# `finally` so a failed page load does not leave a Chrome process running.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    driver.get(weblink)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    driver.quit()

# NOTE(review): this class name looks build-generated and may change when the
# site is redeployed - confirm the selector still matches the rate element.
rate_element = soup.find('p', {'class': 'sc-423c2a5f-1 gPUWGS'})
if rate_element is None:
    raise RuntimeError(
        f"Exchange-rate element not found on {weblink}; the page markup may have changed."
    )
raw_text = rate_element.text.strip()

# The first decimal number in the element text is used as the rate.
rate_matches = re.findall(r'\d+\.\d+', raw_text)
if not rate_matches:
    raise RuntimeError(f"No decimal number found in exchange-rate text: {raw_text!r}")
conversion_rate = float(rate_matches[0])

In [4]:
# Preview the first three raw rows before cleaning.
df.head(n=3)

Unnamed: 0,name,author,narrator,time,releasedate,language,stars,price
0,Geronimo Stilton #11 & #12,Writtenby:GeronimoStilton,Narratedby:BillLobely,2 hrs and 20 mins,04-08-08,English,5 out of 5 stars34 ratings,468.0
1,The Burning Maze,Writtenby:RickRiordan,Narratedby:RobbieDaymond,13 hrs and 8 mins,01-05-18,English,4.5 out of 5 stars41 ratings,820.0
2,The Deep End,Writtenby:JeffKinney,Narratedby:DanRussell,2 hrs and 3 mins,06-11-20,English,4.5 out of 5 stars38 ratings,410.0


## Cleaning Data

In [5]:
def extract_time(time_str):
    """Parse a duration string such as "2 hrs and 20 mins" into hours and minutes.

    Parameters
    ----------
    time_str : str
        Raw duration text. Matching is case-insensitive.

    Returns
    -------
    tuple of (float, float)
        ``(hours, minutes)``. A component that does not appear in the text is
        ``0.0``. Text marked as "less than 1 minute" yields ``(0.0, 0.5)``.
        Non-string input (e.g. ``None`` or NaN from a missing value) yields
        ``(nan, nan)`` instead of raising.
    """
    # Missing values are not strings and have no `.lower()`; report them as
    # unknown rather than crashing the whole column transform.
    if not isinstance(time_str, str):
        return float('nan'), float('nan')

    text = time_str.lower()

    # Sub-minute titles are counted as half a minute.
    # NOTE(review): 'lethan1ute' is presumably a mangled spelling of
    # "less than 1 minute" found in the raw data - confirm.
    if 'lethan1ute' in text or 'less than 1 minute' in text:
        return 0.0, 0.5

    # Digits directly before "hr" / "min" (optional whitespace in between).
    hour_match = re.search(r'(\d+)\s*hr', text)
    minute_match = re.search(r'(\d+)\s*min', text)

    hours = float(hour_match.group(1)) if hour_match else 0.0
    minutes = float(minute_match.group(1)) if minute_match else 0.0
    return hours, minutes


In [6]:
def _runtime_minutes(duration):
    """Return the total length in minutes for one raw `time` value."""
    hours, minutes = extract_time(duration)
    return hours * 60 + minutes


# Build the cleaned frame in one chain. `assign` returns a new frame (so `df`
# is left untouched) and evaluates its keyword arguments in order, which means
# a later lambda sees the columns produced by earlier ones.
cleaned = (
    df
    .assign(
        # Drop the "Writtenby:" prefix, insert a space wherever a lower-case
        # letter is directly followed by an upper-case one, then split the
        # comma-separated names into a list.
        author=lambda x: x['author']
            .str.replace("Writtenby:", "", regex=False)
            .str.replace(r'([a-z])([A-Z])', r'\1 \2', regex=True)
            .str.split(","),

        # Same treatment for the "Narratedby:" column.
        narrator=lambda x: x['narrator']
            .str.replace("Narratedby:", "", regex=False)
            .str.replace(r'([a-z])([A-Z])', r'\1 \2', regex=True)
            .str.split(","),

        language=lambda x: x['language'].str.title(),

        # Dates are stored as day-month-two-digit-year.
        releasedate=lambda x: pd.to_datetime(x['releasedate'], format='%d-%m-%y'),

        # "5 out of 5 stars34 ratings" -> "5 34"; the first token is the
        # rating. Rows marked 'Not rated yet' become missing.
        Rating=lambda x: x['stars']
            .str.replace(r' rating|out of 5 star|s', '', regex=True)
            .where(x['stars'] != 'Not rated yet', np.nan)
            .str.split(" ", n=2, expand=True)[0].astype(pd.Float32Dtype()),

        # Same reduction, with commas also stripped so the second token (the
        # number of ratings) can be cast to an integer.
        Reviews=lambda x: x['stars']
            .str.replace(r' rating|out of 5 star|s|,', '', regex=True)
            .where(x['stars'] != 'Not rated yet', np.nan)
            .str.split(" ", n=2, expand=True)[1].astype(pd.Int64Dtype()),

        # Total length in minutes; each duration string is parsed once.
        Runtime=lambda x: x['time'].apply(_runtime_minutes),

        # Remove commas and map "Free" to 0 before the numeric cast.
        price=lambda x: x['price']
            .str.replace(",", "", regex=False)
            .str.replace("Free", "0", regex=False)
            .astype(pd.Float64Dtype()),

        # `price` is already Float64 at this point, so it is converted
        # directly. Round-tripping through strings turned a missing price into
        # the text "<NA>", which cannot be parsed as a float. Missing prices
        # are counted as 0, matching the intent of the previous "nan" -> "0"
        # replacement.
        usd=lambda x: (
            x["price"].fillna(0).astype(float) * conversion_rate
        ).round(2),
    )
    .drop(['stars', 'time'], axis=1)
    .rename(columns={
        "name": "Title",
        "author": "Author",
        "narrator": "Narrator",
        "releasedate": "Release Date",
        "language": "Language",
        "price": "INR Price",
        "usd": "USD Price",
    })
)

## Saving Data

In [7]:
# Persist the cleaned frame to the clean-data folder.
cleaned.to_parquet(path="./Data/Clean/audible_pandas.parquet")