In [65]:
import pandas as pd
import re

In [66]:
# Load the raw dataset from 'project_dataset.csv' (path is relative to the
# working directory) into the module-level `audio_book` DataFrame. The cleaning
# functions defined below read this global and modify its columns.
audio_book = pd.read_csv('project_dataset.csv')

In [67]:
# Function that cleans Author name
def author_clean():
    """Tidy the 'author' column of the global ``audio_book`` frame.

    Two passes are applied to every entry:
      1. the literal label 'Writtenby:' is removed;
      2. a space is inserted wherever a lowercase letter is directly followed
         by an uppercase letter, separating name parts that were run together.

    The column is overwritten with the result; nothing is returned.
    """
    without_label = audio_book['author'].str.replace('Writtenby:', '')
    # Lowercase-then-uppercase marks the seam between two glued words.
    spaced = without_label.str.replace(r'([a-z])([A-Z])', r'\1 \2', regex=True)
    audio_book['author'] = spaced

In [68]:
# Function that cleans Narrator name
def narrator_clean():
    """Tidy the 'narrator' column of the global ``audio_book`` frame.

    Removes the literal label 'Narratedby:' and then inserts a space at every
    lowercase-to-uppercase boundary so that run-together name parts are
    separated. The column is overwritten in the frame; nothing is returned.
    """
    audio_book['narrator'] = (
        audio_book['narrator']
        .str.replace('Narratedby:', '')
        # Lowercase-then-uppercase marks the seam between two glued words.
        .str.replace(r'([a-z])([A-Z])', r'\1 \2', regex=True)
    )

In [69]:
# Function that converts the 'time' text (hours and minutes) into total minutes
def minutes():
    """Add a 'minutes' column holding each title's total duration in minutes.

    The 'time' column of the global ``audio_book`` frame is scanned for a
    number followed by ' hr' and a number followed by ' min' (so both the
    singular and plural spellings match). Either part may be absent; a missing
    part, or a missing 'time' value, counts as 0. The result is
    ``hours * 60 + minutes`` stored as integers.

    The intermediate values are kept in local Series rather than in temporary
    'hours' / 'temp_minutes' columns, so a dataset that already has columns
    with those names is left untouched instead of being overwritten and
    dropped.
    """
    time_text = audio_book['time']

    def _extract_count(pattern):
        # expand=False returns the single capture group as a Series.
        found = time_text.str.extract(pattern, expand=False)
        return pd.to_numeric(found, errors='coerce').fillna(0).astype(int)

    hour_part = _extract_count(r'(\d+) hr')
    minute_part = _extract_count(r'(\d+) min')
    audio_book['minutes'] = hour_part * 60 + minute_part

In [70]:
# Function that parses the release date into datetime values (unparseable entries become NaT)
def release_date():
    """Convert the 'releasedate' column of ``audio_book`` to datetimes.

    Parsing is delegated to ``pd.to_datetime`` with ``errors='coerce'``, so
    any entry that cannot be interpreted as a date becomes ``NaT`` instead of
    raising. The column is overwritten in the frame; nothing is returned.
    """
    parsed_dates = pd.to_datetime(audio_book['releasedate'], errors='coerce')
    audio_book['releasedate'] = parsed_dates

In [71]:
# Function that finds and extracts the float value of the rating
def extract_rating(r):
    """Return the numeric rating found in a 'stars' cell.

    Parameters
    ----------
    r : str or any
        The raw cell value. For text containing ' out of ', the first
        whitespace-separated token that parses as a float is returned.
        Otherwise the whole text is parsed as a float.

    Returns
    -------
    float or str
        The rating as a float, or the sentinel string 'Not rated yet' when no
        number can be obtained.

    Non-string cells (for example the NaN/None of an empty CSV field) no
    longer raise ``TypeError``: a missing value yields 'Not rated yet' and an
    already-numeric value is returned as a float.
    """
    if not isinstance(r, str):
        try:
            number = float(r)
        except (TypeError, ValueError):
            return 'Not rated yet'
        # NaN is the only float value that compares unequal to itself.
        return 'Not rated yet' if number != number else number

    if " out of " in r:
        for token in r.split():
            try:
                return float(token)
            except ValueError:
                # Not a number; keep scanning the remaining tokens.
                continue

    try:
        return float(r)
    except ValueError:
        return 'Not rated yet'
    

def rating():
    """Create the 'rating' column of ``audio_book`` from its 'stars' column.

    Every 'stars' entry is passed through ``extract_rating`` and the results
    are stored in 'rating'. Nothing is returned.
    """
    stars_column = audio_book['stars']
    audio_book['rating'] = stars_column.apply(extract_rating)

In [72]:
# Function that extracts the total number of ratings from the stars column
def extract_number_from_string(s):
    """Return the last whole number that appears in ``s``.

    Parameters
    ----------
    s : str or any
        Raw text to scan. A number may use comma thousands separators
        ('1,234'); such a group is read as one value (1234) rather than being
        cut at the comma.

    Returns
    -------
    int or str
        The last number in the text as an int, or the sentinel string
        'Not rated yet' when the text contains no digits or ``s`` is not a
        string (for example the NaN/None of an empty CSV field, which
        previously raised ``TypeError``).

    Note: the function has no notion of what the number means; whatever
    number comes last in the text is returned.
    """
    if not isinstance(s, str):
        return 'Not rated yet'

    # Digits optionally followed by ",ddd" groups, so "1,234" is one match.
    matches = re.findall(r'\d+(?:,\d{3})*', s)
    if not matches:
        return 'Not rated yet'
    return int(matches[-1].replace(',', ''))


def no_of_rating():
    """Create the 'number_of_ratings' column of ``audio_book``.

    Every entry of the 'stars' column is passed through
    ``extract_number_from_string`` and the results are stored in
    'number_of_ratings'. Nothing is returned.
    """
    stars_column = audio_book['stars']
    audio_book['number_of_ratings'] = stars_column.apply(extract_number_from_string)

In [73]:
# Function that converts price column data to float
def convert_to_float(value):
    """Convert one price cell to a number where possible.

    Parameters
    ----------
    value : str or any
        The raw cell. For strings, commas are removed and the remainder is
        handed to ``pd.to_numeric``.

    Returns
    -------
    number or the original value
        The parsed number, or ``value`` itself when the text is not numeric.
        Non-string input (an already-numeric price, NaN, None) is returned
        unchanged; previously it raised ``AttributeError`` because only
        strings have ``.replace``.
    """
    if not isinstance(value, str):
        # Nothing to strip or parse; leave numeric and missing cells as they are.
        return value
    try:
        return pd.to_numeric(value.replace(',', ''))
    except ValueError:
        # Non-numeric text is deliberately kept as-is.
        return value

def price_to_float():
    """Run ``convert_to_float`` over the 'price' column of ``audio_book``.

    Each entry is converted individually and the column is overwritten with
    the results. Nothing is returned.
    """
    raw_prices = audio_book['price']
    audio_book['price'] = raw_prices.apply(convert_to_float)

In [74]:
# Function that outputs the cleaned csv file
def output_csv(path='project_dataset_clean.csv'):
    """Write the global ``audio_book`` frame to a CSV file without the index.

    Parameters
    ----------
    path : str, optional
        Destination file. Defaults to 'project_dataset_clean.csv'. The earlier
        hard-coded name ended in a stray space, which produced a file whose
        name had trailing whitespace.
    """
    audio_book.to_csv(path, index=False)

In [75]:
# Pipeline function that executes the entire cleaning process sequentially
def pipeline():
    """Run every cleaning step in order, then write the cleaned CSV.

    The steps are called with no arguments, in this sequence: author names,
    narrator names, duration in minutes, release date, rating, number of
    ratings, price, and finally the CSV export.
    """
    steps = (
        author_clean,
        narrator_clean,
        minutes,
        release_date,
        rating,
        no_of_rating,
        price_to_float,
        output_csv,
    )
    for step in steps:
        step()

In [76]:
# Run the whole cleaning pipeline only when this code is executed as the
# top-level program, not when it is imported as a module.
if __name__ == "__main__":
    pipeline()