In [108]:
import pandas as pd
import numpy as np
import re
from word2number import w2n

In [109]:
def load_data(file_path):
    return pd.read_csv(file_path, parse_dates=True)

In [110]:
def handle_duplicates(df):
    df = df.drop_duplicates(subset='member_id', keep='first')
    return df

In [111]:
def handle_nulls(df):
    df['first_name'] = df['first_name'].fillna('first_name')
    return df

In [112]:
def fix_date_format(df):
    df['start_date'] = pd.to_datetime(df['start_date'])
    return df

In [113]:
def convert_written_to_number(text):
    try:
        # Check if text is numeric
        if pd.notna(text) and re.match(r'^-?\d+(?:,\d+)?(?:\.\d+)?$', text.strip()):
            return float(text.replace(',', ''))  # Convert numeric text to float

        # Convert written numbers to actual numbers using word2number library
        return w2n.word_to_num(text) if text != 'NaN' else None
    
    except ValueError:
        return text  # Return original text if conversion fails

In [114]:
def convert_salaries(df):
    # Convert mixed numbers and written numbers to actual numbers in 'values' column
    df['salary'] = df['salary'].apply(convert_written_to_number)
    return df

In [115]:
def set_index(df):
    df.set_index('member_id', inplace=True)
    return df

In [116]:
def save_cleaned_data(df, output_path):
    df.to_csv(output_path, index=True)

In [117]:
def main(input_path, output_path):
    # main function to clean the data
    df = load_data(input_path)
    df = handle_duplicates(df)
    df = handle_nulls(df)
    df = fix_date_format(df)
    df = convert_salaries(df)
    df = set_index(df)
    print(df.isnull().sum())
    print(f"start_date dtype in cleaned dataframe before saving: {df['start_date'].dtype}")
    save_cleaned_data(df, output_path)
    df_verified = pd.read_csv(output_path)
    print(f"start_date dtype after reloading: {df_verified['start_date'].dtype}")

In [118]:
if __name__ == "__main__":
    input_path = 'census.csv'
    output_path = 'cleaned_census.csv'
    main(input_path, output_path)

first_name          0
last_name           0
address             0
local_number        0
salary              0
employer            0
start_date          0
actively_working    0
dtype: int64
start_date dtype in cleaned dataframe before saving: datetime64[ns]
start_date dtype after reloading: object
