In [162]:
import pandas as pd
import numpy as np
import re
from word2number import w2n
from datetime import datetime

In [163]:
def load_data(file_path):
    """Read the CSV at ``file_path`` into a DataFrame using pandas' default parsing."""
    frame = pd.read_csv(file_path)
    return frame

In [164]:
def handle_duplicates(df):
    """Drop rows that repeat a ``member_id``, keeping the first occurrence.

    Args:
        df: DataFrame containing a ``member_id`` column. It is not modified.

    Returns:
        A new, independent DataFrame holding one row per ``member_id``.
    """
    deduped = df.drop_duplicates(subset='member_id', keep='first')
    # Return an explicit copy: the filtered result is otherwise still tracked
    # by pandas as derived from ``df``, so assigning columns to it later can
    # raise SettingWithCopyWarning on pandas versions without Copy-on-Write.
    return deduped.copy()

In [165]:
def handle_nulls(df):
    """Fill missing ``first_name`` entries with the literal string ``'first_name'``.

    The column is overwritten on the frame that was passed in, and that same
    frame is returned.
    """
    # NOTE(review): the filler text equals the column name - confirm that this
    # is the intended placeholder for unknown names.
    names = df['first_name']
    df['first_name'] = names.fillna('first_name')
    return df

In [166]:
def fix_date_format(df):
    """Parse the ``start_date`` column into pandas datetimes.

    The column is replaced on the frame that was passed in, and that same
    frame is returned. Parsing uses ``pd.to_datetime`` with its defaults.
    """
    parsed_dates = pd.to_datetime(df['start_date'])
    df['start_date'] = parsed_dates
    return df

In [167]:
# Plain or comma-grouped decimal text, e.g. "-1200", "48,500", "1,234,567.89".
_NUMERIC_TEXT = re.compile(r'-?\d+(?:,\d+)*(?:\.\d+)?')


def convert_written_to_number(text):
    """Convert a single cell holding digits or number words into a number.

    Args:
        text: The cell value. Usually a string, but numeric and missing
            values are accepted too.

    Returns:
        * ``float`` for numeric strings (surrounding whitespace and comma
          separators are ignored) and for real int/float values;
        * whatever ``w2n.word_to_num`` returns for written-out numbers;
        * ``None`` for the literal string ``'NaN'``;
        * the input unchanged when it is missing (``None``/NaN), is not a
          string or number, or cannot be parsed as number words.
    """
    if not isinstance(text, str):
        # bool is a subclass of int but is not a salary figure; leave it,
        # None, and any other non-numeric object untouched.
        if isinstance(text, bool) or not isinstance(text, (int, float, np.integer, np.floating)):
            return text
        # Keep NaN as the missing marker; normalise real numbers to float so
        # they match the type produced for numeric strings.
        return text if pd.isna(text) else float(text)

    stripped = text.strip()
    if _NUMERIC_TEXT.fullmatch(stripped):
        return float(stripped.replace(',', ''))

    if text == 'NaN':
        return None

    try:
        # Convert written numbers to actual numbers using the word2number library.
        return w2n.word_to_num(text)
    except ValueError:
        # Not recognisable number words: hand the original text back.
        return text

In [168]:
def convert_salaries(df):
    """Run ``convert_written_to_number`` over every entry of the ``salary`` column.

    The column is overwritten on the frame that was passed in, and that same
    frame is returned.
    """
    raw_salaries = df['salary']
    df['salary'] = raw_salaries.apply(convert_written_to_number)
    return df

In [169]:
def calculate_length_worked(df, current_date=None):
    """Add a ``length_worked_yrs`` column: years elapsed since ``start_date``.

    Args:
        df: DataFrame with a ``start_date`` column (datetimes or anything
            ``pd.to_datetime`` can parse). The new column is written onto
            this frame.
        current_date: Reference date to measure against. Accepts anything
            ``pd.to_datetime`` understands (Timestamp, ``datetime``, date
            string, ...). Defaults to the current local time.

    Returns:
        The same DataFrame, with ``length_worked_yrs`` holding whole elapsed
        days divided by 365.25, and 0 where ``start_date`` is missing.
    """
    if current_date is None:
        current_date = datetime.now()
    # Normalise the reference so strings and plain datetimes subtract cleanly
    # from a datetime64 column.
    reference = pd.to_datetime(current_date)

    start_dates = pd.to_datetime(df['start_date'])
    elapsed_years = (reference - start_dates).dt.days / 365.25

    # A missing start_date (NaT) yields NaN above; report it as 0 years.
    df['length_worked_yrs'] = elapsed_years.fillna(0)

    return df

In [170]:
def set_index(df):
    """Make ``member_id`` the row index of ``df``.

    The frame is modified in place (the ``member_id`` column moves into the
    index) and the same object is returned.
    """
    df.set_index(keys='member_id', drop=True, append=False, inplace=True)
    return df

In [171]:
def remove_salary_placeholders(df, placeholder=50000):
    """Drop rows whose ``salary`` equals the placeholder value.

    Placeholder salaries are removed to avoid skewed statistics in salary
    analysis.

    Args:
        df: DataFrame with a ``salary`` column. It is not modified.
        placeholder: Salary value treated as a placeholder. Defaults to
            50000, the value this function has always filtered out.

    Returns:
        A DataFrame containing only the rows whose salary differs from
        ``placeholder``. Missing and non-numeric salaries are kept.
    """
    is_real_salary = df['salary'] != placeholder
    return df[is_real_salary]

In [172]:
def save_cleaned_data(df, output_path):
    """Write ``df`` to ``output_path`` as CSV, including the index column."""
    df.to_csv(path_or_buf=output_path, index=True)

In [173]:
def main(input_path, output_path):
    """Clean the CSV at ``input_path`` and write the result to ``output_path``.

    The loaded frame is passed through each cleaning step in order; every
    step receives the frame returned by the previous one.
    """
    cleaning_steps = (
        handle_duplicates,
        handle_nulls,
        fix_date_format,
        convert_salaries,
        calculate_length_worked,
        set_index,
        remove_salary_placeholders,
    )

    df = load_data(input_path)
    for step in cleaning_steps:
        df = step(df)
    save_cleaned_data(df, output_path)

In [174]:
if __name__ == "__main__":
    # Run the cleaning pipeline on the census file when executed directly.
    input_path, output_path = 'census.csv', 'cleaned_census.csv'
    main(input_path, output_path)