# Naga Pavithra Jajala - Data Cleansing Notebook

In [None]:
import dask
import dask.dataframe as dd
import numpy as np
import pandas as pd

In [None]:
# Explicit dtypes for the columns listed here: everything except 'renta'
# is kept as a generic object column, 'renta' is parsed as float64.
dtypes = dict(
    age='object',
    antiguedad='object',
    renta='float64',
    indrel_1mes='object',
    conyuemp='object',
    ult_fec_cli_1t='object',
    tiprel_1mes='object',
    canal_entrada='object',
    segmento='object',
)

# assume_missing=True: per the dask read_csv documentation, integer columns
# that are not named in `dtype` are read as floats so they can hold NaN.
df = dd.read_csv(
    r"C:\Internship Project\Train.csv",
    dtype=dtypes,
    assume_missing=True,
)


# 'age' and 'antiguedad' were loaded as text: trim surrounding whitespace,
# turn the literal 'NA' token into NaN, then cast the column to float.
for column in ('age', 'antiguedad'):
    trimmed = df[column].astype(str).str.strip()
    df[column] = trimmed.replace('NA', np.nan).astype(float)

# Imputing 'renta' Using Segment-Wise Median

In [None]:
def fill_with_segment_median(df):
    """Fill missing 'renta' values with the median of the row's 'segmento'.

    Rows whose 'segmento' is itself missing (or has no median) have no
    per-segment value to fall back on, so they are filled with the overall
    'renta' median instead of being left as NaN.

    Args:
        df: A dask DataFrame with 'segmento' and 'renta' columns. A plain
            pandas DataFrame is accepted as well and is processed eagerly.

    Returns:
        A new frame of the same kind as ``df`` with 'renta' imputed; the
        input frame is not modified.

    Note:
        On a dask frame the overall median comes from ``quantile(0.5)``,
        which dask may compute approximately.
    """
    segment_medians = df.groupby('segmento')['renta'].median()
    overall_median = df['renta'].quantile(0.5)

    # Dask collections are lazy and must be materialised before they can be
    # used inside the per-partition function; pandas results already are.
    is_dask = hasattr(df, 'map_partitions')
    if is_dask:
        segment_medians = segment_medians.compute()
        overall_median = overall_median.compute()

    def _impute(part):
        # Missing/unknown segments map to NaN here, hence the second fillna.
        by_segment = part['segmento'].map(segment_medians)
        filled = part['renta'].fillna(by_segment).fillna(overall_median)
        return part.assign(renta=filled)

    if not is_dask:
        return _impute(df)
    # Column names and dtypes are unchanged, so df itself describes the meta.
    return df.map_partitions(_impute, meta=df)

# Rebind df to the frame returned by the imputation helper.
df = fill_with_segment_median(df)

# Handling Outliers in 'renta' using IQR Method

In [None]:
# Define both quartiles lazily and evaluate them in a single dask.compute
# call, so the shared upstream graph (CSV read, casts, imputation) runs
# once instead of once per quantile.
q1, q3 = dask.compute(
    df['renta'].quantile(0.25),
    df['renta'].quantile(0.75),
)
iqr = q3 - q1
# Fences at 1.5 * IQR beyond the quartiles.
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Values outside the fences are pulled to the nearest fence rather than
# dropped, so no rows are removed.
df['renta'] = df['renta'].clip(lower=lower_bound, upper=upper_bound)

# Converting 'fecha_alta' to datetime

In [None]:
# errors='coerce': values that cannot be parsed as dates become NaT instead
# of raising (pandas.to_datetime semantics, which dd.to_datetime follows).
df['fecha_alta'] = dd.to_datetime(df['fecha_alta'], errors='coerce')

In [None]:
# single_file=True asks dask for one CSV rather than one file per partition;
# index=False leaves the index out of the written file.
df.to_csv(r"C:/Internship Project/final_cleaned_pavithra_dask.csv", single_file=True, index=False)