# Raj Pawar - Data Cleansing Notebook

In [1]:
import dask.dataframe as dd
import numpy as np
import pandas as pd

# Step 1: Load Data with Dask
**Dask will read the CSV in parallel chunks**

**Explicit `dtype` overrides avoid mixed-type errors when Dask infers column types**

In [12]:
# Read-time dtype overrides, keyed by CSV column name.
# Every listed column is read as raw text ('object') ...
dtypes = dict.fromkeys(
    [
        'age',
        'antiguedad',
        'renta',
        'indrel_1mes',
        'conyuemp',
        'ult_fec_cli_1t',
        'tiprel_1mes',
        'canal_entrada',
        'segmento',
    ],
    'object',
)
# ... except 'renta', which is read directly as a float column.
dtypes['renta'] = 'float64'

In [3]:
# Location of the training CSV (raw string so the Windows backslashes stay literal).
train_csv_path = r"C:\After Backup\Virtual Internship\Week 7 Project\Cross Selling\Cross Selling\Train.csv"

# Load with the explicit dtype overrides.
# NOTE(review): per the Dask docs, assume_missing=True reads integer columns that are
# not named in `dtypes` as floats so they can hold missing values — confirm this is intended.
df = dd.read_csv(
    train_csv_path,
    dtype=dtypes,
    assume_missing=True,
)

# Step 2: Clean 'age' and 'antiguedad'
**Strip surrounding whitespace, replace the literal string `NA` with NaN, and convert to float**

In [4]:
# 'age' was loaded as text: trim whitespace, map the literal 'NA' marker to NaN,
# then cast the whole column to float.
df['age'] = (
    df['age']
    .str.strip()
    .replace('NA', np.nan)
    .astype(float)
)

In [6]:
# 'antiguedad' was loaded as text: trim whitespace, map the literal 'NA' marker to NaN,
# then cast the whole column to float.
df['antiguedad'] = (
    df['antiguedad']
    .str.strip()
    .replace('NA', np.nan)
    .astype(float)
)

# Step 3: Impute 'renta' using median

In [13]:
# Take the 0.5 quantile of 'renta' as the median and materialise it to a scalar,
# then use that scalar to fill the column's missing values.
# NOTE(review): Dask documents Series.quantile as approximate — confirm that is acceptable here.
renta_column = df['renta']
renta_median = renta_column.quantile(0.5).compute()
df['renta'] = renta_column.fillna(renta_median)

# Step 4: Impute 'segmento' using mode

In [14]:
# Count each non-null 'segmento' value, then pick the label with the highest count.
segmento_counts = df['segmento'].dropna().value_counts()
segment_mode = segmento_counts.idxmax().compute()

# Missing 'segmento' entries take that most frequent label.
df['segmento'] = df['segmento'].fillna(segment_mode)

# Step 5: Fill high-missing columns with 'Unknown'

In [15]:
# Both columns get the same placeholder for missing entries, in the original order.
for sparse_col in ('conyuemp', 'ult_fec_cli_1t'):
    df[sparse_col] = df[sparse_col].fillna('Unknown')

# Step 6: Save Cleaned Output

In [16]:
# Destination for the cleaned data (raw string so the Windows backslashes stay literal).
cleaned_csv_path = r"C:\After Backup\Virtual Internship\Week 9 Project\final_cleaned_raj_dask.csv"

# Write one combined CSV without the index column; the call is left as the
# cell's last expression so its return value is displayed.
df.to_csv(
    cleaned_csv_path,
    single_file=True,
    index=False,
)

['C:\\After Backup\\Virtual Internship\\Week 9 Project\\final_cleaned_raj_dask.csv']