## Review by Raj Pawar:
### 1. Pavithra’s approach complements the initial cleaning by applying segment-based imputation and handling remaining inconsistencies
### 2. The use of `map_partitions` and median calculations across segments was efficient and well-suited for large-scale data
### 3. Clipping `renta` outliers ensures the data is well-bounded for analysis without distortion
### 4. Final cleaned file was reliable and immediately usable for EDA and modeling
### 5. Great continuation from baseline cleaning; strong adherence to business-context-driven preprocessing

---

# Naga Pavithra Jajala - Data Cleansing Notebook

In [1]:
import dask
import dask.dataframe as dd
import numpy as np
import pandas as pd

In [2]:
# Columns read as raw strings (object); 'renta' is the only column pinned to float64.
_text_columns = [
    'age',
    'antiguedad',
    'indrel_1mes',
    'conyuemp',
    'ult_fec_cli_1t',
    'tiprel_1mes',
    'canal_entrada',
    'segmento',
]
dtypes = {name: 'object' for name in _text_columns}
dtypes['renta'] = 'float64'

# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
df = dd.read_csv(
    r"C:\Internship Project\Train.csv",
    assume_missing=True,
    dtype=dtypes,
)

# Segmented Median Imputation for Remaining Missing Values

In [3]:
def segment_median_fill(df, value_col='renta', segment_col='segmento'):
    """Fill missing values of one column with the median of the row's segment.

    The per-segment medians are computed eagerly (one ``.compute()``), then
    applied partition by partition, so the returned frame is still lazy.

    Parameters
    ----------
    df : dask DataFrame
        Frame containing ``value_col`` and ``segment_col``.
    value_col : str, default 'renta'
        Numeric column whose missing values are filled.
    segment_col : str, default 'segmento'
        Column that defines the groups the medians are taken over.

    Returns
    -------
    dask DataFrame
        Same columns as ``df``. Rows whose segment is missing, or whose
        segment has no median, keep their missing value.
    """
    medians = df.groupby(segment_col)[value_col].median().compute()

    def _fill_partition(part):
        # Look up each row's segment median, then use it only where the value is missing.
        segment_median = part[segment_col].map(medians)
        return part.assign(**{value_col: part[value_col].fillna(segment_median)})

    return df.map_partitions(_fill_partition, meta=df)

# Rebind df to the frame whose missing 'renta' values are filled from the per-segment medians.
df = segment_median_fill(df)

In [4]:
import numpy as np
import pandas as pd

# Mode imputation for the categorical columns.
# For each column: strip whitespace, treat the literal "NA" as missing, then fill
# missing entries with the most frequent remaining value.
# NOTE(review): the recorded run reports mode "1.0" for indrel_1mes; presumably that
# column mixes spellings such as "1" and "1.0" — confirm and normalise before counting.
for col in ['canal_entrada', 'tiprel_1mes', 'indrel_1mes']:
    print(f"\nProcessing column: {col}")

    # Clean the column (still lazy)
    df[col] = df[col].str.strip().replace("NA", np.nan)

    # Count values with dask and bring back only the small table of counts,
    # instead of materialising the whole column in pandas.
    counts = df[col].value_counts().compute()

    if counts.empty:
        # Nothing but missing values: there is no mode, so fall back to a placeholder.
        print(f"Error while processing {col}: no non-missing values to compute a mode from")
        df[col] = df[col].fillna("Unknown")
        continue

    # On ties take the smallest value, matching pandas' Series.mode().iloc[0].
    mode_val = counts[counts == counts.max()].index.min()
    print(f"Mode for {col}: {mode_val}")

    # Fill missing values with the mode
    df[col] = df[col].fillna(mode_val)


Processing column: canal_entrada
Mode for canal_entrada: KHE

Processing column: tiprel_1mes
Mode for tiprel_1mes: I

Processing column: indrel_1mes
Mode for indrel_1mes: 1.0


In [5]:
from dask.diagnostics import ProgressBar
# Scope the progress bar to this one write. A global register() would stay active and
# stack with the later registrations when the notebook is run top to bottom.
with ProgressBar():
    written_files = df.to_csv("first_cleaned.csv", single_file=True, index=False)

# Show the list of written paths as the cell output.
written_files

[########################################] | 100% Completed | 440.84 s


['C:\\Users\\Raj\\VI - Week 8.1\\first_cleaned.csv']

# Outlier Treatment for 'renta'

In [2]:
# Per-column dtype overrides: text columns stay as object, 'renta' is float64.
dtypes = dict(
    age='object',
    antiguedad='object',
    renta='float64',
    indrel_1mes='object',
    conyuemp='object',
    ult_fec_cli_1t='object',
    tiprel_1mes='object',
    canal_entrada='object',
    segmento='object',
)

# Reload the intermediate file produced by the imputation stage.
df = dd.read_csv(
    "first_cleaned.csv",
    assume_missing=True,
    dtype=dtypes,
)

In [3]:
# IQR rule: cap (not drop) 'renta' values above the upper fence Q3 + 1.5 * IQR.
# Both quartiles are requested in one dask.compute call so dask can share the file
# read between them instead of scanning the CSV once per quantile.
# NOTE(review): dask's Series.quantile is documented as approximate, so the fence is an estimate.
q1, q3 = dask.compute(df['renta'].quantile(0.25), df['renta'].quantile(0.75))
iqr = q3 - q1
upper_bound = q3 + 1.5 * iqr

# Clipping 'renta' at upper bound; only the upper tail is bounded.
print("Clipping renta above upper bound:", upper_bound)
df['renta'] = df['renta'].clip(upper=upper_bound)

Clipping renta above upper bound: 273663.8925000001


In [4]:
from dask.diagnostics import ProgressBar
# Scope the progress bar to this one write rather than registering another global
# callback, which would stack with any progress bar registered earlier in the session.
with ProgressBar():
    written_files = df.to_csv("Second_cleaned.csv", single_file=True, index=False)

# Show the list of written paths as the cell output.
written_files

[########################################] | 100% Completed | 417.14 s


['C:\\Users\\Raj\\VI - Week 8.1\\Second_cleaned.csv']

# Cleaning 'age' and 'antiguedad' for analysis

In [2]:
# Per-column dtype overrides: text columns stay as object, 'renta' is float64.
dtypes = {
    'age': 'object',
    'antiguedad': 'object',
    'renta': 'float64',
    'indrel_1mes': 'object',
    'conyuemp': 'object',
    'ult_fec_cli_1t': 'object',
    'tiprel_1mes': 'object',
    'canal_entrada': 'object',
    'segmento': 'object'
}

# The file name must match the one written by the outlier stage exactly
# ("Second_cleaned.csv"): a lowercase spelling fails on case-sensitive filesystems.
df = dd.read_csv("Second_cleaned.csv", dtype=dtypes, assume_missing=True)

In [3]:
# Parse the two numeric-as-text columns: trim whitespace, turn the literal 'NA'
# into a missing value, then cast to float.
for numeric_col in ('age', 'antiguedad'):
    df[numeric_col] = df[numeric_col].str.strip().replace('NA', np.nan).astype(float)

# Median age per segment, computed eagerly so it can be applied inside each partition.
age_medians = df.groupby('segmento')['age'].median().compute()


def _fill_age_from_segment(part):
    """Fill missing 'age' in one partition with the median age of the row's segment."""
    segment_age = part['segmento'].map(age_medians)
    return part.assign(age=part['age'].fillna(segment_age))


df = df.map_partitions(_fill_age_from_segment, meta=df)

# Bound ages to the 18-100 range.
# NOTE(review): 'antiguedad' is only cast to float here; its missing values are not filled — confirm that is intended.
df['age'] = df['age'].clip(lower=18, upper=100)

In [4]:
from dask.diagnostics import ProgressBar
# Scope the progress bar to this one write rather than registering another global
# callback, which would stack with any progress bar registered earlier in the session.
with ProgressBar():
    written_files = df.to_csv("Final_cleaned.csv", single_file=True, index=False)

# Show the list of written paths as the cell output.
written_files

[########################################] | 100% Completed | 426.79 s


['C:\\Users\\Raj\\VI - Week 8.1\\Final_cleaned.csv']