In [27]:
import pandas as pd
df=pd.read_csv("AWCustomers.csv")
print(df)

       CustomerID Title FirstName MiddleName  LastName Suffix  \
0           21173   NaN      Chad          C      Yuan    NaN   
1           13249   NaN      Ryan        NaN     Perry    NaN   
2           29350   NaN     Julia        NaN  Thompson    NaN   
3           13503   NaN  Theodore        NaN     Gomez    NaN   
4           22803   NaN  Marshall          J      Shan    NaN   
...           ...   ...       ...        ...       ...    ...   
18356       25414   NaN     Grace          C    Bailey    NaN   
18357       11459   NaN     Tasha        NaN      Deng    NaN   
18358       12160   NaN    Jaclyn        NaN     Zhang    NaN   
18359       14353   NaN      Erin          I      Reed    NaN   
18360       16676   NaN    Amanda        NaN     Perry    NaN   

                 AddressLine1 AddressLine2            City  \
0          7090 C. Mount Hood          NaN      Wollongong   
1         3651 Willow Lake Rd          NaN         Shawnee   
2      1774 Tice Valley Blvd.    

In [28]:
selectedcolumns=["FirstName","Occupation","City","MaritalStatus","Gender","NumberChildrenAtHome","TotalChildren","YearlyIncome","NumberCarsOwned","HomeOwnerFlag","Occupation","BirthDate","PostalCode"]
new_df=df[selectedcolumns].copy()
print(new_df)

      FirstName      Occupation            City MaritalStatus Gender  \
0          Chad        Clerical      Wollongong             M      M   
1          Ryan        Clerical         Shawnee             M      M   
2         Julia        Clerical     West Covina             S      F   
3      Theodore  Skilled Manual       Liverpool             M      M   
4      Marshall  Skilled Manual           Werne             S      M   
...         ...             ...             ...           ...    ...   
18356     Grace  Skilled Manual        Coronado             M      F   
18357     Tasha  Skilled Manual  Port Macquarie             S      F   
18358    Jaclyn  Skilled Manual       Beaverton             S      F   
18359      Erin        Clerical       Vancouver             S      F   
18360    Amanda    Professional       Grossmont             M      F   

       NumberChildrenAtHome  TotalChildren  YearlyIncome  NumberCarsOwned  \
0                         0              1         81916  

In [29]:
datavalue={
    "FirstName":('Discrete','Nominal'),
    "Occupation": ('Discrete' ,'Nominal'),
    "City":('Discrete','Nominal'),
    "MaritalStatus":('Discrete','Nominal'),
    "Gender":('Discrete','Nominal'),
    "NumberChildrenAtHome":('Discrete','Ratio'),
    "TotalChildren":('Discrete','Ratio'),
    "YearlyIncome":('Continuous','Ratio'),
    "NumberCarsOwned":('Discrete','Ratio'),
    "HomeOwnerFlag":('Discrete','Nominal'),
    "Occupation":('Discrete','Nominal'),
    "BirthDate":('Continuous','Interval'),
    "PostalCode":('Discrete','Nominal')
}

In [30]:
print(new_df.isnull())

       FirstName  Occupation   City  MaritalStatus  Gender  \
0          False       False  False          False   False   
1          False       False  False          False   False   
2          False       False  False          False   False   
3          False       False  False          False   False   
4          False       False  False          False   False   
...          ...         ...    ...            ...     ...   
18356      False       False  False          False   False   
18357      False       False  False          False   False   
18358      False       False  False          False   False   
18359      False       False  False          False   False   
18360      False       False  False          False   False   

       NumberChildrenAtHome  TotalChildren  YearlyIncome  NumberCarsOwned  \
0                     False          False         False            False   
1                     False          False         False            False   
2                     Fa

In [33]:
# Handling Nulls
new_df = new_df.copy()
#Fill numeric columns with median
num_cols = new_df.select_dtypes(include=[np.number]).columns
new_df[num_cols] = new_df[num_cols].fillna(new_df[num_cols].median())

# Fill categorical columns with mode
cat_cols = new_df.select_dtypes(exclude=[np.number]).columns
for col in cat_cols:
    mode_val = new_df[col].mode()
    if not mode_val.empty:   # if mode exists
        new_df[col] = new_df[col].fillna(mode_val.iloc[0])
    else:
        # fallback: fill with a placeholder
        new_df[col] = new_df[col].fillna("Unknown")

#Normalization

from sklearn.preprocessing import MinMaxScaler
import numpy as np

scaler = MinMaxScaler()
numeric_cols = new_df.select_dtypes(include=[np.number]).columns

new_df[numeric_cols] = scaler.fit_transform(new_df[numeric_cols])


#Discretization(Binning)
new_df['Income_bin'] = pd.cut(new_df['YearlyIncome'], bins=3, labels=['Low','Medium','High'])



#Standardization
from sklearn.preprocessing import StandardScaler

scaler_std = StandardScaler()
new_df[numeric_cols] = scaler_std.fit_transform(new_df[numeric_cols])

#Binarization
new_df = pd.get_dummies(new_df, drop_first=True)
print(new_df)

       NumberChildrenAtHome  TotalChildren  YearlyIncome  NumberCarsOwned  \
0                 -0.594371       0.161342      0.298555         1.892524   
1                  1.163279       1.239753      0.271180         0.798389   
2                 -0.594371      -0.917069      0.444261         1.892524   
3                  1.163279       1.239753     -0.367401         0.798389   
4                 -0.594371      -0.917069     -0.682765        -0.295746   
...                     ...            ...           ...              ...   
18356             -0.594371       1.239753     -0.645321        -0.295746   
18357             -0.594371      -0.917069     -0.383337         0.798389   
18358             -0.594371      -0.917069     -0.680973         0.798389   
18359             -0.594371      -0.917069      0.470006        -1.389881   
18360             -0.594371      -0.917069      2.088996         0.798389   

       HomeOwnerFlag  FirstName_Abby  FirstName_Abhijit  FirstName_Abigail 

In [36]:
#Calculation proximity
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import jaccard

obj1 = new_df.iloc[0].values.reshape(1,-1)
obj2 = new_df.iloc[1].values.reshape(1,-1)

# Cosine Similarity
cos_sim = cosine_similarity(obj1, obj2)[0][0]

# Jaccard Similarity (binary columns only)
jac_sim = 1 - jaccard(new_df.iloc[0].values, new_df.iloc[1].values)

# Simple Matching Coefficient (SMC)
smc = (new_df.iloc[0] == new_df.iloc[1]).sum() / len(new_df.columns)

print("Cosine:", cos_sim)
print("Jaccard:", jac_sim)
print("SMC:", smc)


#Correlation analysis - numeric
corr = new_df['TotalChildren'].corr(new_df['YearlyIncome'])
print("Correlation:", corr)

Cosine: 0.35735660161840993
Jaccard: 0.19999999999999996
SMC: 0.9987375065754865
Correlation: 0.022013822892024196
