In [3]:
#assignment 2
import pandas as pd
from google.colab import files
df = pd.read_csv('/content/AWCustomers.csv')
num_rows = df.shape[0]
num_cols = df.shape[1];

Part I: Based on Feature Selection, Cleaning, and Preprocessing to Construct an Input from Data Source

In [4]:
#a)
#finding names of the columns
column_names_list = df.columns.tolist()
print("Column Names List:", column_names_list)
df_new = df.drop(['Title', 'FirstName', 'MiddleName', 'LastName', 'Suffix','AddressLine1', 'AddressLine2','LastUpdated'], axis=1, inplace=True)


Column Names List: ['CustomerID', 'Title', 'FirstName', 'MiddleName', 'LastName', 'Suffix', 'AddressLine1', 'AddressLine2', 'City', 'StateProvinceName', 'CountryRegionName', 'PostalCode', 'PhoneNumber', 'BirthDate', 'Education', 'Occupation', 'Gender', 'MaritalStatus', 'HomeOwnerFlag', 'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren', 'YearlyIncome', 'LastUpdated']


In [5]:
attribute_types = {
    "City":               {"Type": "Discrete",    "Scale": "Nominal"},
    "StateProvinceName":  {"Type": "Discrete",    "Scale": "Nominal"},
    "CountryRegionName":  {"Type": "Discrete",    "Scale": "Nominal"},
    "Education":          {"Type": "Discrete",    "Scale": "Ordinal"},
    "Occupation":         {"Type": "Discrete",    "Scale": "Nominal"},
    "Gender":             {"Type": "Discrete",    "Scale": "Nominal"},
    "MaritalStatus":      {"Type": "Discrete",    "Scale": "Nominal"},
    "HomeOwnerFlag":      {"Type": "Discrete",    "Scale": "Nominal"},
    "NumberCarsOwned":    {"Type": "Discrete",    "Scale": "Ratio"},
    "NumberChildrenAtHome": {"Type": "Discrete",  "Scale": "Ratio"},
    "TotalChildren":      {"Type": "Discrete",    "Scale": "Ratio"},
    "YearlyIncome":       {"Type": "Continuous",  "Scale": "Ratio"},
    "Age":                {"Type": "Continuous",  "Scale": "Ratio"},
}
df_attribute_types = pd.DataFrame(attribute_types).T
print(df_attribute_types)


                            Type    Scale
City                    Discrete  Nominal
StateProvinceName       Discrete  Nominal
CountryRegionName       Discrete  Nominal
Education               Discrete  Ordinal
Occupation              Discrete  Nominal
Gender                  Discrete  Nominal
MaritalStatus           Discrete  Nominal
HomeOwnerFlag           Discrete  Nominal
NumberCarsOwned         Discrete    Ratio
NumberChildrenAtHome    Discrete    Ratio
TotalChildren           Discrete    Ratio
YearlyIncome          Continuous    Ratio
Age                   Continuous    Ratio


Part II: Data Preprocessing and Transformation

In [6]:
#a) handling null values --
null_columns = df.isnull().sum()
null_columns = null_columns[null_columns > 0]
print("Columns with null values:")
print(null_columns)

Columns with null values:
Series([], dtype: int64)


In [7]:
import numpy as np
df.loc[np.random.choice(df.index, size=20, replace=False),'NumberCarsOwned'] = np.nan
df.loc[np.random.choice(df.index, size=15, replace=False),'YearlyIncome'] = np.nan
df.loc[np.random.choice(df.index, size=2, replace=False), 'Education' ] = np.nan
df.loc[np.random.choice(df.index, size=4, replace=False), 'MaritalStatus' ] = np.nan


In [8]:
df['YearlyIncome'] = df['YearlyIncome'].fillna(df['YearlyIncome'].mean())
df['NumberCarsOwned'] = df['NumberCarsOwned'].fillna(df['NumberCarsOwned'].median())
df['MaritalStatus'] = df['MaritalStatus'].fillna(df['MaritalStatus'].mode())
df['Education'] = df['Education'].fillna(df['Education'].mode())

In [10]:
categorical_cols = ['City', 'StateProvinceName', 'CountryRegionName',
                    'Education', 'Occupation', 'Gender', 'MaritalStatus']

numeric_cols = ['HomeOwnerFlag', 'NumberCarsOwned', 'NumberChildrenAtHome',
                'TotalChildren', 'YearlyIncome']



In [11]:
print(numeric_cols)


['HomeOwnerFlag', 'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren', 'YearlyIncome']


In [13]:
#b and e) Normalization
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])


In [12]:
#c) Discretization (Binning Continuous Attributes)
# Income groups (quartiles)
df['Income_Bin'] = pd.qcut(df['YearlyIncome'], q=4,
                           labels=['Low', 'Medium', 'High', 'VeryHigh'])


In [14]:
#d)
final_categorical = ['City', 'StateProvinceName', 'CountryRegionName',
                     'Education', 'Occupation', 'Gender', 'MaritalStatus',
                      'Income_Bin']

df_final = pd.get_dummies(df, columns=final_categorical, drop_first=True)


Part III: Calculating Proximity /Correlation Analysis of two features::


In [15]:
df_clean = df_final.drop(columns=['PostalCode', 'PhoneNumber', 'BirthDate'])
df_clean = df_clean.select_dtypes(include=[np.number])
obj1 = df_clean.iloc[0].to_numpy()
obj2 = df_clean.iloc[1].to_numpy()


In [16]:
from sklearn.metrics.pairwise import cosine_similarity

cos_sim = cosine_similarity([obj1], [obj2])[0][0]
print("Cosine Similarity:", round(cos_sim, 4))


Cosine Similarity: 1.0


In [17]:
binary_cols = df_clean.columns[(df_clean.nunique() == 2)]
df_binary = df_clean[binary_cols]
binary_cols = df_clean.columns[(df_clean.nunique() == 2)]
df_binary = df_clean[binary_cols]
obj1_bin = df_binary.iloc[0].to_numpy()
obj2_bin = df_binary.iloc[1].to_numpy()


In [18]:
M11 = np.sum((obj1_bin == 1) & (obj2_bin == 1))
M00 = np.sum((obj1_bin == 0) & (obj2_bin == 0))
M10 = np.sum((obj1_bin == 1) & (obj2_bin == 0))
M01 = np.sum((obj1_bin == 0) & (obj2_bin == 1))

denom_smc = M11 + M10 + M01 + M00
denom_jaccard = M11 + M10 + M01

smc = (M11 + M00) / denom_smc if denom_smc != 0 else np.nan
jaccard = M11 / denom_jaccard if denom_jaccard != 0 else np.nan

print("Simple Matching Coefficient:", round(smc, 4))
print("Jaccard Similarity:", round(jaccard, 4))


Simple Matching Coefficient: nan
Jaccard Similarity: nan
