In [1]:
import numpy as np
import pandas as pd

In [11]:
customers=pd.read_csv("AWCustomers.csv.zip")
sales=pd.read_csv("AWSales.csv")

In [13]:
df=pd.merge(customers,sales,on="CustomerID",how="inner")
print(df.head())

   CustomerID Title FirstName MiddleName  LastName Suffix  \
0       21173   NaN      Chad          C      Yuan    NaN   
1       13249   NaN      Ryan        NaN     Perry    NaN   
2       29350   NaN     Julia        NaN  Thompson    NaN   
3       13503   NaN  Theodore        NaN     Gomez    NaN   
4       22803   NaN  Marshall          J      Shan    NaN   

             AddressLine1 AddressLine2         City    StateProvinceName  ...  \
0      7090 C. Mount Hood          NaN   Wollongong      New South Wales  ...   
1     3651 Willow Lake Rd          NaN      Shawnee     British Columbia  ...   
2  1774 Tice Valley Blvd.          NaN  West Covina           California  ...   
3         2103 Baldwin Dr          NaN    Liverpool              England  ...   
4         Am Gallberg 234          NaN        Werne  Nordrhein-Westfalen  ...   

  Gender MaritalStatus HomeOwnerFlag NumberCarsOwned NumberChildrenAtHome  \
0      M             M             1               3                 

In [15]:
#Make sure df exists (the merged DataFrame)
df_selected = df[["BirthDate","Education","Occupation","Gender","MaritalStatus","HomeOwnerFlag","NumberCarsOwned","NumberChildrenAtHome","TotalChildren","YearlyIncome","AvgMonthSpend","BikeBuyer"]].copy()

In [17]:
df_selected["BirthDate"]=pd.to_datetime(df_selected["BirthDate"])
today=pd.to_datetime("today")
df_selected["Age"]=today.year-df_selected["BirthDate"].dt.year
df_selected.drop(columns=["BirthDate"],inplace=True)

In [19]:
print(df_selected.head())

         Education      Occupation Gender MaritalStatus  HomeOwnerFlag  \
0        Bachelors        Clerical      M             M              1   
1  Partial College        Clerical      M             M              1   
2        Bachelors        Clerical      F             S              0   
3  Partial College  Skilled Manual      M             M              1   
4  Partial College  Skilled Manual      M             S              1   

   NumberCarsOwned  NumberChildrenAtHome  TotalChildren  YearlyIncome  \
0                3                     0              1         81916   
1                2                     1              2         81076   
2                3                     0              0         86387   
3                2                     1              2         61481   
4                1                     0              0         51804   

   AvgMonthSpend  BikeBuyer  Age  
0          50.97          1   38  
1          53.11          1   53  
2          

In [21]:
scale_mapping = {
    "Age": "Continuous (Ratio)",
    "Education": "Discrete (Ordinal)",
    "Occupation": "Discrete (Nominal)",
    "Gender": "Discrete (Nominal)",
    "MaritalStatus": "Discrete (Nominal)",
    "HomeOwnerFlag": "Discrete (Binary)",
    "NumberCarsOwned": "Discrete (Ratio)",
    "NumberChildrenAtHome": "Discrete (Ratio)",
    "TotalChildren": "Discrete (Ratio)",
    "YearlyIncome": "Continuous (Ratio)",
    "AvgMonthSpend": "Continuous (Ratio)",
    "BikeBuyer": "Discrete (Binary)"
}
print("Measurement Scales:")
for col,scale in scale_mapping.items():
    print(f"{col}:{scale}") 

Measurement Scales:
Age:Continuous (Ratio)
Education:Discrete (Ordinal)
Occupation:Discrete (Nominal)
Gender:Discrete (Nominal)
MaritalStatus:Discrete (Nominal)
HomeOwnerFlag:Discrete (Binary)
NumberCarsOwned:Discrete (Ratio)
NumberChildrenAtHome:Discrete (Ratio)
TotalChildren:Discrete (Ratio)
YearlyIncome:Continuous (Ratio)
AvgMonthSpend:Continuous (Ratio)
BikeBuyer:Discrete (Binary)


In [23]:
print(df_selected.shape)

(18361, 12)


In [25]:
print(df_selected)

             Education      Occupation Gender MaritalStatus  HomeOwnerFlag  \
0            Bachelors        Clerical      M             M              1   
1      Partial College        Clerical      M             M              1   
2            Bachelors        Clerical      F             S              0   
3      Partial College  Skilled Manual      M             M              1   
4      Partial College  Skilled Manual      M             S              1   
...                ...             ...    ...           ...            ...   
18356  Graduate Degree  Skilled Manual      F             M              0   
18357        Bachelors  Skilled Manual      F             S              0   
18358  Partial College  Skilled Manual      F             S              0   
18359      High School        Clerical      F             S              0   
18360  Graduate Degree    Professional      F             M              1   

       NumberCarsOwned  NumberChildrenAtHome  TotalChildren  Ye

In [27]:
print(customers.isnull().sum())
print("------------------")
print(sales.isnull().sum())

CustomerID                  0
Title                   18260
FirstName                   0
MiddleName               7789
LastName                    0
Suffix                  18358
AddressLine1                0
AddressLine2            18050
City                        0
StateProvinceName           0
CountryRegionName           0
PostalCode                  0
PhoneNumber                 0
BirthDate                   0
Education                   0
Occupation                  0
Gender                      0
MaritalStatus               0
HomeOwnerFlag               0
NumberCarsOwned             0
NumberChildrenAtHome        0
TotalChildren               0
YearlyIncome                0
LastUpdated                 0
dtype: int64
------------------
CustomerID       0
BikeBuyer        0
AvgMonthSpend    0
dtype: int64


In [29]:
print("null value before handling:")
print(df_selected.isnull().sum())
for col in df_selected.columns:
    if df_selected[col].dtype in ['int64','float64']:
        df_selected[col].fillna(df_selected[col].median(),inplace=True)
    else:
        df_selected[col].fillna(df_selected[col].mode(0),inplace=True)

print("null value after handling:")
print(df_selected.isnull().sum())

null value before handling:
Education               0
Occupation              0
Gender                  0
MaritalStatus           0
HomeOwnerFlag           0
NumberCarsOwned         0
NumberChildrenAtHome    0
TotalChildren           0
YearlyIncome            0
AvgMonthSpend           0
BikeBuyer               0
Age                     0
dtype: int64
null value after handling:
Education               0
Occupation              0
Gender                  0
MaritalStatus           0
HomeOwnerFlag           0
NumberCarsOwned         0
NumberChildrenAtHome    0
TotalChildren           0
YearlyIncome            0
AvgMonthSpend           0
BikeBuyer               0
Age                     0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_selected[col].fillna(df_selected[col].mode(0),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_selected[col].fillna(df_selected[col].median(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate obje

In [31]:
from sklearn.preprocessing import MinMaxScaler

In [33]:
num_col=["Age","YearlyIncome","AvgMonthSpend"]
df_selected[num_col]=MinMaxScaler().fit_transform(df_selected[num_col])
print(df_selected[num_col].head())

        Age  YearlyIncome  AvgMonthSpend
0  0.185714      0.496842       0.324210
1  0.400000      0.489453       0.425201
2  0.214286      0.536172       0.470977
3  0.328571      0.317083       0.605474
4  0.357143      0.231958       0.533742


In [35]:
df_selected["income_category"]=pd.cut(df_selected["YearlyIncome"],bins=3,labels=["low","medium","high"])
print(df_selected[["income_category","YearlyIncome"]].head())

  income_category  YearlyIncome
0          medium      0.496842
1          medium      0.489453
2          medium      0.536172
3             low      0.317083
4             low      0.231958


In [37]:
print(df_selected.columns)

Index(['Education', 'Occupation', 'Gender', 'MaritalStatus', 'HomeOwnerFlag',
       'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren',
       'YearlyIncome', 'AvgMonthSpend', 'BikeBuyer', 'Age', 'income_category'],
      dtype='object')


In [39]:
from sklearn.preprocessing import StandardScaler

In [41]:
num_col=["Age","YearlyIncome","AvgMonthSpend"]
df_selected[num_col]=StandardScaler().fit_transform(df_selected[num_col])
print(df_selected[num_col].head())

        Age  YearlyIncome  AvgMonthSpend
0 -0.482516      0.298555      -0.231774
1  0.851033      0.271180       0.390755
2 -0.304710      0.444261       0.672929
3  0.406517     -0.367401       1.501999
4  0.584324     -0.682765       1.059828


In [43]:
df_encoded=pd.get_dummies(df_selected,columns=["Education","Occupation","Gender","MaritalStatus","income_category"],dtype=int)
print("encoded dataset shape:",df_encoded.shape)
df_encoded.head()

encoded dataset shape: (18361, 25)


Unnamed: 0,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,AvgMonthSpend,BikeBuyer,Age,Education_Bachelors,Education_Graduate Degree,...,Occupation_Manual,Occupation_Professional,Occupation_Skilled Manual,Gender_F,Gender_M,MaritalStatus_M,MaritalStatus_S,income_category_low,income_category_medium,income_category_high
0,1,3,0,1,0.298555,-0.231774,1,-0.482516,1,0,...,0,0,0,0,1,1,0,0,1,0
1,1,2,1,2,0.27118,0.390755,1,0.851033,0,0,...,0,0,0,0,1,1,0,0,1,0
2,0,3,0,0,0.444261,0.672929,1,-0.30471,1,0,...,0,0,0,1,0,0,1,0,1,0
3,1,2,1,2,-0.367401,1.501999,1,0.406517,0,0,...,0,0,1,0,1,1,0,1,0,0
4,1,1,0,0,-0.682765,1.059828,1,0.584324,0,0,...,0,0,1,0,1,0,1,1,0,0


In [45]:
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import cosine_similarity

In [53]:
arr1=np.array(df_encoded.iloc[0])
arr2=np.array(df_encoded.iloc[1])

smc=np.sum(arr1==arr2)/len(arr1)

binary_cols=[col for col in df_encoded.columns if set(df_encoded[col].unique())<={0,1}]
jaccard=jaccard_score(df_encoded.iloc[0][binary_cols],df_encoded.iloc[1][binary_cols])

cosine=cosine_similarity(arr1.reshape(1,-1),arr2.reshape(1,-1))[0][0]

print(f"SMC: {smc}")
print(f"Jaccard Similarity:{jaccard}")
print(f"Cosine Similarity:{cosine}")

SMC: 0.68
Jaccard Similarity:0.75
Cosine Similarity:0.7912798092926715


In [55]:
if "CommuteDistance" in df.columns:
    from scipy.stats import pearsonr
    corr, p_value = pearsonr(df["CommuteDistance"], df["YearlyIncome"])
    print(f"\nPearson Correlation: {corr:.4f}, p-value: {p_value}")
else:
    print("\nCommuteDistance column not found in dataset.")



CommuteDistance column not found in dataset.


In [57]:
if "CommuteDistance" in customers.columns:
    from scipy.stats import pearsonr
    corr,p_value=pearsonr(df["CommuteDistance"],df["YearlyIncome"])
    print(f"Correalation:{corr},p-value:{p_value}")
else:
    print("Commute Distance not found")

Commute Distance not found
