# Preprocessing   

import statements

In [62]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

loading the data

In [63]:
# Placeholder for loading data from CSV (uncomment the actual loading line in use)
df = pd.read_csv('shopping_trends_2.0.csv')

df.head()

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,1,55,Male,Blouse,Clothing,53,L,Gray,Winter,3.1,Yes,Express,Yes,Yes,14,Venmo,Fortnightly
1,2,19,Male,Sweater,Clothing,64,L,Maroon,Winter,3.1,Yes,Express,Yes,Yes,2,Cash,Fortnightly
2,3,50,Male,Jeans,Clothing,73,S,Maroon,Spring,3.1,Yes,Free Shipping,Yes,Yes,23,Credit Card,Weekly
3,4,21,Male,Sandals,Footwear,90,M,Maroon,Spring,3.5,Yes,Next Day Air,Yes,Yes,49,Venmo,Weekly
4,5,45,Male,Blouse,Clothing,49,M,Turquoise,Spring,2.7,Yes,Free Shipping,Yes,Yes,31,PayPal,Annually


Cleaning Data

In [64]:

# Automatically detect column types
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
text_columns = df.select_dtypes(include=['object']).columns.tolist()

# Remove rows with any null values and duplicates
df_clean = df.dropna()
df_clean = df.dropna().drop_duplicates()

df_clean.head()

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,1,55,Male,Blouse,Clothing,53,L,Gray,Winter,3.1,Yes,Express,Yes,Yes,14,Venmo,Fortnightly
1,2,19,Male,Sweater,Clothing,64,L,Maroon,Winter,3.1,Yes,Express,Yes,Yes,2,Cash,Fortnightly
2,3,50,Male,Jeans,Clothing,73,S,Maroon,Spring,3.1,Yes,Free Shipping,Yes,Yes,23,Credit Card,Weekly
3,4,21,Male,Sandals,Footwear,90,M,Maroon,Spring,3.5,Yes,Next Day Air,Yes,Yes,49,Venmo,Weekly
4,5,45,Male,Blouse,Clothing,49,M,Turquoise,Spring,2.7,Yes,Free Shipping,Yes,Yes,31,PayPal,Annually


Normalizing & One Hot Encoding

In [65]:
# Normalize numerical features
scaler = MinMaxScaler()
df_numerical_scaled = pd.DataFrame(scaler.fit_transform(df_clean[numerical_columns]), columns=numerical_columns, index=df_clean.index)

# Initialize OneHotEncoder
onehot = OneHotEncoder(sparse_output=False)

df_onehot_encoded = pd.DataFrame(index=df_clean.index)

for column in text_columns:
    # Reshape the data because the function expects a 2D array and our data is in 1D (series)
    reshaped_data = df_clean[column].fillna('missing').values.reshape(-1, 1)
    # Fit and transform the data to one-hot encoding
    onehot_matrix = onehot.fit_transform(reshaped_data)
    # Create a DataFrame from the one-hot encoded matrix with column names derived from the encoder
    onehot_df = pd.DataFrame(onehot_matrix, columns=[f"{column}_{feat}" for feat in onehot.get_feature_names_out()], index=df_clean.index)
    # Concatenate the new DataFrame to the existing one
    df_onehot_encoded = pd.concat([df_onehot_encoded, onehot_df], axis=1)


df_processed = pd.concat([df_clean.drop(numerical_columns + text_columns, axis=1), df_numerical_scaled, df_onehot_encoded], axis=1)

df_processed

Unnamed: 0,Customer ID,Age,Purchase Amount (USD),Review Rating,Previous Purchases,Gender_x0_Female,Gender_x0_Male,Item Purchased_x0_Backpack,Item Purchased_x0_Belt,Item Purchased_x0_Blouse,...,Payment Method_x0_Debit Card,Payment Method_x0_PayPal,Payment Method_x0_Venmo,Frequency of Purchases_x0_Annually,Frequency of Purchases_x0_Bi-Weekly,Frequency of Purchases_x0_Every 3 Months,Frequency of Purchases_x0_Fortnightly,Frequency of Purchases_x0_Monthly,Frequency of Purchases_x0_Quarterly,Frequency of Purchases_x0_Weekly
0,0.000000,0.711538,0.4125,0.24,0.265306,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.000256,0.019231,0.5500,0.24,0.020408,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.000513,0.615385,0.6625,0.24,0.448980,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.000769,0.057692,0.8750,0.40,0.979592,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.001026,0.519231,0.3625,0.08,0.612245,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3895,0.998974,0.423077,0.1000,0.68,0.632653,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3896,0.999231,0.653846,0.3625,0.80,0.816327,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3897,0.999487,0.538462,0.1625,0.16,0.469388,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3898,0.999744,0.500000,0.7125,0.52,0.469388,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [66]:
df_processed.describe

<bound method NDFrame.describe of       Customer ID       Age  Purchase Amount (USD)  Review Rating  \
0        0.000000  0.711538                 0.4125           0.24   
1        0.000256  0.019231                 0.5500           0.24   
2        0.000513  0.615385                 0.6625           0.24   
3        0.000769  0.057692                 0.8750           0.40   
4        0.001026  0.519231                 0.3625           0.08   
...           ...       ...                    ...            ...   
3895     0.998974  0.423077                 0.1000           0.68   
3896     0.999231  0.653846                 0.3625           0.80   
3897     0.999487  0.538462                 0.1625           0.16   
3898     0.999744  0.500000                 0.7125           0.52   
3899     1.000000  0.653846                 0.7625           0.24   

      Previous Purchases  Gender_x0_Female  Gender_x0_Male  \
0               0.265306               0.0             1.0   
1            

Rounding

In [67]:
df_processed = df_processed.round(4)
df_processed.to_csv('test.csv', index = False)

In [68]:

print("the number of dimensions in the dataset:", (len(df_processed.columns[:-1])))

the number of dimensions in the dataset: 93


PCA

In [69]:
from sklearn.decomposition import PCA

# Choose the number of components or the variance ratio you want to keep
n_components = 0.85   # For example, keep 95% of variance

# Initialize PCA
pca = PCA(n_components=n_components)

# Apply PCA to the DataFrame
# Note: Make sure df_processed_rounded does not include non-feature columns like IDs or labels
df_pca_transformed = pca.fit_transform(df_processed)

# Create a new DataFrame for the PCA-transformed data
df_pca = pd.DataFrame(df_pca_transformed, columns=[f"PCA_Component_{i}" for i in range(df_pca_transformed.shape[1])])

# Optionally, round the PCA components to 4 decimal places if needed
df_pca_rounded = df_pca.round(4)

df_pca_rounded


Unnamed: 0,PCA_Component_0,PCA_Component_1,PCA_Component_2,PCA_Component_3,PCA_Component_4,PCA_Component_5,PCA_Component_6,PCA_Component_7,PCA_Component_8,PCA_Component_9,...,PCA_Component_33,PCA_Component_34,PCA_Component_35,PCA_Component_36,PCA_Component_37,PCA_Component_38,PCA_Component_39,PCA_Component_40,PCA_Component_41,PCA_Component_42
0,1.6172,-0.7590,0.6946,-0.2147,-0.3729,-0.5203,-0.5710,0.0411,-0.2983,0.2417,...,-0.2749,0.2323,0.1971,-0.2648,-0.2857,-0.4021,-0.3300,0.0417,0.0541,0.2925
1,1.6181,-0.7961,0.6944,-0.1368,-0.3473,-0.5018,-0.6021,0.0037,-0.1079,0.3198,...,0.1918,0.2863,-0.1296,-0.3302,0.2564,-0.3489,0.5334,-0.0585,-0.1219,0.1764
2,1.6128,-0.7103,0.1853,-0.1924,0.8036,0.2746,-0.2849,-0.3535,0.8461,0.4064,...,0.0097,0.1891,0.0525,-0.1721,0.2048,-0.2238,0.2146,-0.0455,0.0211,-0.1817
3,1.6228,0.2048,-0.6772,-0.4541,0.8627,-0.0167,-0.0968,0.9116,-0.0227,-0.5326,...,-0.0464,0.2410,-0.1129,-0.0728,0.0547,-0.3124,0.1971,-0.1075,0.2224,-0.2751
4,1.5873,-0.6531,-0.7695,-0.2595,0.7231,0.0754,-0.2661,-0.1996,-0.1554,1.0883,...,-0.3935,0.0074,-0.0199,-0.2468,-0.3664,-0.2380,-0.1337,0.2304,0.1720,0.1199
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3895,-1.3184,-0.6695,0.6207,-0.6657,-0.0880,-0.3766,0.6075,-0.5169,-0.3414,-0.2237,...,-0.0514,-0.0985,-0.2322,-0.0542,-0.0149,-0.0859,0.0616,0.0902,-0.0560,-0.0999
3896,-1.3077,0.7166,0.7171,-0.4092,0.8114,0.0119,-0.3245,-0.5769,-0.4611,-0.5277,...,0.1235,-0.0677,-0.0238,0.1389,-0.0487,-0.0199,0.1028,-0.0168,-0.0670,0.0494
3897,-1.3079,0.7442,0.7434,-0.4898,0.6767,0.1119,-0.3094,-0.4836,-0.6382,-0.3366,...,-0.1536,-0.1017,-0.4608,-0.0734,0.1119,-0.0579,-0.1559,-0.4453,-0.4197,-0.4275
3898,-1.3046,0.2723,0.3092,-0.7238,0.1901,-0.3604,0.8429,0.5127,1.0663,0.2048,...,0.1056,0.0674,-0.0158,-0.1412,-0.0280,-0.1975,0.0397,-0.0479,-0.0633,-0.1003


Saving Preprocessed Data

In [70]:
df_pca_rounded.to_csv('final_preprocessed.csv', index = False)