# Select Dataset
* Wholesale customers: https://archive.ics.uci.edu/dataset/292/wholesale+customers
* The data set refers to clients of a wholesale distributor. It includes the annual spending in monetary units (m.u.) on diverse product categories.

In [2]:
from ucimlrepo import fetch_ucirepo 
  
# Download the UCI "Wholesale customers" dataset (repository id 292)
wholesale_customers = fetch_ucirepo(id=292)

# Split the payload into the feature frame and the target frame
# (both are pandas DataFrames)
dataset = wholesale_customers.data
X, y = dataset.features, dataset.targets

# Show the dataset-level metadata first, then the variable information
for section in (wholesale_customers.metadata, wholesale_customers.variables):
    print(section)

{'uci_id': 292, 'name': 'Wholesale customers', 'repository_url': 'https://archive.ics.uci.edu/dataset/292/wholesale+customers', 'data_url': 'https://archive.ics.uci.edu/static/public/292/data.csv', 'abstract': 'The data set refers to clients of a wholesale distributor. It includes the annual spending in monetary units (m.u.) on diverse product categories', 'area': 'Business', 'tasks': ['Classification', 'Clustering'], 'characteristics': ['Multivariate'], 'num_instances': 440, 'num_features': 7, 'feature_types': ['Integer'], 'demographics': [], 'target_col': ['Region'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2013, 'last_updated': 'Mon Feb 05 2024', 'dataset_doi': '10.24432/C5030X', 'creators': ['Margarida Cardoso'], 'intro_paper': None, 'additional_info': {'summary': None, 'purpose': None, 'funded_by': None, 'instances_represent': None, 'recommended_data_splits': None, 'sensitive_data': None, 'preprocessing_description':

In [6]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Build one frame: the feature columns (X) followed by the target column (y)
df = pd.concat(objs=[X, y], axis="columns")
print(df)

     Channel  Fresh   Milk  Grocery  Frozen  Detergents_Paper  Delicassen  \
0          2  12669   9656     7561     214              2674        1338   
1          2   7057   9810     9568    1762              3293        1776   
2          2   6353   8808     7684    2405              3516        7844   
3          1  13265   1196     4221    6404               507        1788   
4          2  22615   5410     7198    3915              1777        5185   
..       ...    ...    ...      ...     ...               ...         ...   
435        1  29703  12051    16027   13135               182        2204   
436        1  39228   1431      764    4510                93        2346   
437        2  14531  15488    30243     437             14841        1867   
438        1  10290   1981     2232    1038               168        2125   
439        1   2787   1698     2510      65               477          52   

     Region  
0         3  
1         3  
2         3  
3         3  
4    

In [None]:
##

In [39]:
# Perform one-hot encoding
from sklearn.preprocessing import OneHotEncoder

# Columns that hold category codes rather than spending amounts
categorical_columns = ['Channel', 'Region']

# Expand each categorical column into one indicator column per level
df_encoded = pd.get_dummies(data=df, columns=categorical_columns)

# Preview the first rows of the encoded frame
encoded_preview = df_encoded.head()
print(encoded_preview)

   Fresh  Milk  Grocery  Frozen  Detergents_Paper  Delicassen  Channel_1  \
0  12669  9656     7561     214              2674        1338      False   
1   7057  9810     9568    1762              3293        1776      False   
2   6353  8808     7684    2405              3516        7844      False   
3  13265  1196     4221    6404               507        1788       True   
4  22615  5410     7198    3915              1777        5185      False   

   Channel_2  Region_1  Region_2  Region_3  
0       True     False     False      True  
1       True     False     False      True  
2       True     False     False      True  
3      False     False     False      True  
4       True     False     False      True  


# Exploratory Data Analysis

## Descriptive Statistics

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column of the
# raw frame. Channel and Region are category codes (they are the columns one-hot
# encoded earlier), so their mean/std are not meaningful as quantities.
df.describe()

Unnamed: 0,Channel,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Region
count,440.0,440.0,440.0,440.0,440.0,440.0,440.0,440.0
mean,1.322727,12000.297727,5796.265909,7951.277273,3071.931818,2881.493182,1524.870455,2.543182
std,0.468052,12647.328865,7380.377175,9503.162829,4854.673333,4767.854448,2820.105937,0.774272
min,1.0,3.0,55.0,3.0,25.0,3.0,3.0,1.0
25%,1.0,3127.75,1533.0,2153.0,742.25,256.75,408.25,2.0
50%,1.0,8504.0,3627.0,4755.5,1526.0,816.5,965.5,3.0
75%,2.0,16933.75,7190.25,10655.75,3554.25,3922.0,1820.25,3.0
max,2.0,112151.0,73498.0,92780.0,60869.0,40827.0,47943.0,3.0


In [22]:
from sklearn.preprocessing import StandardScaler

# Standardize the feature columns; 'Region' is the target and is left unscaled.
# NOTE(review): 'Channel' is a category code (it is one-hot encoded in an earlier
# cell) but is standardized here like a numeric feature — confirm this is intended.
feature_df = df.drop(columns=['Region'])  # Exclude the target variable (computed once)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_df)

# fit_transform returns a plain array, so restore both the column names and the
# original row index. Keeping the index guarantees the concat below lines up
# row-for-row with df['Region'] even if df does not have the default 0..n-1 index
# (pd.concat with axis=1 aligns on index labels).
scaled_df = pd.DataFrame(
    scaled_features,
    columns=feature_df.columns,
    index=feature_df.index,
)

# Concatenate scaled features and target
final_df = pd.concat([scaled_df, df['Region']], axis=1)

print(final_df)

      Channel     Fresh      Milk   Grocery    Frozen  Detergents_Paper  \
0    1.448652  0.052933  0.523568 -0.041115 -0.589367         -0.043569   
1    1.448652 -0.391302  0.544458  0.170318 -0.270136          0.086407   
2    1.448652 -0.447029  0.408538 -0.028157 -0.137536          0.133232   
3   -0.690297  0.100111 -0.624020 -0.392977  0.687144         -0.498588   
4    1.448652  0.840239 -0.052396 -0.079356  0.173859         -0.231918   
..        ...       ...       ...       ...       ...               ...   
435 -0.690297  1.401312  0.848446  0.850760  2.075222         -0.566831   
436 -0.690297  2.155293 -0.592142 -0.757165  0.296561         -0.585519   
437  1.448652  0.200326  1.314671  2.348386 -0.543380          2.511218   
438 -0.690297 -0.135384 -0.517536 -0.602514 -0.419441         -0.569770   
439 -0.690297 -0.729307 -0.555924 -0.573227 -0.620094         -0.504888   

     Delicassen  Region  
0     -0.066339       3  
1      0.089151       3  
2      2.243293      

In [24]:
# Sanity check on the standardized frame: every scaled feature column should have a
# mean of ~0 and a std of ~1, while Region keeps its original statistics.
# NOTE(review): std shows as ~1.001 rather than exactly 1 — presumably because
# describe() reports the sample std (ddof=1) while StandardScaler scales by the
# population std (ddof=0); confirm against the library docs.
final_df.describe()

Unnamed: 0,Channel,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Region
count,440.0,440.0,440.0,440.0,440.0,440.0,440.0,440.0
mean,1.61487e-17,-3.4315980000000005e-17,0.0,-4.037175e-17,3.633457e-17,2.422305e-17,-8.074349e-18,2.543182
std,1.001138,1.001138,1.001138,1.001138,1.001138,1.001138,1.001138,0.774272
min,-0.6902971,-0.9496831,-0.778795,-0.8373344,-0.628343,-0.6044165,-0.5402644,1.0
25%,-0.6902971,-0.7023339,-0.578306,-0.6108364,-0.4804306,-0.5511349,-0.3964005,2.0
50%,-0.6902971,-0.2767602,-0.294258,-0.3366684,-0.3188045,-0.4336004,-0.1985766,3.0
75%,1.448652,0.3905226,0.189092,0.2849105,0.09946441,0.2184822,0.1048598,3.0
max,1.448652,7.927738,9.18365,8.936528,11.919,7.967672,16.47845,3.0


In [None]:
# Scatter plot of two spending categories, colored by the Region target.
# The previous column names ('Gene One', 'Gene Two', 'Cancer Present') were carried
# over from a different dataset and do not exist in df, so the call failed.
# NOTE(review): Fresh/Milk/Region are a reasonable default for this dataset — swap in
# whichever pair of spending columns (or hue='Channel') the analysis needs.
ax = sns.scatterplot(x='Fresh', y='Milk', hue='Region', data=df, alpha=0.7)
ax.set_title('Annual spending: Fresh vs. Milk by Region')
plt.show()