In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

### Load Dataset

In [2]:
# Relative path to the input CSV that is read into `df_raw`.
DF_PATH = "data/lifestyle_sustainability_data.csv"

In [3]:
# Read the CSV at DF_PATH into the untouched "raw" frame.
df_raw = pd.read_csv(DF_PATH)

# Structural overview on stdout: dimensions, column labels, per-column dtypes.
print(df_raw.shape, df_raw.columns, df_raw.dtypes, sep="\n")

# Preview the first five rows as the cell's rich output.
df_raw.head(n=5)

(499, 20)
Index(['ParticipantID', 'Age', 'Location', 'DietType', 'LocalFoodFrequency',
       'TransportationMode', 'EnergySource', 'HomeType', 'HomeSize',
       'ClothingFrequency', 'SustainableBrands', 'EnvironmentalAwareness',
       'CommunityInvolvement', 'MonthlyElectricityConsumption',
       'MonthlyWaterConsumption', 'Gender', 'UsingPlasticProducts',
       'DisposalMethods', 'PhysicalActivities', 'Rating'],
      dtype='object')
ParticipantID                     int64
Age                               int64
Location                         object
DietType                         object
LocalFoodFrequency               object
TransportationMode               object
EnergySource                     object
HomeType                         object
HomeSize                          int64
ClothingFrequency                object
SustainableBrands                  bool
EnvironmentalAwareness            int64
CommunityInvolvement             object
MonthlyElectricityConsumption     in

Unnamed: 0,ParticipantID,Age,Location,DietType,LocalFoodFrequency,TransportationMode,EnergySource,HomeType,HomeSize,ClothingFrequency,SustainableBrands,EnvironmentalAwareness,CommunityInvolvement,MonthlyElectricityConsumption,MonthlyWaterConsumption,Gender,UsingPlasticProducts,DisposalMethods,PhysicalActivities,Rating
0,1,35,Urban,Mostly Plant-Based,Often,Bike,Renewable,Apartment,800,Rarely,True,5,High,100,1500,Female,Rarely,Composting,High,5
1,2,28,Suburban,Balanced,Sometimes,Public Transit,Mixed,House,1500,Sometimes,True,4,Moderate,250,3000,Male,Sometimes,Recycling,Moderate,4
2,3,65,Rural,Mostly Animal-Based,Rarely,Car,Non-Renewable,House,2500,Often,False,2,Low,400,4500,Male,Often,Landfill,Low,1
3,4,42,Urban,Mostly Plant-Based,Often,Walk,Renewable,Apartment,950,Sometimes,True,4,Moderate,150,2000,Female,Rarely,Recycling,High,5
4,5,31,Suburban,Balanced,Sometimes,Public Transit,Mixed,House,1800,Often,True,3,Low,300,3500,Non-Binary,Sometimes,Combination,Moderate,3


In [4]:
# Subset of raw columns kept for the rest of the notebook; "Rating" is the
# column that is later separated from the features before scaling.
columns_of_interest = [
    'DietType',
    'TransportationMode',
    'EnvironmentalAwareness',
    'CommunityInvolvement',
    'MonthlyElectricityConsumption',
    'MonthlyWaterConsumption',
    'PhysicalActivities',
    'Rating',
]

In [5]:
# Working frame: only the columns of interest, copied so that later column
# assignments never touch df_raw.
df = df_raw[columns_of_interest].copy()
df

Unnamed: 0,DietType,TransportationMode,EnvironmentalAwareness,CommunityInvolvement,MonthlyElectricityConsumption,MonthlyWaterConsumption,PhysicalActivities,Rating
0,Mostly Plant-Based,Bike,5,High,100,1500,High,5
1,Balanced,Public Transit,4,Moderate,250,3000,Moderate,4
2,Mostly Animal-Based,Car,2,Low,400,4500,Low,1
3,Mostly Plant-Based,Walk,4,Moderate,150,2000,High,5
4,Balanced,Public Transit,3,Low,300,3500,Moderate,3
...,...,...,...,...,...,...,...,...
494,Mostly Plant-Based,Walk,4,High,150,2000,High,4
495,Balanced,Public Transit,2,Low,400,4500,Low,2
496,Mostly Plant-Based,Bike,5,High,280,3200,Moderate,5
497,Mostly Plant-Based,Walk,1,High,397,4076,Low,5


In [6]:
# Number of missing entries in each column of the working frame.
df.isnull().sum()

DietType                           0
TransportationMode                 0
EnvironmentalAwareness             0
CommunityInvolvement             118
MonthlyElectricityConsumption      0
MonthlyWaterConsumption            0
PhysicalActivities               108
Rating                             0
dtype: int64

#### Fill NA values

In [7]:
# Impute the two columns that contain missing entries with the middle
# category of their Low / Moderate / High scale.
# NOTE(review): if these gaps come from a literal "None" category in the CSV
# being parsed as NaN by read_csv's default NA handling, "Moderate" would be
# the wrong replacement — confirm against the raw file.
df = df.fillna({"CommunityInvolvement": "Moderate", "PhysicalActivities": "Moderate"})

# Re-count missing entries to confirm nothing is left to impute.
df.isna().sum()

DietType                         0
TransportationMode               0
EnvironmentalAwareness           0
CommunityInvolvement             0
MonthlyElectricityConsumption    0
MonthlyWaterConsumption          0
PhysicalActivities               0
Rating                           0
dtype: int64

### Variable Encoding

#### Categorical Ordinal Variables

In [8]:
# Columns whose categories have a natural order and are encoded as integers.
ordinal_variables = ["CommunityInvolvement", "PhysicalActivities"]

# Category label -> integer code, per column (higher code = higher level).
ordinal_variables_map = {
    "CommunityInvolvement": {"High": 2, "Moderate": 1, "Low": 0},
    "PhysicalActivities": {"High": 2, "Moderate": 1, "Low": 0}
}

for variable in ordinal_variables:
    encoded = df[variable].map(ordinal_variables_map[variable])

    # Series.map turns every value that is not a key of the dict into NaN
    # without warning. That covers unexpected labels and also a second run
    # of this cell, where the column already holds 0/1/2. Detect this
    # *before* overwriting the column so no data is silently destroyed.
    # Values that were already missing are left alone, as before.
    unmapped = df.loc[encoded.isna() & df[variable].notna(), variable].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Column {variable!r} contains values with no ordinal code: {list(unmapped)}. "
            "If they are already numeric codes, this cell has probably been run twice."
        )

    df[variable] = encoded

#### Categorical Nominal Variables

In [9]:
# Unordered categorical columns: each becomes one indicator column per
# category, named "<column>_<category>".
nominal_variables = ["DietType", "TransportationMode"]

df = pd.get_dummies(
    df,
    columns=nominal_variables,
    prefix=nominal_variables,
)

# get_dummies appends the indicator columns after Rating; remove Rating and
# re-add it so that the target is the last column again.
rating_column = df.pop("Rating")
df["Rating"] = rating_column

In [10]:
# Display the encoded frame: ordinal columns as integer codes, one indicator
# column per nominal category, and Rating as the last column.
df

Unnamed: 0,EnvironmentalAwareness,CommunityInvolvement,MonthlyElectricityConsumption,MonthlyWaterConsumption,PhysicalActivities,DietType_Balanced,DietType_Mostly Animal-Based,DietType_Mostly Plant-Based,TransportationMode_Bike,TransportationMode_Car,TransportationMode_Public Transit,TransportationMode_Walk,Rating
0,5,2,100,1500,2,False,False,True,True,False,False,False,5
1,4,1,250,3000,1,True,False,False,False,False,True,False,4
2,2,0,400,4500,0,False,True,False,False,True,False,False,1
3,4,1,150,2000,2,False,False,True,False,False,False,True,5
4,3,0,300,3500,1,True,False,False,False,False,True,False,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,4,2,150,2000,2,False,False,True,False,False,False,True,4
495,2,0,400,4500,0,True,False,False,False,False,True,False,2
496,5,2,280,3200,1,False,False,True,True,False,False,False,5
497,1,2,397,4076,0,False,False,True,False,False,False,True,5


### Standardization

In [11]:
# Scaler that is fitted on the feature columns (everything except Rating)
# in the next cell; the bare name below just displays the estimator.
standard_scaler = StandardScaler()
standard_scaler

In [12]:
# Feature matrix only: the Rating target is excluded so it is not rescaled.
# DataFrame.drop already returns a new frame, so df itself is left intact.
df_without_target = df.drop(columns="Rating")

# Fit the scaler on the features and transform them in one step; the result
# is a plain array whose shape is shown as the cell output.
data_standardized = standard_scaler.fit_transform(df_without_target)
data_standardized.shape

(499, 12)

In [13]:
# Feature column labels, in their original order, without the target.
df_standardized_columns = [column for column in df.columns if column != "Rating"]

# Wrap the scaled array back into a labelled frame.
df_standardized = pd.DataFrame(data_standardized, columns=df_standardized_columns)

# Re-attach the unscaled target as the last column. Values are passed as a
# plain list, so they are matched to rows by position rather than by index.
df_standardized["Rating"] = df["Rating"].to_list()
df_standardized

Unnamed: 0,EnvironmentalAwareness,CommunityInvolvement,MonthlyElectricityConsumption,MonthlyWaterConsumption,PhysicalActivities,DietType_Balanced,DietType_Mostly Animal-Based,DietType_Mostly Plant-Based,TransportationMode_Bike,TransportationMode_Car,TransportationMode_Public Transit,TransportationMode_Walk,Rating
0,1.445313,1.385178,-1.594895,-1.340155,1.347737,-0.680693,-0.636894,1.238142,1.786974,-0.624477,-0.571951,-0.553428,5
1,0.699490,-0.034132,-0.348649,-0.114045,-0.062160,1.469091,-0.636894,-0.807662,-0.559605,-0.624477,1.748402,-0.553428,4
2,-0.792157,-1.453442,0.897597,1.112064,-1.472057,-0.680693,1.570120,-0.807662,-0.559605,1.601339,-0.571951,-0.553428,1
3,0.699490,-0.034132,-1.179480,-0.931452,1.347737,-0.680693,-0.636894,1.238142,-0.559605,-0.624477,-0.571951,1.806919,5
4,-0.046334,-1.453442,0.066766,0.294658,-0.062160,1.469091,-0.636894,-0.807662,-0.559605,-0.624477,1.748402,-0.553428,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,0.699490,1.385178,-1.179480,-0.931452,1.347737,-0.680693,-0.636894,1.238142,-0.559605,-0.624477,-0.571951,1.806919,4
495,-0.792157,-1.453442,0.897597,1.112064,-1.472057,1.469091,-0.636894,-0.807662,-0.559605,-0.624477,1.748402,-0.553428,2
496,1.445313,1.385178,-0.099400,0.049436,-0.062160,-0.680693,-0.636894,1.238142,1.786974,-0.624477,-0.571951,-0.553428,5
497,-1.537981,1.385178,0.872672,0.765484,-1.472057,-0.680693,-0.636894,1.238142,-0.559605,-0.624477,-0.571951,1.806919,5


In [14]:
# Sanity check: number of missing entries per column after standardization.
df_standardized.isnull().sum()

EnvironmentalAwareness               0
CommunityInvolvement                 0
MonthlyElectricityConsumption        0
MonthlyWaterConsumption              0
PhysicalActivities                   0
DietType_Balanced                    0
DietType_Mostly Animal-Based         0
DietType_Mostly Plant-Based          0
TransportationMode_Bike              0
TransportationMode_Car               0
TransportationMode_Public Transit    0
TransportationMode_Walk              0
Rating                               0
dtype: int64

### Export Dataset

In [16]:
# Write the standardized features plus the unscaled Rating column to disk;
# index=False leaves the row index out of the file.
df_standardized.to_csv("data/lifestyle_sustainability_data_standardized.csv", index=False)