## Processing the data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
import sklearn
sys.path.append(os.path.join(os.path.abspath(".."), "code"))
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the raw clothing sustainability dataset (path is relative to the
# notebook's working directory) and display it.
df = pd.read_csv(filepath_or_buffer="data/clothing_sustainability.csv")
df

Unnamed: 0,Company,Product_Type,Production_Year,Greenhouse_Gas_Emissions,Pollutants_Emitted,Water_Consumption,Energy_Consumption,Waste_Generation,Sales_Revenue
0,Zara,Polyester,2020,5000,20,7500,1200,300,500000
1,Zara,Nylon,2019,3000,15,5000,900,200,450000
2,Zara,Recycled_Poly,2021,3500,18,6000,1100,250,480000
3,Zara,Cotton,2018,2000,10,4500,800,180,550000
4,Zara,Synthetic_Blend,2022,6000,25,8000,1500,350,600000
...,...,...,...,...,...,...,...,...,...
6951,Adidas,Tencel,2018,3218,21,7536,902,192,523173
6952,Urban Outfitters,Microfiber,2019,2622,11,5267,952,170,436096
6953,Urban Outfitters,Organic_Cotton,2019,5920,24,4321,982,177,489765
6954,Adidas,Microfiber,2020,2897,16,4125,1085,321,487443


In [3]:
# Count the null entries in each column
df.isnull().sum(axis="index")

Company                     0
Product_Type                0
Production_Year             0
Greenhouse_Gas_Emissions    0
Pollutants_Emitted          0
Water_Consumption           0
Energy_Consumption          0
Waste_Generation            0
Sales_Revenue               0
dtype: int64

In [4]:
# Environmental-impact columns that are scaled and averaged further down
features = [
    'Greenhouse_Gas_Emissions',
    'Pollutants_Emitted',
    'Water_Consumption',
    'Energy_Consumption',
    'Waste_Generation',
]
df_features = df[features]

## Scaling features using MinMaxScaler()

In [5]:
# Fit the scaler on the selected features, then wrap the scaled array it
# returns in a DataFrame that carries the original column names.
scaler = MinMaxScaler()
df_normalized = pd.DataFrame(
    data=scaler.fit_transform(df_features),
    columns=features,
)
df_normalized

Unnamed: 0,Greenhouse_Gas_Emissions,Pollutants_Emitted,Water_Consumption,Energy_Consumption,Waste_Generation
0,0.761905,0.6875,0.87500,0.62500,0.750
1,0.285714,0.3750,0.25000,0.25000,0.250
2,0.404762,0.5625,0.50000,0.50000,0.500
3,0.047619,0.0625,0.12500,0.12500,0.150
4,1.000000,1.0000,1.00000,1.00000,1.000
...,...,...,...,...,...
6951,0.337619,0.7500,0.88400,0.25250,0.210
6952,0.195714,0.1250,0.31675,0.31500,0.100
6953,0.980952,0.9375,0.08025,0.35250,0.135
6954,0.261190,0.4375,0.03125,0.48125,0.855


### Note: the rating is the mean of the scaled impact metrics, so a higher rating means a less sustainable product

In [6]:
# Average only the scaled feature columns. Taking the mean over the whole
# frame would fold any derived column already present (the rating itself, or
# the string category column) into the calculation when this cell is re-run.
df_normalized['sustainability_rating'] = df_normalized[features].mean(axis=1)
df_normalized['sustainability_rating']

0       0.739881
1       0.282143
2       0.493452
3       0.102024
4       1.000000
          ...   
6951    0.486824
6952    0.210493
6953    0.497240
6954    0.413238
6955    0.665274
Name: sustainability_rating, Length: 6956, dtype: float64

In [7]:
# Translate a numeric sustainability rating into a category label
def classify_sustainability(score):
    """Return the category label for a sustainability rating.

    Ratings up to and including 0.25 are 'Highly Sustainable', ratings
    above 0.25 and below 0.5 are 'Moderately Sustainable', and every other
    value is 'Not Sustainable'.
    """
    # Bands are checked from the lowest upward, so each guard only needs
    # its upper bound.
    if score <= 0.25:
        return 'Highly Sustainable'
    if score < 0.5:
        return 'Moderately Sustainable'
    return 'Not Sustainable'

# Label every row with the category that corresponds to its rating
df_normalized['sustainability_category'] = df_normalized['sustainability_rating'].map(classify_sustainability)

In [8]:
df_normalized

Unnamed: 0,Greenhouse_Gas_Emissions,Pollutants_Emitted,Water_Consumption,Energy_Consumption,Waste_Generation,sustainability_rating,sustainability_category
0,0.761905,0.6875,0.87500,0.62500,0.750,0.739881,Not Sustainable
1,0.285714,0.3750,0.25000,0.25000,0.250,0.282143,Moderately Sustainable
2,0.404762,0.5625,0.50000,0.50000,0.500,0.493452,Moderately Sustainable
3,0.047619,0.0625,0.12500,0.12500,0.150,0.102024,Highly Sustainable
4,1.000000,1.0000,1.00000,1.00000,1.000,1.000000,Not Sustainable
...,...,...,...,...,...,...,...
6951,0.337619,0.7500,0.88400,0.25250,0.210,0.486824,Moderately Sustainable
6952,0.195714,0.1250,0.31675,0.31500,0.100,0.210493,Highly Sustainable
6953,0.980952,0.9375,0.08025,0.35250,0.135,0.497240,Moderately Sustainable
6954,0.261190,0.4375,0.03125,0.48125,0.855,0.413238,Moderately Sustainable


In [13]:
# Put the identifying columns back next to the scaled metrics; both frames
# get a fresh positional index so the rows are joined side by side.
dfpt = df[['Company', 'Product_Type']]
combined_df = pd.concat(
    [
        dfpt.reset_index(drop=True),
        df_normalized.reset_index(drop=True),
    ],
    axis="columns",
)
combined_df

Unnamed: 0,Company,Product_Type,Greenhouse_Gas_Emissions,Pollutants_Emitted,Water_Consumption,Energy_Consumption,Waste_Generation,sustainability_rating,sustainability_category
0,Zara,Polyester,0.761905,0.6875,0.87500,0.62500,0.750,0.739881,Not Sustainable
1,Zara,Nylon,0.285714,0.3750,0.25000,0.25000,0.250,0.282143,Moderately Sustainable
2,Zara,Recycled_Poly,0.404762,0.5625,0.50000,0.50000,0.500,0.493452,Moderately Sustainable
3,Zara,Cotton,0.047619,0.0625,0.12500,0.12500,0.150,0.102024,Highly Sustainable
4,Zara,Synthetic_Blend,1.000000,1.0000,1.00000,1.00000,1.000,1.000000,Not Sustainable
...,...,...,...,...,...,...,...,...,...
6951,Adidas,Tencel,0.337619,0.7500,0.88400,0.25250,0.210,0.486824,Moderately Sustainable
6952,Urban Outfitters,Microfiber,0.195714,0.1250,0.31675,0.31500,0.100,0.210493,Highly Sustainable
6953,Urban Outfitters,Organic_Cotton,0.980952,0.9375,0.08025,0.35250,0.135,0.497240,Moderately Sustainable
6954,Adidas,Microfiber,0.261190,0.4375,0.03125,0.48125,0.855,0.413238,Moderately Sustainable


In [14]:
# Write the combined frame to CSV, leaving out the index column
combined_df.to_csv(path_or_buf='processed_sustainability_ratings.csv', index=False)