#  **AI-Based Predictive Modeling of Individual Carbon Footprints Using Behavioral and Energy Consumption Data**

### 1. Data cleaning and preprocessing

#### Dataset : https://www.kaggle.com/datasets/dumanmesut/individual-carbon-footprint-calculation?resource=download

In [2]:
import pandas as pd
import numpy as np
import ast

In [4]:
data=pd.read_csv("Carbon Emission.csv")

In [5]:
data.columns


Index(['Body Type', 'Sex', 'Diet', 'How Often Shower', 'Heating Energy Source',
       'Transport', 'Vehicle Type', 'Social Activity', 'Monthly Grocery Bill',
       'Frequency of Traveling by Air', 'Vehicle Monthly Distance Km',
       'Waste Bag Size', 'Waste Bag Weekly Count', 'How Long TV PC Daily Hour',
       'How Many New Clothes Monthly', 'How Long Internet Daily Hour',
       'Energy efficiency', 'Recycling', 'Cooking_With', 'CarbonEmission'],
      dtype='object')

In [6]:
data.head(5)

Unnamed: 0,Body Type,Sex,Diet,How Often Shower,Heating Energy Source,Transport,Vehicle Type,Social Activity,Monthly Grocery Bill,Frequency of Traveling by Air,Vehicle Monthly Distance Km,Waste Bag Size,Waste Bag Weekly Count,How Long TV PC Daily Hour,How Many New Clothes Monthly,How Long Internet Daily Hour,Energy efficiency,Recycling,Cooking_With,CarbonEmission
0,overweight,female,pescatarian,daily,coal,public,,often,230,frequently,210,large,4,7,26,1,No,['Metal'],"['Stove', 'Oven']",2238
1,obese,female,vegetarian,less frequently,natural gas,walk/bicycle,,often,114,rarely,9,extra large,3,9,38,5,No,['Metal'],"['Stove', 'Microwave']",1892
2,overweight,male,omnivore,more frequently,wood,private,petrol,never,138,never,2472,small,1,14,47,6,Sometimes,['Metal'],"['Oven', 'Microwave']",2595
3,overweight,male,omnivore,twice a day,wood,walk/bicycle,,sometimes,157,rarely,74,medium,3,20,5,7,Sometimes,"['Paper', 'Plastic', 'Glass', 'Metal']","['Microwave', 'Grill', 'Airfryer']",1074
4,obese,female,vegetarian,daily,coal,private,diesel,often,266,very frequently,8457,large,1,3,5,6,Yes,['Paper'],['Oven'],4743


In [7]:
data.tail()

Unnamed: 0,Body Type,Sex,Diet,How Often Shower,Heating Energy Source,Transport,Vehicle Type,Social Activity,Monthly Grocery Bill,Frequency of Traveling by Air,Vehicle Monthly Distance Km,Waste Bag Size,Waste Bag Weekly Count,How Long TV PC Daily Hour,How Many New Clothes Monthly,How Long Internet Daily Hour,Energy efficiency,Recycling,Cooking_With,CarbonEmission
9995,obese,male,omnivore,twice a day,coal,private,hybrid,sometimes,230,never,268,medium,5,12,27,9,Yes,[],['Microwave'],2408
9996,normal,female,vegan,twice a day,coal,private,lpg,never,234,frequently,5316,extra large,3,14,8,24,Sometimes,"['Paper', 'Plastic']","['Stove', 'Microwave']",3084
9997,overweight,female,vegetarian,daily,electricity,walk/bicycle,,sometimes,298,very frequently,96,extra large,5,11,5,24,Yes,"['Paper', 'Plastic', 'Metal']","['Microwave', 'Grill', 'Airfryer']",2377
9998,underweight,male,vegan,more frequently,coal,private,petrol,often,179,rarely,8688,medium,5,19,14,5,Sometimes,"['Paper', 'Metal']","['Stove', 'Microwave', 'Grill', 'Airfryer']",4574
9999,obese,male,pescatarian,twice a day,wood,private,electric,sometimes,115,never,9952,small,4,11,6,0,Sometimes,"['Plastic', 'Glass', 'Metal']","['Oven', 'Grill', 'Airfryer']",826


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Body Type                      10000 non-null  object
 1   Sex                            10000 non-null  object
 2   Diet                           10000 non-null  object
 3   How Often Shower               10000 non-null  object
 4   Heating Energy Source          10000 non-null  object
 5   Transport                      10000 non-null  object
 6   Vehicle Type                   3279 non-null   object
 7   Social Activity                10000 non-null  object
 8   Monthly Grocery Bill           10000 non-null  int64 
 9   Frequency of Traveling by Air  10000 non-null  object
 10  Vehicle Monthly Distance Km    10000 non-null  int64 
 11  Waste Bag Size                 10000 non-null  object
 12  Waste Bag Weekly Count         10000 non-null  int64 
 13  Ho

In [9]:
data.describe()

Unnamed: 0,Monthly Grocery Bill,Vehicle Monthly Distance Km,Waste Bag Weekly Count,How Long TV PC Daily Hour,How Many New Clothes Monthly,How Long Internet Daily Hour,CarbonEmission
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,173.8752,2031.4859,4.0246,12.1392,25.109,11.8891,2269.1473
std,72.234018,2769.715597,1.990375,7.106369,14.698725,7.277218,1017.675247
min,50.0,0.0,1.0,0.0,0.0,0.0,306.0
25%,111.0,69.0,2.0,6.0,13.0,6.0,1538.0
50%,173.0,823.0,4.0,12.0,25.0,12.0,2080.0
75%,237.0,2516.75,6.0,18.0,38.0,18.0,2768.0
max,299.0,9999.0,7.0,24.0,50.0,24.0,8377.0


In [10]:
print(data.isna().sum())

Body Type                           0
Sex                                 0
Diet                                0
How Often Shower                    0
Heating Energy Source               0
Transport                           0
Vehicle Type                     6721
Social Activity                     0
Monthly Grocery Bill                0
Frequency of Traveling by Air       0
Vehicle Monthly Distance Km         0
Waste Bag Size                      0
Waste Bag Weekly Count              0
How Long TV PC Daily Hour           0
How Many New Clothes Monthly        0
How Long Internet Daily Hour        0
Energy efficiency                   0
Recycling                           0
Cooking_With                        0
CarbonEmission                      0
dtype: int64


In [11]:
# Treat empty or whitespace-only strings as missing values (NaN)
blank_pattern = r'^\s*$'
data = data.replace(to_replace=blank_pattern, value=np.nan, regex=True)

In [12]:
data['Vehicle Type'].unique()

array([nan, 'petrol', 'diesel', 'hybrid', 'lpg', 'electric'], dtype=object)

In [13]:
# 'Vehicle Type' is the only column with missing entries; replace them with the
# literal string 'None' so that "no vehicle type" becomes an explicit category.
vehicle_type = data['Vehicle Type']
data['Vehicle Type'] = vehicle_type.fillna(value='None')

In [14]:
# Check every column again for missing values

print("\nMissing values per column:")
print(data.isnull().sum())


Missing values per column:
Body Type                        0
Sex                              0
Diet                             0
How Often Shower                 0
Heating Energy Source            0
Transport                        0
Vehicle Type                     0
Social Activity                  0
Monthly Grocery Bill             0
Frequency of Traveling by Air    0
Vehicle Monthly Distance Km      0
Waste Bag Size                   0
Waste Bag Weekly Count           0
How Long TV PC Daily Hour        0
How Many New Clothes Monthly     0
How Long Internet Daily Hour     0
Energy efficiency                0
Recycling                        0
Cooking_With                     0
CarbonEmission                   0
dtype: int64


#### Clean Categorical Columns

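To avoid inconsistencies caused by letter case and stray whitespace: 'Male', ' male', 'MALE' → all become 'male'.
'Petrol', 'petrol ', ' PETROL' → all become 'petrol'. (Abbreviations such as 'M' are not expanded; the cleaning only trims and lower-cases.)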

In [15]:
# Clean categorical columns

def clean_text(x):
    """Normalise a text value by trimming surrounding whitespace and lower-casing it.

    Values that are not strings (e.g. NaN or numbers) are returned unchanged.
    """
    if not isinstance(x, str):
        return x
    trimmed = x.strip()
    return trimmed.lower()

# Text-valued columns that are normalised with clean_text and later one-hot encoded
categorical_cols = [
    "Body Type",
    "Sex",
    "Diet",
    "How Often Shower",
    "Heating Energy Source",
    "Transport",
    "Vehicle Type",
    "Social Activity",
    "Waste Bag Size",
    "Frequency of Traveling by Air",
    "Energy efficiency",
]

# Normalise every categorical column element by element
for column in categorical_cols:
    cleaned = data[column].map(clean_text)
    data[column] = cleaned

In [16]:
for column in categorical_cols:
  print(data[column].unique())

['overweight' 'obese' 'underweight' 'normal']
['female' 'male']
['pescatarian' 'vegetarian' 'omnivore' 'vegan']
['daily' 'less frequently' 'more frequently' 'twice a day']
['coal' 'natural gas' 'wood' 'electricity']
['public' 'walk/bicycle' 'private']
['none' 'petrol' 'diesel' 'hybrid' 'lpg' 'electric']
['often' 'never' 'sometimes']
['large' 'extra large' 'small' 'medium']
['frequently' 'rarely' 'never' 'very frequently']
['no' 'sometimes' 'yes']


#### The columns ‘Recycling’ and ‘Cooking_With’ contained values stored as string representations of lists

The cells below parse each string into a real Python list and expand it into one 0/1 indicator column per item (e.g. `Recycling_metal`, `Cooking_With_oven`), making the data suitable for machine learning algorithms.

In [17]:
list_columns_to_process = ['Recycling', 'Cooking_With']

In [18]:
for column in list_columns_to_process:
  print(data[column].unique() )

["['Metal']" "['Paper', 'Plastic', 'Glass', 'Metal']" "['Paper']"
 "['Paper', 'Glass', 'Metal']" '[]' "['Paper', 'Plastic', 'Glass']"
 "['Glass']" "['Paper', 'Plastic']" "['Plastic']"
 "['Plastic', 'Glass', 'Metal']" "['Paper', 'Plastic', 'Metal']"
 "['Paper', 'Glass']" "['Paper', 'Metal']" "['Glass', 'Metal']"
 "['Plastic', 'Glass']" "['Plastic', 'Metal']"]
["['Stove', 'Oven']" "['Stove', 'Microwave']" "['Oven', 'Microwave']"
 "['Microwave', 'Grill', 'Airfryer']" "['Oven']"
 "['Stove', 'Oven', 'Microwave']" "['Grill', 'Airfryer']" "['Stove']"
 "['Stove', 'Oven', 'Microwave', 'Grill', 'Airfryer']"
 "['Oven', 'Microwave', 'Grill', 'Airfryer']"
 "['Stove', 'Grill', 'Airfryer']" "['Oven', 'Grill', 'Airfryer']"
 "['Microwave']" "['Stove', 'Oven', 'Grill', 'Airfryer']"
 "['Stove', 'Microwave', 'Grill', 'Airfryer']" '[]']


In [19]:
# Convert String list columns

def parse_list(x):
    """Convert the string representation of a list into a real Python list.

    Parameters
    ----------
    x : object
        Usually a string such as "['Paper', 'Metal']". Non-string values are
        accepted as well.

    Returns
    -------
    list
        The parsed list when ``x`` is (or evaluates to) a list; otherwise an
        empty list. Malformed strings and non-list literals (dicts, tuples,
        numbers, NaN, ...) all fall back to ``[]``.
    """
    if not isinstance(x, str):
        return x if isinstance(x, list) else []
    try:
        v = ast.literal_eval(x)
    # Only the errors literal_eval is documented to raise on malformed input are
    # swallowed; a bare `except:` would also hide KeyboardInterrupt/SystemExit.
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return []
    return v if isinstance(v, list) else []

In [20]:
# Multi-hot encode the list-valued columns.
# Only process columns that are still present in the DataFrame: the originals are
# dropped at the end of this cell, so without this filter re-running the cell
# would raise a KeyError.
present_list_columns = [c for c in list_columns_to_process if c in data.columns]

# Turn the string representation stored in each cell into a real Python list
for c in present_list_columns:
    data[c] = data[c].apply(parse_list)

# Create one 0/1 indicator column per distinct item, named "<column>_<item in lower case>"
for c in present_list_columns:
    items = sorted({item for sub in data[c] for item in sub})
    for it in items:
        # `it=it` binds the current item as a default argument of the lambda
        data[f"{c}_{it.lower()}"] = data[c].apply(lambda xs, it=it: 1 if it in xs else 0)

# The raw list columns are no longer needed once they have been encoded
data = data.drop(columns=present_list_columns)

In [21]:
# Numeric feature columns (the target 'CarbonEmission' is deliberately not listed here)
num_cols = [
    "Monthly Grocery Bill",
    "Vehicle Monthly Distance Km",
    "How Many New Clothes Monthly",
    "How Long TV PC Daily Hour",
    "How Long Internet Daily Hour",
    "Waste Bag Weekly Count",
]

In [22]:
for c in num_cols:
    # errors="coerce": a value that cannot be converted to a number does not
    # raise an error; it is set to NaN instead
    numeric_values = pd.to_numeric(data[c], errors="coerce")
    # Impute the resulting (or pre-existing) NaNs with the column median
    data[c] = numeric_values.fillna(numeric_values.median())

In [23]:
# Outlier handling: cap every value to the range [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
# NOTE(review): the target 'CarbonEmission' is capped as well, which changes the
# labels a model is trained and evaluated on - confirm this is intended.
for c in num_cols + ["CarbonEmission"]:
    q1 = data[c].quantile(0.25)
    q3 = data[c].quantile(0.75)
    iqr = q3 - q1
    low = q1 - 1.5*iqr
    high = q3 + 1.5*iqr
    data[c] = data[c].clip(lower=low, upper=high)

In [28]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 46 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Monthly Grocery Bill                           10000 non-null  int64  
 1   Vehicle Monthly Distance Km                    10000 non-null  float64
 2   Waste Bag Weekly Count                         10000 non-null  int64  
 3   How Long TV PC Daily Hour                      10000 non-null  int64  
 4   How Many New Clothes Monthly                   10000 non-null  int64  
 5   How Long Internet Daily Hour                   10000 non-null  int64  
 6   CarbonEmission                                 10000 non-null  int64  
 7   Recycling_glass                                10000 non-null  int64  
 8   Recycling_metal                                10000 non-null  int64  
 9   Recycling_paper                                1000

In [31]:
# Keep only the categorical columns that are still present in the DataFrame.
# Once a column has been one-hot encoded it no longer exists under its original
# name, so on a re-run this list is empty and the encoding below changes nothing.
valid_columns = []
for col in categorical_cols:
    if col in data.columns:
        valid_columns.append(col)

# Show which columns are about to be encoded (useful for debugging)
print(valid_columns)

# One-hot encode the remaining categorical columns; drop_first=True omits the
# first level of each column
data = pd.get_dummies(data, columns=valid_columns, drop_first=True)



[]


In [32]:
# Final Dataset

print(f"Data Shapes: {data.shape}")
data.head()

Data Shapes: (10000, 46)


Unnamed: 0,Monthly Grocery Bill,Vehicle Monthly Distance Km,Waste Bag Weekly Count,How Long TV PC Daily Hour,How Many New Clothes Monthly,How Long Internet Daily Hour,CarbonEmission,Recycling_glass,Recycling_metal,Recycling_paper,...,Social Activity_often,Social Activity_sometimes,Waste Bag Size_large,Waste Bag Size_medium,Waste Bag Size_small,Frequency of Traveling by Air_never,Frequency of Traveling by Air_rarely,Frequency of Traveling by Air_very frequently,Energy efficiency_sometimes,Energy efficiency_yes
0,230,210.0,4,7,26,1,2238,0,1,0,...,True,False,True,False,False,False,False,False,False,False
1,114,9.0,3,9,38,5,1892,0,1,0,...,True,False,False,False,False,False,True,False,False,False
2,138,2472.0,1,14,47,6,2595,0,1,0,...,False,False,False,False,True,True,False,False,True,False
3,157,74.0,3,20,5,7,1074,1,1,1,...,False,True,False,True,False,False,True,False,True,False
4,266,6188.375,1,3,5,6,4613,0,0,1,...,True,False,True,False,False,False,False,True,False,True


In [33]:
data.describe()

Unnamed: 0,Monthly Grocery Bill,Vehicle Monthly Distance Km,Waste Bag Weekly Count,How Long TV PC Daily Hour,How Many New Clothes Monthly,How Long Internet Daily Hour,CarbonEmission,Recycling_glass,Recycling_metal,Recycling_paper,Recycling_plastic,Cooking_With_airfryer,Cooking_With_grill,Cooking_With_microwave,Cooking_With_oven,Cooking_With_stove
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,173.8752,1777.613225,4.0246,12.1392,25.109,11.8891,2245.7222,0.4979,0.5047,0.4977,0.4997,0.4992,0.4992,0.5073,0.505,0.5041
std,72.234018,2200.67765,1.990375,7.106369,14.698725,7.277218,947.1203,0.500021,0.500003,0.50002,0.500025,0.500024,0.500024,0.499972,0.5,0.500008
min,50.0,0.0,1.0,0.0,0.0,0.0,306.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,111.0,69.0,2.0,6.0,13.0,6.0,1538.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,173.0,823.0,4.0,12.0,25.0,12.0,2080.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
75%,237.0,2516.75,6.0,18.0,38.0,18.0,2768.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,299.0,6188.375,7.0,24.0,50.0,24.0,4613.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Model Training