In [1]:
import pandas as pd
import numpy as np
data = {
    "Energy Source": ["Soloar", "Wind", "Hydropower", "Geothermal", "Biomass", "Nuclear"],
    "Energy Consumption (MWh)": [1200, np.nan, 2900, np.nan, 2500, 3200],
    "Cost (Million $)": [200, 400, np.nan, 150, 250, np.nan]
}
energy_df = pd.DataFrame(data)
print("Original Energy Data with Missing Values:")
print(energy_df)

Original Energy Data with Missing Values:
  Energy Source  Energy Consumption (MWh)  Cost (Million $)
0        Soloar                    1200.0             200.0
1          Wind                       NaN             400.0
2    Hydropower                    2900.0               NaN
3    Geothermal                       NaN             150.0
4       Biomass                    2500.0             250.0
5       Nuclear                    3200.0               NaN


In [2]:
cleaned_df = energy_df.dropna()
print("\nData After Removing Rows with Missing Values:")
print(cleaned_df)


Data After Removing Rows with Missing Values:
  Energy Source  Energy Consumption (MWh)  Cost (Million $)
0        Soloar                    1200.0             200.0
4       Biomass                    2500.0             250.0


In [4]:
forward_filled_df = energy_df.fillna(method="ffill")
print("\nData After Forward Filling:")
print(forward_filled_df)


Data After Forward Filling:
  Energy Source  Energy Consumption (MWh)  Cost (Million $)
0        Soloar                    1200.0             200.0
1          Wind                    1200.0             400.0
2    Hydropower                    2900.0             400.0
3    Geothermal                    2900.0             150.0
4       Biomass                    2500.0             250.0
5       Nuclear                    3200.0             250.0


  forward_filled_df = energy_df.fillna(method="ffill")


In [8]:
energy_df["Energy Consumption (MWh)"].fillna(energy_df["Energy Consumption (MWh)"].mean(), inplace=True)
energy_df["Cost (Million $)"].fillna(energy_df["Cost (Million $)"].mean(), inplace=True)
print("\nData After Imputing Misssing Values with Mean:")
print(energy_df)


Data After Imputing Misssing Values with Mean:
  Energy Source  Energy Consumption (MWh)  Cost (Million $)
0        Soloar                    1200.0             200.0
1          Wind                    2450.0             400.0
2    Hydropower                    2900.0             250.0
3    Geothermal                    2450.0             150.0
4       Biomass                    2500.0             250.0
5       Nuclear                    3200.0             250.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  energy_df["Energy Consumption (MWh)"].fillna(energy_df["Energy Consumption (MWh)"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  energy_df["Cost (Million $)"].fillna(energy_df["Cost (Million $)"].mean(), inplace=True)


In [9]:
energy_df["Missing Consumption"] = energy_df["Energy Consumption (MWh)"].isna().astype(int)
print("\nData with Missing Flagged:")
print(energy_df)


Data with Missing Flagged:
  Energy Source  Energy Consumption (MWh)  Cost (Million $)  \
0        Soloar                    1200.0             200.0   
1          Wind                    2450.0             400.0   
2    Hydropower                    2900.0             250.0   
3    Geothermal                    2450.0             150.0   
4       Biomass                    2500.0             250.0   
5       Nuclear                    3200.0             250.0   

   Missing Consumption  
0                    0  
1                    0  
2                    0  
3                    0  
4                    0  
5                    0  


In [10]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp312-cp312-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   ------------------------------- -------- 8.7/11.1 MB 48.8 MB/s eta 0:00:01
   ---------------------------------------- 11.1/11.1 MB 43.4 MB/s eta 0:00:00
Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
Downloading scipy-1.15.1-cp312-cp312-win_amd64.whl (43.6 MB)
   ---------------------------------------- 0.0/43.6 MB ? eta -:--:--
   ------ --------------------------------- 6.6/43.6 MB 33.6 MB/s eta 0:00:02
   --

In [14]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
energy_df[["Energy Consumption (MWh)", "Cost (Million $)"]] = scaler.fit_transform(
    energy_df[["Energy Consumption (MWh)", "Cost (Million $)"]]
)
print("\nData After Normalization (Min-Max Scaling):")
print(energy_df)


Data After Normalization (Min-Max Scaling):
  Energy Source  Energy Consumption (MWh)  Cost (Million $)  \
0        Soloar                     0.000               0.2   
1          Wind                     0.625               1.0   
2    Hydropower                     0.850               0.4   
3    Geothermal                     0.625               0.0   
4       Biomass                     0.650               0.4   
5       Nuclear                     1.000               0.4   

   Missing Consumption  
0                    0  
1                    0  
2                    0  
3                    0  
4                    0  
5                    0  


In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
energy_df[["Energy Consumption (MWh)", "Cost (Million $)"]] = scaler.fit_transform(
    energy_df[["Energy Consumption (MWh)", "Cost (Million $)"]]
)
print("\nData After Standardization (Z-score Scaling):")
print(energy_df)


Data After Standardization (Z-score Scaling):
  Energy Source  Energy Consumption (MWh)  Cost (Million $)  \
0        Soloar             -2.005893e+00     -6.546537e-01   
1          Wind              3.563181e-16      1.963961e+00   
2    Hydropower              7.221213e-01      1.817029e-16   
3    Geothermal              3.563181e-16     -1.309307e+00   
4       Biomass              8.023570e-02      1.817029e-16   
5       Nuclear              1.203536e+00      1.817029e-16   

   Missing Consumption  
0                    0  
1                    0  
2                    0  
3                    0  
4                    0  
5                    0  


In [18]:
energy_encoded_df = pd.get_dummies(energy_df, columns=["Energy Source"])
print("\nData After One-Hot Encoding Categorical Variables:")
print(energy_encoded_df)


Data After One-Hot Encoding Categorical Variables:
   Energy Consumption (MWh)  Cost (Million $)  Missing Consumption  \
0             -2.005893e+00     -6.546537e-01                    0   
1              3.563181e-16      1.963961e+00                    0   
2              7.221213e-01      1.817029e-16                    0   
3              3.563181e-16     -1.309307e+00                    0   
4              8.023570e-02      1.817029e-16                    0   
5              1.203536e+00      1.817029e-16                    0   

   Energy Source_Biomass  Energy Source_Geothermal  Energy Source_Hydropower  \
0                  False                     False                     False   
1                  False                     False                     False   
2                  False                     False                      True   
3                  False                      True                     False   
4                   True                     False       

In [20]:
energy_encoded_df["Consumption per $Million"] = energy_encoded_df["Energy Consumption (MWh)"] / energy_encoded_df["Cost (Million $)"]
print("\nData with New Feature (Consumption per $Million):")
print(energy_encoded_df)


Data with New Feature (Consumption per $Million):
   Energy Consumption (MWh)  Cost (Million $)  Missing Consumption  \
0             -2.005893e+00     -6.546537e-01                    0   
1              3.563181e-16      1.963961e+00                    0   
2              7.221213e-01      1.817029e-16                    0   
3              3.563181e-16     -1.309307e+00                    0   
4              8.023570e-02      1.817029e-16                    0   
5              1.203536e+00      1.817029e-16                    0   

   Energy Source_Biomass  Energy Source_Geothermal  Energy Source_Hydropower  \
0                  False                     False                     False   
1                  False                     False                     False   
2                  False                     False                      True   
3                  False                      True                     False   
4                   True                     False        