# Feature Engineering

---

1. Import packages
2. Load data
3. Feature engineering

---

## 1. Import packages

In [1]:
import pandas as pd

In [2]:
# Colab-only step: mount Google Drive at /content/drive so the CSV files
# read further down (under /content/drive/MyDrive/...) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


---
## 2. Load data

In [30]:
# Read the client data (clean_data_after_eda.csv) from the mounted Drive folder.
# The path is split over several literals only for readability; the pieces
# concatenate to the exact same string.
df = pd.read_csv(
    '/content/drive/MyDrive/Data Science Projects/'
    'BCG Junior Data Scientist Project (Customer Churn @ PowerCo)/'
    'Feature Engineering/clean_data_after_eda.csv'
)


In [4]:
# Preview the first three rows of the client data.
df.head(3)

Unnamed: 0,id,channel_sales,cons_12m,cons_gas_12m,cons_last_month,date_activ,date_end,date_modif_prod,date_renewal,forecast_cons_12m,...,var_6m_price_off_peak_var,var_6m_price_peak_var,var_6m_price_mid_peak_var,var_6m_price_off_peak_fix,var_6m_price_peak_fix,var_6m_price_mid_peak_fix,var_6m_price_off_peak,var_6m_price_peak,var_6m_price_mid_peak,churn
0,24011ae4ebbe3035111d65fa7c15bc57,foosdfpfkusacimwkcsosbicdxkicaua,0,54946,0,2013-06-15,2016-06-15,2015-11-01,2015-06-23,0.0,...,0.000131,4.100838e-05,0.000908,2.086294,99.530517,44.235794,2.086425,99.53056,44.236702,1
1,d29c2c54acc38ff3c0614d0a653813dd,MISSING,4660,0,0,2009-08-21,2016-08-30,2009-08-21,2015-08-31,189.95,...,3e-06,0.001217891,0.0,0.009482,0.0,0.0,0.009485,0.001217891,0.0,0
2,764c75f661154dac3a6c254cd082ea7d,foosdfpfkusacimwkcsosbicdxkicaua,544,0,0,2010-04-16,2016-04-16,2010-04-16,2015-04-17,47.96,...,4e-06,9.45015e-08,0.0,0.0,0.0,0.0,4e-06,9.45015e-08,0.0,0


In [5]:
# Column dtypes and non-null counts; the date_* columns are still `object` here.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14606 entries, 0 to 14605
Data columns (total 44 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              14606 non-null  object 
 1   channel_sales                   14606 non-null  object 
 2   cons_12m                        14606 non-null  int64  
 3   cons_gas_12m                    14606 non-null  int64  
 4   cons_last_month                 14606 non-null  int64  
 5   date_activ                      14606 non-null  object 
 6   date_end                        14606 non-null  object 
 7   date_modif_prod                 14606 non-null  object 
 8   date_renewal                    14606 non-null  object 
 9   forecast_cons_12m               14606 non-null  float64
 10  forecast_cons_year              14606 non-null  int64  
 11  forecast_discount_energy        14606 non-null  float64
 12  forecast_meter_rent_12m         

In [31]:
# Parse the four YYYY-MM-DD date columns into datetime64 values.
for date_col in ("date_activ", "date_end", "date_modif_prod", "date_renewal"):
    df[date_col] = pd.to_datetime(df[date_col], format='%Y-%m-%d')

In [7]:
# Confirm that the date_* columns are now datetime64[ns].
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14606 entries, 0 to 14605
Data columns (total 44 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              14606 non-null  object        
 1   channel_sales                   14606 non-null  object        
 2   cons_12m                        14606 non-null  int64         
 3   cons_gas_12m                    14606 non-null  int64         
 4   cons_last_month                 14606 non-null  int64         
 5   date_activ                      14606 non-null  datetime64[ns]
 6   date_end                        14606 non-null  datetime64[ns]
 7   date_modif_prod                 14606 non-null  datetime64[ns]
 8   date_renewal                    14606 non-null  datetime64[ns]
 9   forecast_cons_12m               14606 non-null  float64       
 10  forecast_cons_year              14606 non-null  int64         
 11  fo

---

## 3. Feature engineering

### Difference between off-peak prices in December and preceding January

Below is the code created by your colleague to calculate the feature described above. Use this code to re-create this feature and then think about ways to build on this feature to create features with a higher predictive power.

In [37]:
# Read the price history and parse its date column (YYYY-MM-DD) in one chain.
price_df = pd.read_csv(
    '/content/drive/MyDrive/Data Science Projects/'
    'BCG Junior Data Scientist Project (Customer Churn @ PowerCo)/'
    'Datasets/price_data.csv'
).assign(
    price_date=lambda frame: pd.to_datetime(frame["price_date"], format='%Y-%m-%d')
)
price_df.head()

Unnamed: 0,id,price_date,price_off_peak_var,price_peak_var,price_mid_peak_var,price_off_peak_fix,price_peak_fix,price_mid_peak_fix
0,038af19179925da21a25619c5a24b745,2015-01-01,0.151367,0.0,0.0,44.266931,0.0,0.0
1,038af19179925da21a25619c5a24b745,2015-02-01,0.151367,0.0,0.0,44.266931,0.0,0.0
2,038af19179925da21a25619c5a24b745,2015-03-01,0.151367,0.0,0.0,44.266931,0.0,0.0
3,038af19179925da21a25619c5a24b745,2015-04-01,0.149626,0.0,0.0,44.266931,0.0,0.0
4,038af19179925da21a25619c5a24b745,2015-05-01,0.149626,0.0,0.0,44.266931,0.0,0.0


In [53]:
# Average the off-peak variable and fixed prices for every (id, price_date) pair.
off_peak_cols = ['price_off_peak_var', 'price_off_peak_fix']
monthly_price_by_id = (
    price_df
    .groupby(['id', 'price_date'])
    .agg({col: 'mean' for col in off_peak_cols})
    .reset_index()
)

monthly_price_by_id


Unnamed: 0,id,price_date,price_off_peak_var,price_off_peak_fix
0,0002203ffbb812588b632b9e628cc38d,2015-01-01,0.126098,40.565969
1,0002203ffbb812588b632b9e628cc38d,2015-02-01,0.126098,40.565969
2,0002203ffbb812588b632b9e628cc38d,2015-03-01,0.128067,40.728885
3,0002203ffbb812588b632b9e628cc38d,2015-04-01,0.128067,40.728885
4,0002203ffbb812588b632b9e628cc38d,2015-05-01,0.128067,40.728885
...,...,...,...,...
192997,ffff7fa066f1fb305ae285bb03bf325a,2015-08-01,0.119916,40.728885
192998,ffff7fa066f1fb305ae285bb03bf325a,2015-09-01,0.119916,40.728885
192999,ffff7fa066f1fb305ae285bb03bf325a,2015-10-01,0.119916,40.728885
193000,ffff7fa066f1fb305ae285bb03bf325a,2015-11-01,0.119916,40.728885


In [54]:
# Take the earliest and latest price record of each id as its "January" and
# "December" prices (rows arrive sorted by id and price_date from the groupby above).
# NOTE(review): first()/last() return the first/last non-null value per column
# within each id. That equals January/December only when an id has records for
# both months -- confirm that every id has a complete year of prices.
prices_per_id = monthly_price_by_id.groupby('id')
jan_prices = prices_per_id.first().reset_index()
dec_prices = prices_per_id.last().reset_index()



In [55]:
# Put each id's December and January off-peak prices side by side, one row per id.
# Suffix _1 is the variable price, _2 the fixed price.
dec_renamed = dec_prices.rename(
    columns={'price_off_peak_var': 'dec_1', 'price_off_peak_fix': 'dec_2'}
)
jan_renamed = (
    jan_prices
    .drop(columns='price_date')  # only the December frame keeps its price_date
    .rename(columns={'price_off_peak_var': 'jan_1', 'price_off_peak_fix': 'jan_2'})
)
diff = pd.merge(dec_renamed, jan_renamed, on='id')
diff

Unnamed: 0,id,price_date,dec_1,dec_2,jan_1,jan_2
0,0002203ffbb812588b632b9e628cc38d,2015-12-01,0.119906,40.728885,0.126098,40.565969
1,0004351ebdd665e6ee664792efc4fd13,2015-12-01,0.143943,44.444710,0.148047,44.266931
2,0010bcc39e42b3c2131ed2ce55246e3c,2015-12-01,0.201280,45.944710,0.150837,44.444710
3,0010ee3855fdea87602a5b7aba8e42de,2015-12-01,0.113068,40.728885,0.123086,40.565969
4,00114d74e963e47177db89bc70108537,2015-12-01,0.145440,44.266930,0.149434,44.266931
...,...,...,...,...,...,...
16091,ffef185810e44254c3a4c6395e6b4d8a,2015-12-01,0.112488,40.728885,0.162720,41.063970
16092,fffac626da707b1b5ab11e8431a4d0a2,2015-12-01,0.145047,44.444710,0.148825,44.266931
16093,fffc0cacd305dd51f316424bbb08d1bd,2015-12-01,0.151399,41.228885,0.153159,41.063970
16094,fffe4f5646aa39c7f97f95ae2679ce64,2015-12-01,0.118175,40.728885,0.127566,40.565969


In [56]:
# December minus January for the off-peak prices:
#   *_energy comes from the variable price (dec_1 - jan_1),
#   *_power  comes from the fixed price    (dec_2 - jan_2).
# Only `id` and the two new feature columns are kept.
diff = diff.assign(
    offpeak_diff_dec_january_energy=diff['dec_1'] - diff['jan_1'],
    offpeak_diff_dec_january_power=diff['dec_2'] - diff['jan_2'],
)[['id', 'offpeak_diff_dec_january_energy', 'offpeak_diff_dec_january_power']]

In [57]:
# Show the off-peak December-vs-January features (one row per id).
diff

Unnamed: 0,id,offpeak_diff_dec_january_energy,offpeak_diff_dec_january_power
0,0002203ffbb812588b632b9e628cc38d,-0.006192,0.162916
1,0004351ebdd665e6ee664792efc4fd13,-0.004104,0.177779
2,0010bcc39e42b3c2131ed2ce55246e3c,0.050443,1.500000
3,0010ee3855fdea87602a5b7aba8e42de,-0.010018,0.162916
4,00114d74e963e47177db89bc70108537,-0.003994,-0.000001
...,...,...,...
16091,ffef185810e44254c3a4c6395e6b4d8a,-0.050232,-0.335085
16092,fffac626da707b1b5ab11e8431a4d0a2,-0.003778,0.177779
16093,fffc0cacd305dd51f316424bbb08d1bd,-0.001760,0.164916
16094,fffe4f5646aa39c7f97f95ae2679ce64,-0.009391,0.162916


Now it is time to get creative and to conduct some of your own feature engineering! Have fun with it, explore different ideas and try to create as many as you can!

In [68]:
# Same December-vs-January comparison as for the off-peak prices, now for PEAK prices.

# Average the peak variable and fixed prices for every (id, price_date) pair
monthly_price_by_id_2 = (
    price_df
    .groupby(['id', 'price_date'])
    .agg({'price_peak_var': 'mean', 'price_peak_fix': 'mean'})
    .reset_index()
)

# Earliest / latest record per id stand in for the January / December prices
peak_prices_per_id = monthly_price_by_id_2.groupby('id')
jan_prices_2 = peak_prices_per_id.first().reset_index()
dec_prices_2 = peak_prices_per_id.last().reset_index()

# Line the two months up per id, then take December minus January
diff_2 = pd.merge(
    dec_prices_2.rename(columns={'price_peak_var':'dec_1_peak', 'price_peak_fix':'dec_2_peak'}),
    jan_prices_2.drop(columns='price_date').rename(columns={'price_peak_var': 'jan_1_peak', 'price_peak_fix': 'jan_2_peak'}),
    on='id',
)
diff_2 = diff_2.assign(
    peak_diff_dec_january_energy=diff_2['dec_1_peak'] - diff_2['jan_1_peak'],
    peak_diff_dec_january_power=diff_2['dec_2_peak'] - diff_2['jan_2_peak'],
)

# Keep only the two feature columns. `id` is deliberately left out because this
# frame is later concatenated column-wise next to `diff`, which already has `id`.
diff_2 = diff_2[['peak_diff_dec_january_energy','peak_diff_dec_january_power']]

In [69]:
# Show the peak December-vs-January features (no `id` column; rows are positional).
diff_2

Unnamed: 0,peak_diff_dec_january_energy,peak_diff_dec_january_power
0,-0.002302,0.097749
1,0.000000,0.000000
2,0.000000,0.000000
3,-0.005120,0.097749
4,0.000000,0.000000
...,...,...
16091,-0.038788,-0.400251
16092,0.000000,0.000000
16093,-0.003707,0.099749
16094,-0.004937,0.097749


In [70]:
# Same December-vs-January comparison as for the off-peak prices, now for MID-PEAK prices.

# Average the mid-peak variable and fixed prices for every (id, price_date) pair
monthly_price_by_id_3 = (
    price_df
    .groupby(['id', 'price_date'])
    .agg({'price_mid_peak_var': 'mean', 'price_mid_peak_fix': 'mean'})
    .reset_index()
)

# Earliest / latest record per id stand in for the January / December prices
midpeak_prices_per_id = monthly_price_by_id_3.groupby('id')
jan_prices_3 = midpeak_prices_per_id.first().reset_index()
dec_prices_3 = midpeak_prices_per_id.last().reset_index()

# Line the two months up per id, then take December minus January
diff_3 = pd.merge(
    dec_prices_3.rename(columns={'price_mid_peak_var':'dec_1_midpeak', 'price_mid_peak_fix':'dec_2_midpeak'}),
    jan_prices_3.drop(columns='price_date').rename(columns={'price_mid_peak_var': 'jan_1_midpeak', 'price_mid_peak_fix': 'jan_2_midpeak'}),
    on='id',
)
diff_3 = diff_3.assign(
    midpeak_diff_dec_january_energy=diff_3['dec_1_midpeak'] - diff_3['jan_1_midpeak'],
    midpeak_diff_dec_january_power=diff_3['dec_2_midpeak'] - diff_3['jan_2_midpeak'],
)

# Keep only the two feature columns. `id` is deliberately left out because this
# frame is later concatenated column-wise next to `diff`, which already has `id`.
diff_3 = diff_3[['midpeak_diff_dec_january_energy','midpeak_diff_dec_january_power']]

diff_3

Unnamed: 0,midpeak_diff_dec_january_energy,midpeak_diff_dec_january_power
0,0.003487,0.065166
1,0.000000,0.000000
2,0.000000,0.000000
3,0.000763,0.065166
4,0.000000,0.000000
...,...,...
16091,-0.022735,-0.432834
16092,0.000000,0.000000
16093,-0.007326,0.067166
16094,0.001029,0.065166


In [71]:
# Stitch the off-peak, peak and mid-peak features together column-wise (axis=1).
#
# Only `diff` carries the `id` column, so rows are matched purely by index label.
# pd.concat(axis=1) outer-joins on the index: if the frames ever differed in length
# or index, it would silently pad with NaN instead of failing. Check alignment first.
# NOTE(review): this verifies the index only. That equal index labels mean equal
# ids is assumed from all three frames being built by the same groupby('id')
# pipeline -- confirm if that pipeline changes.
price_feature_frames = [diff, diff_2, diff_3]
for frame in price_feature_frames[1:]:
    if not frame.index.equals(diff.index):
        raise ValueError(
            "Price feature frames are not index-aligned; "
            "column-wise concatenation would mix up customers"
        )

price_df_modified = pd.concat(price_feature_frames, axis=1)

# Display the DataFrame
price_df_modified

Unnamed: 0,id,offpeak_diff_dec_january_energy,offpeak_diff_dec_january_power,peak_diff_dec_january_energy,peak_diff_dec_january_power,midpeak_diff_dec_january_energy,midpeak_diff_dec_january_power
0,0002203ffbb812588b632b9e628cc38d,-0.006192,0.162916,-0.002302,0.097749,0.003487,0.065166
1,0004351ebdd665e6ee664792efc4fd13,-0.004104,0.177779,0.000000,0.000000,0.000000,0.000000
2,0010bcc39e42b3c2131ed2ce55246e3c,0.050443,1.500000,0.000000,0.000000,0.000000,0.000000
3,0010ee3855fdea87602a5b7aba8e42de,-0.010018,0.162916,-0.005120,0.097749,0.000763,0.065166
4,00114d74e963e47177db89bc70108537,-0.003994,-0.000001,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...
16091,ffef185810e44254c3a4c6395e6b4d8a,-0.050232,-0.335085,-0.038788,-0.400251,-0.022735,-0.432834
16092,fffac626da707b1b5ab11e8431a4d0a2,-0.003778,0.177779,0.000000,0.000000,0.000000,0.000000
16093,fffc0cacd305dd51f316424bbb08d1bd,-0.001760,0.164916,-0.003707,0.099749,-0.007326,0.067166
16094,fffe4f5646aa39c7f97f95ae2679ce64,-0.009391,0.162916,-0.004937,0.097749,0.001029,0.065166


### Let's get the duration of first contract in days:

In [32]:

# Contract length in whole days: end date minus activation date
contract_span = df['date_end'] - df['date_activ']
df['Duration Of Contract'] = contract_span.dt.days

# The two source date columns are removed once the duration has been derived.
# Note: this mutates `df` in place, so the cell cannot be re-run without reloading `df`.
df.drop(columns=['date_end', 'date_activ'], inplace=True)

# Show the frame with the new column appended
df


Unnamed: 0,id,channel_sales,cons_12m,cons_gas_12m,cons_last_month,date_modif_prod,date_renewal,forecast_cons_12m,forecast_cons_year,forecast_discount_energy,...,var_6m_price_peak_var,var_6m_price_mid_peak_var,var_6m_price_off_peak_fix,var_6m_price_peak_fix,var_6m_price_mid_peak_fix,var_6m_price_off_peak,var_6m_price_peak,var_6m_price_mid_peak,churn,Duration Of Contract
0,24011ae4ebbe3035111d65fa7c15bc57,foosdfpfkusacimwkcsosbicdxkicaua,0,54946,0,2015-11-01,2015-06-23,0.00,0,0.0,...,4.100838e-05,9.084737e-04,2.086294,99.530517,44.235794,2.086425,9.953056e+01,4.423670e+01,1,1096
1,d29c2c54acc38ff3c0614d0a653813dd,MISSING,4660,0,0,2009-08-21,2015-08-31,189.95,0,0.0,...,1.217891e-03,0.000000e+00,0.009482,0.000000,0.000000,0.009485,1.217891e-03,0.000000e+00,0,2566
2,764c75f661154dac3a6c254cd082ea7d,foosdfpfkusacimwkcsosbicdxkicaua,544,0,0,2010-04-16,2015-04-17,47.96,0,0.0,...,9.450150e-08,0.000000e+00,0.000000,0.000000,0.000000,0.000004,9.450150e-08,0.000000e+00,0,2192
3,bba03439a292a1e166f80264c16191cb,lmkebamcaaclubfxadlmueccxoimlema,1584,0,0,2010-03-30,2015-03-31,240.04,0,0.0,...,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000003,0.000000e+00,0.000000e+00,0,2192
4,149d57cf92fc41cf94415803a877cb4b,MISSING,4425,0,526,2010-01-13,2015-03-09,445.75,526,0.0,...,2.896760e-06,4.860000e-10,0.000000,0.000000,0.000000,0.000011,2.896760e-06,4.860000e-10,0,2245
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14601,18463073fb097fc0ac5d3e040f356987,foosdfpfkusacimwkcsosbicdxkicaua,32270,47940,0,2015-05-08,2014-05-26,4648.01,0,0.0,...,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000003,0.000000e+00,0.000000e+00,0,1445
14602,d0a6f71671571ed83b2645d23af6de00,foosdfpfkusacimwkcsosbicdxkicaua,7223,0,181,2012-08-27,2015-08-28,631.69,181,0.0,...,2.225451e-06,1.802667e-08,0.014939,0.005682,0.000299,0.014948,5.684001e-03,2.987132e-04,1,1461
14603,10e6828ddd62cbcf687cb74928c4c2d2,foosdfpfkusacimwkcsosbicdxkicaua,1844,0,179,2012-02-08,2015-02-09,190.39,179,0.0,...,2.896760e-06,4.860000e-10,0.000000,0.000000,0.000000,0.000011,2.896760e-06,4.860000e-10,1,1460
14604,1cf20fd6206d7678d5bcafd28c53b4db,foosdfpfkusacimwkcsosbicdxkicaua,131,0,0,2012-08-30,2015-08-31,19.34,0,0.0,...,1.217891e-03,0.000000e+00,0.009482,0.000000,0.000000,0.009485,1.217891e-03,0.000000e+00,0,1461


### Now, let's get the number of days since the last product modification:

In [33]:
from datetime import datetime

# Fixed reference date for the "days since" feature.
# Measuring against pd.Timestamp.now() makes the feature grow by one for every day
# that passes between runs, so a saved dataset can never be regenerated and a model
# trained on one run sees shifted values on the next. 2024-03-27 is the calendar date
# implied by the output recorded in this notebook (date_modif_prod 2015-11-01 -> 3069
# days), so the recorded numbers are reproduced exactly.
# NOTE(review): a date tied to the data snapshot may be a more meaningful anchor --
# confirm with the data owner before changing it.
REFERENCE_DATE = pd.Timestamp("2024-03-27")

# Kept under its original name in case other cells read it
present_date = REFERENCE_DATE

# Whole days between the reference date and the last product modification
df['Days Since Last Modification of Product'] = (present_date - df['date_modif_prod']).dt.days

# The source date column is removed once the feature exists (mutates `df` in place)
df.drop(columns=['date_modif_prod'],inplace =True)
df



Unnamed: 0,id,channel_sales,cons_12m,cons_gas_12m,cons_last_month,date_renewal,forecast_cons_12m,forecast_cons_year,forecast_discount_energy,forecast_meter_rent_12m,...,var_6m_price_mid_peak_var,var_6m_price_off_peak_fix,var_6m_price_peak_fix,var_6m_price_mid_peak_fix,var_6m_price_off_peak,var_6m_price_peak,var_6m_price_mid_peak,churn,Duration Of Contract,Days Since Last Modification of Product
0,24011ae4ebbe3035111d65fa7c15bc57,foosdfpfkusacimwkcsosbicdxkicaua,0,54946,0,2015-06-23,0.00,0,0.0,1.78,...,9.084737e-04,2.086294,99.530517,44.235794,2.086425,9.953056e+01,4.423670e+01,1,1096,3069
1,d29c2c54acc38ff3c0614d0a653813dd,MISSING,4660,0,0,2015-08-31,189.95,0,0.0,16.27,...,0.000000e+00,0.009482,0.000000,0.000000,0.009485,1.217891e-03,0.000000e+00,0,2566,5332
2,764c75f661154dac3a6c254cd082ea7d,foosdfpfkusacimwkcsosbicdxkicaua,544,0,0,2015-04-17,47.96,0,0.0,38.72,...,0.000000e+00,0.000000,0.000000,0.000000,0.000004,9.450150e-08,0.000000e+00,0,2192,5094
3,bba03439a292a1e166f80264c16191cb,lmkebamcaaclubfxadlmueccxoimlema,1584,0,0,2015-03-31,240.04,0,0.0,19.83,...,0.000000e+00,0.000000,0.000000,0.000000,0.000003,0.000000e+00,0.000000e+00,0,2192,5111
4,149d57cf92fc41cf94415803a877cb4b,MISSING,4425,0,526,2015-03-09,445.75,526,0.0,131.73,...,4.860000e-10,0.000000,0.000000,0.000000,0.000011,2.896760e-06,4.860000e-10,0,2245,5187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14601,18463073fb097fc0ac5d3e040f356987,foosdfpfkusacimwkcsosbicdxkicaua,32270,47940,0,2014-05-26,4648.01,0,0.0,18.57,...,0.000000e+00,0.000000,0.000000,0.000000,0.000003,0.000000e+00,0.000000e+00,0,1445,3246
14602,d0a6f71671571ed83b2645d23af6de00,foosdfpfkusacimwkcsosbicdxkicaua,7223,0,181,2015-08-28,631.69,181,0.0,144.03,...,1.802667e-08,0.014939,0.005682,0.000299,0.014948,5.684001e-03,2.987132e-04,1,1461,4230
14603,10e6828ddd62cbcf687cb74928c4c2d2,foosdfpfkusacimwkcsosbicdxkicaua,1844,0,179,2015-02-09,190.39,179,0.0,129.60,...,4.860000e-10,0.000000,0.000000,0.000000,0.000011,2.896760e-06,4.860000e-10,1,1460,4431
14604,1cf20fd6206d7678d5bcafd28c53b4db,foosdfpfkusacimwkcsosbicdxkicaua,131,0,0,2015-08-31,19.34,0,0.0,7.18,...,0.000000e+00,0.009482,0.000000,0.000000,0.009485,1.217891e-03,0.000000e+00,0,1461,4227


In [34]:
# Re-check the schema after the date-derived features were added and their source columns dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14606 entries, 0 to 14605
Data columns (total 43 columns):
 #   Column                                   Non-Null Count  Dtype         
---  ------                                   --------------  -----         
 0   id                                       14606 non-null  object        
 1   channel_sales                            14606 non-null  object        
 2   cons_12m                                 14606 non-null  int64         
 3   cons_gas_12m                             14606 non-null  int64         
 4   cons_last_month                          14606 non-null  int64         
 5   date_renewal                             14606 non-null  datetime64[ns]
 6   forecast_cons_12m                        14606 non-null  float64       
 7   forecast_cons_year                       14606 non-null  int64         
 8   forecast_discount_energy                 14606 non-null  float64       
 9   forecast_meter_rent_12m                

# Merging the client and price dataframes:

In [72]:
# Attach the price-difference features to the client table by `id`
# (pd.merge default: inner join, so only ids present in both frames survive),
# then remove the `id`, `origin_up` and `channel_sales` columns from the result.
merged_df_clean = (
    pd.merge(df, price_df_modified, on='id')
    .drop(columns=['id', 'origin_up', 'channel_sales'])
)
merged_df_clean

Unnamed: 0,cons_12m,cons_gas_12m,cons_last_month,date_renewal,forecast_cons_12m,forecast_cons_year,forecast_discount_energy,forecast_meter_rent_12m,forecast_price_energy_off_peak,forecast_price_energy_peak,...,var_6m_price_mid_peak,churn,Duration Of Contract,Days Since Last Modification of Product,offpeak_diff_dec_january_energy,offpeak_diff_dec_january_power,peak_diff_dec_january_energy,peak_diff_dec_january_power,midpeak_diff_dec_january_energy,midpeak_diff_dec_january_power
0,0,54946,0,2015-06-23,0.00,0,0.0,1.78,0.114481,0.098142,...,4.423670e+01,1,1096,3069,0.020057,3.700961,-0.017912,-24.339581,-0.071536,-16.226389
1,4660,0,0,2015-08-31,189.95,0,0.0,16.27,0.145711,0.000000,...,0.000000e+00,0,2566,5332,-0.003767,0.177779,0.000000,0.000000,0.000000,0.000000
2,544,0,0,2015-04-17,47.96,0,0.0,38.72,0.165794,0.087899,...,0.000000e+00,0,2192,5094,-0.004670,0.177779,0.000528,0.000000,0.000000,0.000000
3,1584,0,0,2015-03-31,240.04,0,0.0,19.83,0.146694,0.000000,...,0.000000e+00,0,2192,5111,-0.004547,0.177779,0.000000,0.000000,0.000000,0.000000
4,4425,0,526,2015-03-09,445.75,526,0.0,131.73,0.116900,0.100015,...,4.860000e-10,0,2245,5187,-0.006192,0.162916,-0.002302,0.097749,0.003487,0.065166
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14601,32270,47940,0,2014-05-26,4648.01,0,0.0,18.57,0.138305,0.000000,...,0.000000e+00,0,1445,3246,-0.008653,0.177779,0.000000,0.000000,0.000000,0.000000
14602,7223,0,181,2015-08-28,631.69,181,0.0,144.03,0.100167,0.091892,...,2.987132e-04,1,1461,4230,-0.007395,0.236694,-0.003727,0.145963,0.000260,0.033471
14603,1844,0,179,2015-02-09,190.39,179,0.0,129.60,0.116900,0.100015,...,4.860000e-10,1,1460,4431,-0.006192,0.162916,-0.002302,0.097749,0.003487,0.065166
14604,131,0,0,2015-08-31,19.34,0,0.0,7.18,0.145711,0.000000,...,0.000000e+00,0,1461,4227,-0.003767,0.177779,0.000000,0.000000,0.000000,0.000000


# Saving the cleaned dataset:

In [73]:
# Write the engineered dataset to CSV (header row included, no index column).
# The path is relative, so the file lands in the kernel's current working directory.
output_path = 'final_cleaned_data_after_FE.csv'
merged_df_clean.to_csv(output_path, index=False, header=True)

# Report the file that was actually written, so the message cannot drift from the path
print(f"DataFrame saved successfully as {output_path}")

DataFrame saved successfully as cleaned_data.csv
