<h1><center>Solving for Happiness - Data Engineering</center></h1>

In [1]:
# import libraries to be used throughout project
import pandas as pd
import numpy as np
import matplotlib 
import math
from matplotlib import pylab as plt

<h2>Dataset Engineering</h2>

The below code imports the main dataset, which houses 162 countries and their happiness index from 2003-2020. This dataframe is cleaned to only include the years (2005-2020) for which we have a corresponding target variable to predict. Countries with fewer than 16 years of recorded happiness indexes are also dropped from the dataframe.

In [2]:
#import my main dataset
df = pd.read_csv('../data/happiness-cantril-ladder.csv')

# rename and drop unneeded columns 
df = df.rename(columns={'Entity':'Country',
                        'Life satisfaction in Cantril Ladder (World Happiness Report 2022)':'Happy Score'})
df.drop(['Code'],axis=1,inplace=True)

# drop years 2003 and 2004 because there are not enough recoreded happiness indexes 
i = 0
in_2003 = []

for row in df['Year']:
    if row == 2003:
        in_2003.append(i) 
    i+=1

i = 0
in_2004 = []

for row in df['Year']:
    if row == 2004:
        in_2004.append(i)
    i+=1

df = df.drop(in_2003)
df = df.drop(in_2004)
df.reset_index(inplace=True)

# dropping countries with 15 or less years of happy index
countries_to_drop = []
every_country = df['Country'].value_counts().index.tolist()

i=0

# add countries with 15 or less years of happy index to list to drop
for country in every_country:
    if df['Country'].value_counts()[i] < 16:
        countries_to_drop.append(df['Country'].value_counts().index.tolist()[i])
    i+=1

# create indexes of the countries to use .drop method
i=0
drop_countries_index = []

for row in df['Country']:
    if row in countries_to_drop:
        drop_countries_index.append(i)
    i+=1
    
# print(countries_to_drop)
# print(drop_countries_index)

# drop countries with 15 or fewer happy indexes
df.drop(drop_countries_index, inplace=True)
df.reset_index(inplace=True)

print('Number of countries in dataset:', df['Country'].nunique())
print('Number of datapoints:',df.shape[0])

# drop redundant index columns
df.drop(['level_0','index'],axis=1,inplace=True)

# lists to use throughout merges
country_list = df['Country'].unique().tolist()
years = [2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020]

print('Size of Main dataset:',df.shape)
df.head()

Number of countries in dataset: 47
Number of datapoints: 752
Size of Main dataset: (752, 3)


Unnamed: 0,Country,Year,Happy Score
0,Argentina,2005,6.073158
1,Argentina,2006,5.961034
2,Argentina,2007,6.424133
3,Argentina,2008,6.441067
4,Argentina,2009,6.775805


<div class="alert alert-block alert-info">
    <b>Population Data:</b> Clean and merge country population data with the main dataframe.
  </div>

In [3]:
# import the population dataframe
pop = pd.read_csv('../data/Happiness-WVS-vs-Gallup.csv')
pop_missing = pd.read_csv('../data/missing_pop.csv')
print('Original data:',pop.shape)

# drop unneeded columns and rename population and country
pop.drop(['Code','Share of people who are happy (World Value Survey 2014)',
          'Life satisfaction in Cantril Ladder (World Happiness Report 2022)',
         'Continent'],axis=1,inplace=True)
pop.rename(columns={'Entity':'Country','Population (historical estimates)':'Population'},inplace=True)

# drop years that are not included in main dataset (2005-2020)
i = 0
year_drops = []

for row in pop['Year']:
    if row not in years:
        year_drops.append(i) 
    i+=1

pop.drop(year_drops, inplace=True)
pop.reset_index(inplace=True)
print('After year drop:',pop.shape)

# drop countries that are not in the main dataset 
i = 0
country_drops = []

for row in pop['Country']:
    if row not in country_list:
        country_drops.append(i) 
    i+=1

pop.drop(country_drops, inplace=True)
pop.reset_index(inplace=True)
print('After country drop:',pop.shape)

# concatentate population data with the missing population data
kos = [400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415]
pop_list = []
pop.drop(kos,axis=0,inplace=True)
pop_list.append(pop)
pop_list.append(pop_missing)
pop_2 = pd.concat(pop_list)

# merge main with population 
df = df.merge(pop_2, on=['Country','Year'])
df.drop(['level_0','index'],axis=1,inplace=True)

print('Main dataset after merge:',df.shape)

cc = pop_2['Country'].unique()

# find countries missing gdp data
missing_pop = []

for place in country_list:
    if place not in cc:
        missing_pop.append(place)

print('Countries missing population data:', missing_pop)

df.head()
df.loc[df['Country']=='Kosovo']
df.head()

Original data: (55734, 7)
After year drop: (3976, 4)
After country drop: (752, 5)
Main dataset after merge: (752, 4)
Countries missing population data: []


Unnamed: 0,Country,Year,Happy Score,Population
0,Argentina,2005,6.073158,38892924.0
1,Argentina,2006,5.961034,39289876.0
2,Argentina,2007,6.424133,39684303.0
3,Argentina,2008,6.441067,40080159.0
4,Argentina,2009,6.775805,40482786.0


<div class="alert alert-block alert-info">
    <b>Life Expectancy Data:</b> Clean and merge country life expectancy data with the main dataframe.
  </div>

In [4]:
# import life expectancy data already cleaned
life = pd.read_csv('../data/life.csv')
print('Original data:',life.shape)

life.rename({'Life expectancy':'Life Expectancy'},axis=1,inplace=True)
# merge life with main dataframe
df = df.merge(life, on=['Country','Year'],how='left')
# df.drop(['index'],axis=1,inplace=True)

print('Main dataset after merge:',df.shape)

df.head(15)

Original data: (752, 3)
Main dataset after merge: (752, 5)


Unnamed: 0,Country,Year,Happy Score,Population,Life Expectancy
0,Argentina,2005,6.073158,38892924.0,74.1
1,Argentina,2006,5.961034,39289876.0,74.0
2,Argentina,2007,6.424133,39684303.0,74.1
3,Argentina,2008,6.441067,40080159.0,74.1
4,Argentina,2009,6.775805,40482786.0,74.7
5,Argentina,2010,6.468387,40895751.0,74.9
6,Argentina,2011,6.58226,41320497.0,75.2
7,Argentina,2012,6.671114,41755188.0,74.8
8,Argentina,2013,6.697131,42196034.0,75.4
9,Argentina,2014,6.427221,42637508.0,75.6


<div class="alert alert-block alert-info">
    <b>GDP Data:</b> Clean and merge country GDP data with the main dataframe.
  </div>

In [5]:
# import GDP data
gdp = pd.read_csv('../data/world_country_gdp_usd.csv')
gdp_missing = pd.read_csv('../data/missing_gdp.csv')
print('Original data:',gdp.shape)

# drop unneeded columns and rename population and country
gdp.drop(['Country Code'],axis=1,inplace=True)
gdp.rename(columns={'Country Name':'Country', 'year':'Year'},inplace=True)
gdp_missing.rename(columns={'GDP':'GDP_USD','GDP_per_capita':'GDP_per_capita_USD'},inplace=True)

# drop years that are not included in main dataset (2005-2020)
i = 0
year_drops = []

for row in gdp['Year']:
    if row not in years:
        year_drops.append(i) 
    i+=1

gdp.drop(year_drops, inplace=True)
gdp.reset_index(inplace=True)
print('After year drop:',gdp.shape)

# drop countries that are not in the main dataset 
i = 0
country_drops = []

for row in gdp['Country']:
    if row not in country_list:
        country_drops.append(i) 
    i+=1

gdp.drop(country_drops, inplace=True)
gdp.reset_index(inplace=True)


# concatentate gdp data with the missing gdp data
gdp_list = []
gdp_list.append(gdp)
gdp_list.append(gdp_missing)
gdp_2 = pd.concat(gdp_list)

print('After country drop:',gdp_2.shape)
# merge main with population 
df = df.merge(gdp_2, on=['Country','Year'],how='left')
df.drop(['level_0','index'],axis=1,inplace=True)

print('Main dataset after merge:',df.shape)


# find countries missing gdp data
cc = gdp_2['Country'].unique()

missing_gdp = []

for place in country_list:
    if place not in cc:
        missing_gdp.append(place)

print('Countries missing gdp data:', missing_gdp)


df.head(15)
gdp_missing

Original data: (16492, 5)
After year drop: (4256, 5)
After country drop: (752, 6)
Main dataset after merge: (752, 7)
Countries missing gdp data: []


Unnamed: 0,Country,Year,GDP_USD,GDP_per_capita_USD
0,Egypt,2005,89600665557,1186.393313
1,Egypt,2006,107426086957,1397.436690
2,Egypt,2007,130437828371,1667.317998
3,Egypt,2008,162818181818,2044.527804
4,Egypt,2009,189147005445,2331.268840
...,...,...,...,...
75,Turkey,2016,869692960366,10894.603380
76,Turkey,2017,858996263096,10589.667720
77,Turkey,2018,778377023569,9453.196172
78,Turkey,2019,761428183369,9126.594392


<div class="alert alert-block alert-info">
    <b>CO2 Emissions Data:</b> Clean and merge country CO2 emissions data with the main dataframe.
  </div>

In [6]:
# import the co2 dataframe
co2 = pd.read_csv('../data/CO2_cleaned_restructured.csv')
co2_missing = pd.read_csv('../data/missing_co2.csv')
print('Original data:',co2.shape)

# drop unneeded columns and rename population and country
co2 = co2[['Country','Year','CO2 emission (Tons)']]

# drop years that are not included in main dataset (2005-2020)
i = 0
year_drops = []

for row in co2['Year']:
    if row not in years:
        year_drops.append(i) 
    i+=1

co2.drop(year_drops, inplace=True)
co2.reset_index(inplace=True)
print('After year drop:',co2.shape)

# drop countries that are not in the main dataset 
i = 0
country_drops = []

for row in co2['Country']:
    if row not in country_list:
        country_drops.append(i) 
    i+=1

co2.drop(country_drops, inplace=True)
co2.reset_index(inplace=True)
print('After country drop:',co2.shape)

# concatentate gdp data with the missing gdp data
co2_list = []
co2_list.append(co2)
co2_list.append(co2_missing)
co2_2 = pd.concat(co2_list)

# merge main with population 
df = df.merge(co2_2, on=['Country','Year'],how='left')
df.drop(['level_0','index'],axis=1,inplace=True)

print('Main dataset after merge:',df.shape)

cc = co2_2['Country'].unique()

# find countries missing gdp data
missing_co2 = []

for place in country_list:
    if place not in cc:
        missing_co2.append(place)

print('Countries missing co2 data:', missing_co2)

df.head(32)

Original data: (48509, 14)
After year drop: (2864, 4)
After country drop: (688, 5)
Main dataset after merge: (752, 8)
Countries missing co2 data: []


Unnamed: 0,Country,Year,Happy Score,Population,Life Expectancy,GDP_USD,GDP_per_capita_USD,CO2 emission (Tons)
0,Argentina,2005,6.073158,38892924.0,74.1,198737000000.0,5109.852245,5690960000.0
1,Argentina,2006,5.961034,39289876.0,74.0,232557000000.0,5919.012338,5865569000.0
2,Argentina,2007,6.424133,39684303.0,74.1,287531000000.0,7245.446857,6039814000.0
3,Argentina,2008,6.441067,40080159.0,74.1,361558000000.0,9020.873323,6227920000.0
4,Argentina,2009,6.775805,40482786.0,74.7,332976000000.0,8225.137583,6406870000.0
5,Argentina,2010,6.468387,40895751.0,74.9,423627000000.0,10385.96443,6593603000.0
6,Argentina,2011,6.58226,41320497.0,75.2,530163000000.0,12848.8642,6783877000.0
7,Argentina,2012,6.671114,41755188.0,74.8,545982000000.0,13082.66433,6975591000.0
8,Argentina,2013,6.697131,42196034.0,75.4,552025000000.0,13080.25473,7165725000.0
9,Argentina,2014,6.427221,42637508.0,75.6,526320000000.0,12334.79825,7354382000.0


<div class="alert alert-block alert-info">
    <b>Save Completed CSV</b>
  </div>

In [7]:
# convert dataframe to csv file 
df.to_csv('../data/happiness.csv',sep=',')
print('Main dataframe size:',df.shape)
df.head(16)

Main dataframe size: (752, 8)


Unnamed: 0,Country,Year,Happy Score,Population,Life Expectancy,GDP_USD,GDP_per_capita_USD,CO2 emission (Tons)
0,Argentina,2005,6.073158,38892924.0,74.1,198737000000.0,5109.852245,5690960000.0
1,Argentina,2006,5.961034,39289876.0,74.0,232557000000.0,5919.012338,5865569000.0
2,Argentina,2007,6.424133,39684303.0,74.1,287531000000.0,7245.446857,6039814000.0
3,Argentina,2008,6.441067,40080159.0,74.1,361558000000.0,9020.873323,6227920000.0
4,Argentina,2009,6.775805,40482786.0,74.7,332976000000.0,8225.137583,6406870000.0
5,Argentina,2010,6.468387,40895751.0,74.9,423627000000.0,10385.96443,6593603000.0
6,Argentina,2011,6.58226,41320497.0,75.2,530163000000.0,12848.8642,6783877000.0
7,Argentina,2012,6.671114,41755188.0,74.8,545982000000.0,13082.66433,6975591000.0
8,Argentina,2013,6.697131,42196034.0,75.4,552025000000.0,13080.25473,7165725000.0
9,Argentina,2014,6.427221,42637508.0,75.6,526320000000.0,12334.79825,7354382000.0


<div class="alert alert-block alert-info">
    <b>Lag Data:</b> Concatenate new dataframe with all features lagged for EDA and processing.
  </div>

In [8]:
# lag data one year
countries = df['Country'].unique()
df_list = []

# create lagged dataframe for each country 
for c in countries:
    
    # country specific dataframe
    df_temp = df.loc[df['Country']==c,:]

    # features to lag 
    c = df_temp['Country']
    year = df_temp['Year']
    y = df_temp['Happy Score']
    pop = df_temp['Population']
    life = df_temp['Life Expectancy']
    gdp = df_temp['GDP_USD']
    gdp_cap = df_temp['GDP_per_capita_USD']
    co2 = df_temp['CO2 emission (Tons)']
    
    # create lagged data      
    X_temp = pd.concat([c, year, y, 
                        pop.shift(1), pop,
                        life.shift(1), life,
                        gdp.shift(1), gdp,
                        gdp_cap.shift(1), gdp_cap,
                        co2.shift(1), co2,
                        y.shift(1)]
                        ,axis=1)
    
    # append temp country lagged to list      
    df_list.append(X_temp)
 

# create master lagged df
df_lag1 = pd.concat(df_list)

df_lag1.columns = ['Country','Target Year','Target Score',
             'Population lag 1 year', 'Population Current',
             'Life Ex lag 1 year', 'Life Ex Current',
             'GDP lag 1 year', 'GDP Current',
             'GDP_cap lag 1 year', 'GDP_cap Current',
             'CO2 lag 1 year', 'CO2 Current',
             'Happy lag 1 year']

print('Size of 1 year lagged dataframe:', df_lag1.shape)
# df_lag1.head(16)

Size of 1 year lagged dataframe: (752, 14)


In [9]:
# lag data two years 
countries = df['Country'].unique()
df_list = []

# create lagged dataframe for each country 
for c in countries:
    
    # country specific dataframe
    df_temp = df.loc[df['Country']==c,:]

    # features to lag 
    c = df_temp['Country']
    year = df_temp['Year']
    y = df_temp['Happy Score']
    pop = df_temp['Population']
    life = df_temp['Life Expectancy']
    gdp = df_temp['GDP_USD']
    gdp_cap = df_temp['GDP_per_capita_USD']
    co2 = df_temp['CO2 emission (Tons)']
    
    # create lagged data      
    X_temp = pd.concat([c, year, y, 
                        pop.shift(2), pop.shift(1), pop,
                        life.shift(2),life.shift(1), life,
                        gdp.shift(2),gdp.shift(1), gdp,
                        gdp_cap.shift(2),gdp_cap.shift(1), gdp_cap,
                        co2.shift(2),co2.shift(1), co2,
                        y.shift(2),y.shift(1)]
                        ,axis=1)
    
    # append temp country lagged to list      
    df_list.append(X_temp)
 

# create master lagged df
df_lag2 = pd.concat(df_list)

df_lag2.columns = ['Country','Target Year','Target Score',
             'Population lag 2 years', 'Population lag 1 year', 'Population Current',
             'Life Ex lag 2 years', 'Life Ex lag 1 year', 'Life Ex Current',
             'GDP lag 2 years', 'GDP lag 1 year', 'GDP Current',
             'GDP_cap lag 2 years', 'GDP_cap lag 1 year', 'GDP_cap Current',
             'CO2 lag 2 years', 'CO2 lag 1 year', 'CO2 Current',
             'Happy lag 2 years', 'Happy lag 1 year']

print('Size of 2 years lagged dataframe:', df_lag2.shape)
# df_lag2.head(16)

Size of 2 years lagged dataframe: (752, 20)


In [10]:
# lag for three years 
df_list = []

# create lagged dataframe for each country 
for c in countries:
    
    # country specific dataframe
    df_temp = df.loc[df['Country']==c,:]

    # features to lag 
    c = df_temp['Country']
    year = df_temp['Year']
    y = df_temp['Happy Score']
    pop = df_temp['Population']
    life = df_temp['Life Expectancy']
    gdp = df_temp['GDP_USD']
    gdp_cap = df_temp['GDP_per_capita_USD']
    co2 = df_temp['CO2 emission (Tons)']
    
    # create lagged data      
    X_temp = pd.concat([c, year, y, 
                        pop.shift(3),pop.shift(2), pop.shift(1), pop,
                        life.shift(3),life.shift(2),life.shift(1), life,
                        gdp.shift(3),gdp.shift(2),gdp.shift(1), gdp,
                        gdp_cap.shift(3),gdp_cap.shift(2),gdp_cap.shift(1), gdp_cap,
                        co2.shift(3),co2.shift(2),co2.shift(1), co2,
                        y.shift(3),y.shift(2),y.shift(1)]
                        ,axis=1)
    
    # append temp country lagged to list      
    df_list.append(X_temp)
 

# create master lagged df
df_lag3 = pd.concat(df_list)

df_lag3.columns = ['Country','Target Year','Target Score',
             'Population lag 3 years', 'Population lag 2 years', 'Population lag 1 year', 'Population Current',
             'Life Ex lag 3 years', 'Life Ex lag 2 years', 'Life Ex lag 1 year', 'Life Ex Current',
             'GDP lag 3 years', 'GDP lag 2 years', 'GDP lag 1 year', 'GDP Current',
             'GDP_cap lag 3 years', 'GDP_cap lag 2 years', 'GDP_cap lag 1 year', 'GDP_cap Current',
             'CO2 lag 3 years', 'CO2 lag 2 years', 'CO2 lag 1 year', 'CO2 Current',
             'Happy lag 3 years', 'Happy lag 2 years', 'Happy lag 1 year']

print('Size of 3 years lagged dataframe:', df_lag3.shape)
# df_lag3.head(16)

Size of 3 years lagged dataframe: (752, 26)


In [11]:
# lag data four years 
df_list = []

# create lagged dataframe for each country 
for c in countries:
    
    # country specific dataframe
    df_temp = df.loc[df['Country']==c,:]

    # features to lag 
    c = df_temp['Country']
    year = df_temp['Year']
    y = df_temp['Happy Score']
    pop = df_temp['Population']
    life = df_temp['Life Expectancy']
    gdp = df_temp['GDP_USD']
    gdp_cap = df_temp['GDP_per_capita_USD']
    co2 = df_temp['CO2 emission (Tons)']
    
    # create lagged data      
    X_temp = pd.concat([c, year, y, 
                        pop.shift(4),pop.shift(3),pop.shift(2), pop.shift(1), pop,
                        life.shift(4),life.shift(3),life.shift(2),life.shift(1), life,
                        gdp.shift(4),gdp.shift(3),gdp.shift(2),gdp.shift(1), gdp,
                        gdp_cap.shift(4),gdp_cap.shift(3),gdp_cap.shift(2),gdp_cap.shift(1), gdp_cap,
                        co2.shift(4),co2.shift(3),co2.shift(2),co2.shift(1), co2,
                        y.shift(4),y.shift(3),y.shift(2),y.shift(1)]
                        ,axis=1)
    
    # append temp country lagged to list      
    df_list.append(X_temp)
 

# create master lagged df
df_lag4 = pd.concat(df_list)

df_lag4.columns = ['Country','Target Year','Target Score',
             'Population lag 4 years','Population lag 3 years', 'Population lag 2 years', 'Population lag 1 year', 'Population Current',
             'Life Ex lag 4 years','Life Ex lag 3 years', 'Life Ex lag 2 years', 'Life Ex lag 1 year', 'Life Ex Current',
             'GDP lag 4 years','GDP lag 3 years', 'GDP lag 2 years', 'GDP lag 1 year', 'GDP Current',
             'GDP_cap lag 4 years','GDP_cap lag 3 years', 'GDP_cap lag 2 years', 'GDP_cap lag 1 year', 'GDP_cap Current',
             'CO2 lag 4 years','CO2 lag 3 years', 'CO2 lag 2 years', 'CO2 lag 1 year', 'CO2 Current',
             'Happy lag 4 years','Happy lag 3 years', 'Happy lag 2 years', 'Happy lag 1 year']

print('Size of four years lagged dataframe:', df_lag4.shape)
# df_lag4.head(16)

Size of four years lagged dataframe: (752, 32)


In [12]:
# lag data five years 
df_list = []

# create lagged dataframe for each country 
for c in countries:
    
    # country specific dataframe
    df_temp = df.loc[df['Country']==c,:]

    # features to lag 
    c = df_temp['Country']
    year = df_temp['Year']
    y = df_temp['Happy Score']
    pop = df_temp['Population']
    life = df_temp['Life Expectancy']
    gdp = df_temp['GDP_USD']
    gdp_cap = df_temp['GDP_per_capita_USD']
    co2 = df_temp['CO2 emission (Tons)']
    
    # create lagged data      
    X_temp = pd.concat([c, year, y, 
                        pop.shift(5),pop.shift(4),pop.shift(3),pop.shift(2), pop.shift(1), pop,
                        life.shift(5),life.shift(4),life.shift(3),life.shift(2),life.shift(1), life,
                        gdp.shift(5),gdp.shift(4),gdp.shift(3),gdp.shift(2),gdp.shift(1), gdp,
                        gdp_cap.shift(5),gdp_cap.shift(4),gdp_cap.shift(3),gdp_cap.shift(2),gdp_cap.shift(1), gdp_cap,
                        co2.shift(5),co2.shift(4),co2.shift(3),co2.shift(2),co2.shift(1), co2,
                        y.shift(5),y.shift(4),y.shift(3),y.shift(2),y.shift(1)]
                        ,axis=1)
    
    # append temp country lagged to list      
    df_list.append(X_temp)
 

# create master lagged df
df_lag5 = pd.concat(df_list)

df_lag5.columns = ['Country','Target Year','Target Score',
             'Population lag 5 years','Population lag 4 years','Population lag 3 years', 'Population lag 2 years', 'Population lag 1 year', 'Population Current',
             'Life Ex lag 5 years','Life Ex lag 4 years','Life Ex lag 3 years', 'Life Ex lag 2 years', 'Life Ex lag 1 year', 'Life Ex Current',
             'GDP lag 5 years','GDP lag 4 years','GDP lag 3 years', 'GDP lag 2 years', 'GDP lag 1 year', 'GDP Current',
             'GDP_cap lag 5 years','GDP_cap lag 4 years','GDP_cap lag 3 years', 'GDP_cap lag 2 years', 'GDP_cap lag 1 year', 'GDP_cap Current',
             'CO2 lag 5 years','CO2 lag 4 years','CO2 lag 3 years', 'CO2 lag 2 years', 'CO2 lag 1 year', 'CO2 Current',
             'Happy lag 5 years','Happy lag 4 years','Happy lag 3 years', 'Happy lag 2 years', 'Happy lag 1 year']

print('Size of 5 years lagged dataframe:', df_lag5.shape)
# df_lag5.head(16)

Size of 5 years lagged dataframe: (752, 38)


In [14]:
# convert lagged dataframes to csv file 
df_lag1.to_csv('../data/happiness_lagged1.csv',sep=',')
df_lag2.to_csv('../data/happiness_lagged2.csv',sep=',')
df_lag3.to_csv('../data/happiness_lagged3.csv',sep=',')
df_lag4.to_csv('../data/happiness_lagged4.csv',sep=',')
df_lag5.to_csv('../data/happiness_lagged5.csv',sep=',')
# print('Main dataframe size:',X.shape)