In [12]:
import pandas as pd
import tarfile
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from sklearn.linear_model import LinearRegression

In [13]:
df = pd.read_csv('life_expectancy.csv') 
df

Unnamed: 0,Country,Year,"Mean Years of Schooling, female (years)","Share of seats in parliament, male (% held by men)",Mean Years of Schooling (years),Inequality in income,Carbon dioxide emissions per capita (production) (tonnes),"Expected Years of Schooling, male (years)",Expected Years of Schooling (years),"Labour force participation rate, female (% ages 15 and older)",...,Mean Age Childbearing (years),Sex Ratio at Birth (males per 100 female births),Total Deaths (thousands),Male Deaths (thousands),Female Deaths (thousands),"Crude Death Rate (deaths per 1,000 population)","Life Expectancy at Birth, both sexes (years)",Live births Surviving to Age 1 (thousands),Net Number of Migrants (thousands),"Net Migration Rate (per 1,000 population)"
0,Afghanistan,1990,0.342503,,0.971125,,0.209727,3.939093,2.504050,15.180,...,29.880,105.5,203.514,110.102,93.412,18.449,45.9672,519.005,-666.001,-60.375
1,Afghanistan,1991,0.371860,,1.019356,,0.182525,4.171640,2.806550,15.214,...,29.812,105.6,192.531,104.274,88.257,17.946,46.6631,509.662,39.926,3.721
2,Afghanistan,1992,0.401218,,1.067586,,0.095233,4.417915,3.109050,15.223,...,29.885,105.6,191.913,104.633,87.280,17.222,47.5955,532.989,1834.556,164.629
3,Afghanistan,1993,0.430575,,1.115817,,0.084285,4.678730,3.411550,15.197,...,29.948,105.6,199.165,104.343,94.822,14.838,51.4664,646.635,1171.818,87.304
4,Afghanistan,1994,0.459933,,1.164047,,0.075054,4.954942,3.714050,15.178,...,29.789,105.7,222.214,119.122,103.092,14.689,51.4945,732.684,665.410,43.986
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6203,Zimbabwe,2017,8.061880,63.848396,8.461690,27.01627,0.673026,12.039037,11.853943,80.285,...,27.979,102.2,122.177,62.092,60.084,8.266,60.7095,464.979,-59.918,-4.054
6204,Zimbabwe,2018,8.178377,65.714286,8.585395,27.01627,0.821010,12.154317,11.981767,80.309,...,27.956,102.3,120.252,60.873,59.379,7.972,61.4141,468.496,-59.918,-3.972
6205,Zimbabwe,2019,8.296558,65.428571,8.710909,28.76901,0.747610,12.270700,12.110969,80.339,...,27.934,102.3,123.744,63.152,60.592,8.043,61.2925,469.639,-59.918,-3.895
6206,Zimbabwe,2020,8.296558,65.428571,8.710909,28.76901,0.708562,12.270700,12.110969,78.980,...,27.916,102.4,127.551,66.267,61.284,8.132,61.1242,471.214,-29.955,-1.910


In [14]:
column_list = df.columns.to_list()
for i in column_list:
    print(i)

Country
Year
Mean Years of Schooling, female (years)
Share of seats in parliament, male (% held by men)
Mean Years of Schooling (years)
Inequality in income
Carbon dioxide emissions per capita (production) (tonnes)
Expected Years of Schooling, male (years)
Expected Years of Schooling (years)
Labour force participation rate, female (% ages 15 and older)
Population with at least some secondary education, female (% ages 25 and older)
Expected Years of Schooling, female (years)
Adolescent Birth Rate (births per 1,000 women ages 15-19)
Material footprint per capita (tonnes)
Mean Years of Schooling, male (years)
Population with at least some secondary education, male (% ages 25 and older)
Share of seats in parliament, female (% held by women)
Inequality in eduation
Gender Development Index (value)
Coefficient of human inequality
Labour force participation rate, male (% ages 15 and older)
Human Development Index (value)
Gross National Income Per Capita (2017 PPP$)
Total Population, as of 1 Ja

In [15]:
df.columns = df.columns.str.lower()
# Dictionary mapping old column names to new column names
column_rename_dict = {
    'country': 'country',
    'year': 'year',
    'mean years of schooling, female (years)': 'mean_years_of_schooling_female',
    'share of seats in parliament, male (% held by men)': 'share_of_male_seats_in_parliament',
    'mean years of schooling (years)': 'mean_years_of_schooling',
    'inequality in income': 'income_inequality',
    'carbon dioxide emissions per capita (production) (tonnes)': 'co2_emissions_per_capita_tonnes',
    'expected years of schooling, male (years)': 'expected_years_of_schooling_male',
    'expected years of schooling (years)': 'expected_years_of_schooling',
    'labour force participation rate, female (% ages 15 and older)': 'female_labour_force_participation_rate',
    'population with at least some secondary education, female (% ages 25 and older)': 'female_population_with_some_secondary_education',
    'expected years of schooling, female (years)': 'expected_years_of_schooling_female',
    'adolescent birth rate (births per 1,000 women ages 15-19)': 'adolescent_birth_rate_per_1000_women_ages_15_19',
    'material footprint per capita (tonnes)': 'material_footprint_per_capita_tonnes',
    'mean years of schooling, male (years)': 'mean_years_of_schooling_male',
    'population with at least some secondary education, male (% ages 25 and older)': 'male_population_with_some_secondary_education',
    'share of seats in parliament, female (% held by women)': 'share_of_female_seats_in_parliament',
    'inequality in eduation': 'education_inequality',
    'gender development index (value)': 'gender_development_index',
    'coefficient of human inequality': 'coefficient_of_human_inequality',
    'labour force participation rate, male (% ages 15 and older)': 'male_labour_force_participation_rate',
    'human development index (value)': 'human_development_index',
    'gross national income per capita (2017 ppp$)': 'gross_national_income_per_capita_2017_ppp',
    'total population, as of 1 january (thousands)': 'total_population_as_of_january_1st_thousands',
    'total population, as of 1 july (thousands)': 'total_population_as_of_july_1st_thousands',
    'male population, as of 1 july (thousands)': 'male_population_as_of_july_1st_thousands',
    'female population, as of 1 july (thousands)': 'female_population_as_of_july_1st_thousands',
    'population density, as of 1 july (persons per square km)': 'population_density_as_of_july_1st_persons_per_square_km',
    'population sex ratio, as of 1 july (males per 100 females)': 'population_sex_ratio_as_of_july_1st_males_per_100_females',
    'median age, as of 1 july (years)': 'median_age_as_of_july_1st_years',
    'natural change, births minus deaths (thousands)': 'natural_change_births_minus_deaths_thousands',
    'rate of natural change (per 1,000 population)': 'rate_of_natural_change_per_1000_population',
    'population change (thousands)': 'population_change_thousands',
    'population growth rate (percentage)': 'population_growth_rate_percentage',
    'population annual doubling time (years)': 'population_annual_doubling_time_years',
    'births (thousands)': 'births_thousands',
    'births by women aged 15 to 19 (thousands)': 'births_by_women_aged_15_to_19_thousands',
    'crude birth rate (births per 1,000 population)': 'crude_birth_rate_births_per_1000_population',
    'total fertility rate (live births per woman)': 'total_fertility_rate_live_births_per_woman',
    'net reproduction rate (surviving daughters per woman)': 'net_reproduction_rate_surviving_daughters_per_woman',
    'mean age childbearing (years)': 'mean_age_of_childbearing_years',
    'sex ratio at birth (males per 100 female births)': 'sex_ratio_at_birth_males_per_100_female_births',
    'total deaths (thousands)': 'total_deaths_thousands',
    'male deaths (thousands)': 'male_deaths_thousands',
    'female deaths (thousands)': 'female_deaths_thousands',
    'crude death rate (deaths per 1,000 population)': 'crude_death_rate_deaths_per_1000_population',
    'life expectancy at birth, both sexes (years)': 'life_expectancy_at_birth_both_sexes_years',
    'live births surviving to age 1 (thousands)': 'live_births_surviving_to_age_1_thousands',
    'net number of migrants (thousands)': 'net_number_of_migrants_thousands',
    'net migration rate (per 1,000 population)': 'net_migration_rate_per_1000_population'
}

# Rename columns
df_new = df.copy()
df_new = df.rename(columns=column_rename_dict, errors='raise')


In [16]:
df_new.head()

Unnamed: 0,country,year,mean_years_of_schooling_female,share_of_male_seats_in_parliament,mean_years_of_schooling,income_inequality,co2_emissions_per_capita_tonnes,expected_years_of_schooling_male,expected_years_of_schooling,female_labour_force_participation_rate,...,mean_age_of_childbearing_years,sex_ratio_at_birth_males_per_100_female_births,total_deaths_thousands,male_deaths_thousands,female_deaths_thousands,crude_death_rate_deaths_per_1000_population,life_expectancy_at_birth_both_sexes_years,live_births_surviving_to_age_1_thousands,net_number_of_migrants_thousands,net_migration_rate_per_1000_population
0,Afghanistan,1990,0.342503,,0.971125,,0.209727,3.939093,2.50405,15.18,...,29.88,105.5,203.514,110.102,93.412,18.449,45.9672,519.005,-666.001,-60.375
1,Afghanistan,1991,0.37186,,1.019356,,0.182525,4.17164,2.80655,15.214,...,29.812,105.6,192.531,104.274,88.257,17.946,46.6631,509.662,39.926,3.721
2,Afghanistan,1992,0.401218,,1.067586,,0.095233,4.417915,3.10905,15.223,...,29.885,105.6,191.913,104.633,87.28,17.222,47.5955,532.989,1834.556,164.629
3,Afghanistan,1993,0.430575,,1.115817,,0.084285,4.67873,3.41155,15.197,...,29.948,105.6,199.165,104.343,94.822,14.838,51.4664,646.635,1171.818,87.304
4,Afghanistan,1994,0.459933,,1.164047,,0.075054,4.954942,3.71405,15.178,...,29.789,105.7,222.214,119.122,103.092,14.689,51.4945,732.684,665.41,43.986


In [17]:
df_new.isnull().sum().to_csv('desc_groupby_country.csv')

In [18]:
label_encoder_df = LabelEncoder()

df_new["country_id"] = label_encoder_df.fit_transform(df_new["country"])
df_new = df_new.set_index("country", append=True)

y_col = ["life_expectancy_at_birth_both_sexes_years"]
x_col =[col for col in df_new.columns.to_list() if col not in y_col]

print(x_col)



['year', 'mean_years_of_schooling_female', 'share_of_male_seats_in_parliament', 'mean_years_of_schooling', 'income_inequality', 'co2_emissions_per_capita_tonnes', 'expected_years_of_schooling_male', 'expected_years_of_schooling', 'female_labour_force_participation_rate', 'female_population_with_some_secondary_education', 'expected_years_of_schooling_female', 'adolescent_birth_rate_per_1000_women_ages_15_19', 'material_footprint_per_capita_tonnes', 'mean_years_of_schooling_male', 'male_population_with_some_secondary_education', 'share_of_female_seats_in_parliament', 'education_inequality', 'gender_development_index', 'coefficient_of_human_inequality', 'male_labour_force_participation_rate', 'human_development_index', 'gross_national_income_per_capita_2017_ppp', 'total_population_as_of_january_1st_thousands', 'total_population_as_of_july_1st_thousands', 'male_population_as_of_july_1st_thousands', 'female_population_as_of_july_1st_thousands', 'population_density_as_of_july_1st_persons_per

In [19]:
# Train-Test split
X_train, X_test, y_train, y_test = train_test_split(df_new[x_col], df_new[y_col], test_size=0.2, random_state=42, stratify=df_new['country_id'])

In [21]:
def estimate_missing_values_grouped(df_group):
    # Calculate growth rates
    df_group['growth_rate'] = df_group['expected_years_of_schooling'].pct_change()

    # Calculate average growth rate
    # df_group['growth_rate'] = pd.to_numeric(df_group['growth_rate'], errors='coerce')
    avg_growth_rate = df_group['growth_rate'].mean()
    
    # Fill missing values using the average growth rate
    for i in range(1, len(df_group), -1):
        if pd.isna(df_group.at[df_group.index[i], 'expected_years_of_schooling']):
            df_group.at[df_group.index[i], 'expected_years_of_schooling'] = df_group.at[df_group.index[i+1], 'expected_years_of_schooling'] / (1 + avg_growth_rate)
    
    
    # Drop Growth Rate column
    df_group = df_group.drop(columns=['growth_rate'])
    
    return df_group

In [22]:
X_train_filled_avg = X_train.groupby('country_id').apply(estimate_missing_values_grouped)

  df_group['growth_rate'] = df_group['expected_years_of_schooling'].pct_change()
  df_group['growth_rate'] = df_group['expected_years_of_schooling'].pct_change()
  df_group['growth_rate'] = df_group['expected_years_of_schooling'].pct_change()
  df_group['growth_rate'] = df_group['expected_years_of_schooling'].pct_change()
  df_group['growth_rate'] = df_group['expected_years_of_schooling'].pct_change()
  df_group['growth_rate'] = df_group['expected_years_of_schooling'].pct_change()
  df_group['growth_rate'] = df_group['expected_years_of_schooling'].pct_change()
  df_group['growth_rate'] = df_group['expected_years_of_schooling'].pct_change()
  df_group['growth_rate'] = df_group['expected_years_of_schooling'].pct_change()
  df_group['growth_rate'] = df_group['expected_years_of_schooling'].pct_change()
  df_group['growth_rate'] = df_group['expected_years_of_schooling'].pct_change()
  df_group['growth_rate'] = df_group['expected_years_of_schooling'].pct_change()
  df_group['growth_rate'] = 

In [11]:
X_train_filled_avg.to_csv('xtrain_avg.csv')

In [None]:
def fill_missing(group):
    if group.isnull().any():
        filled_fw = group.ffill()
        filled_bw = group.bfill()
        mean_values = (filled_fw + filled_bw) / 2
        print(mean_values)
        group_filled = group.fillna(mean_values)
        return group_filled
    else:
        return group

X_train_filled = X_train.groupby('country')['expected_years_of_schooling'].apply(fill_missing)

# Combine results into one DataFrame if not empty
X_train_filled = X_train_filled.reset_index(level=0, drop=True)
X_train_filled = pd.concat([X_train_filled], axis=1)

# Fill remaining missing values
X_train_filled['expected_years_of_schooling'] = X_train_filled['expected_years_of_schooling'].fillna(X_train_filled['expected_years_of_schooling'].mean())

# Save null counts per group to CSV
X_train_filled.isnull().sum().to_csv('desc_groupby_country_filled.csv')

X_train_filled.to_csv('xtrainfilled.csv')


In [None]:
X_train_filled_2 = X_train[X_train['country_id'].isin([113, 53])].groupby('country')['expected_years_of_schooling'].apply(fill_missing)


In [None]:
X_train[X_train['country_id'].isin([53])]

In [None]:
corrs = {x: df_new[x].corr(df_new[y_col[0]], method='pearson') for x in x_col}
corrs_df = pd.DataFrame.from_dict(corrs, orient='index', columns=['Correlation'])
corrs_df = corrs_df.sort_values(by='Correlation', ascending=False)

corrs_df

In [None]:
halfs = corrs_df[(corrs_df['Correlation'] >=0.5) | (corrs_df['Correlation']<=-0.5)]
halfs

We can drop the variable ```human_development_index``` as it is a function of life expectancy at birth, education, and other variables and therefore doesn't have any real causality to the life expectancy at birth variable. We can use ```expected_years_of_schooling``` as there may be causality (we can drop the female and male tagged versions of the variable).

In [None]:
reg = LinearRegression().fit(X_train[['expected_years_of_schooling']], y_train)
score = reg.score(X_train[['expected_years_of_schooling']], y_train)
coef = reg.coef_
intercept = reg.intercept_

print(score, coef, intercept)