In [18]:
from sklearn import model_selection
from sklearn.model_selection import KFold, StratifiedKFold, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np

##### Load data

dataset = pd.read_csv("../../../Life-Expectancy-Data.csv")

classCategories = np.asarray(dataset["Economy_status_Developed"])

dataset = dataset.drop(["Economy_status_Developed", 'Economy_status_Developing', 'Country'], axis=1)





##### Preprocessing


with_region = True

cols = [col for col in dataset.columns if col not in ['Economy_status_Developed']]


if with_region:
    region_unique = dataset['Region'].unique()
    region_encoder = OneHotEncoder()
    region_encoded = region_encoder.fit_transform(dataset[['Region']])
    print(region_encoded.toarray().shape)
    region_encoded_df = pd.DataFrame(region_encoded.toarray(), columns=region_encoder.get_feature_names_out())
    print(sum(region_encoded_df["Region_North America"])/len(region_encoded_df["Region_North America"]))
    print(region_encoded_df["Region_North America"].value_counts())
    print(dataset.shape)
    raw_data = pd.concat([dataset, region_encoded_df], axis=1)
    print(raw_data.shape)

    raw_data = raw_data.drop(['Region'], axis=1)
    cols.remove('Region')
    cols = cols + list(region_encoded_df.columns)
    y = classCategories
    X = raw_data[cols].values
else:
    cols.remove('Region')
    dataset = dataset.drop(['Region'], axis=1)
    X = dataset[cols].values
    y = classCategories





(2864, 9)
0.01675977653631285
Region_North America
0.0    2816
1.0      48
Name: count, dtype: int64
(2864, 18)
(2864, 27)


In [19]:
##### Run cross validation

    
print(X.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    
trainStd = np.std(X_train, axis=0)
trainMean = np.mean(X_train, axis=0)

testNorm = (X_test - trainMean) / trainStd 
trainNorm = (X_train - trainMean) / trainStd 

model = LogisticRegression(penalty="l2",solver="saga",max_iter=5000, C=1/0.034551, random_state=0)

model.fit(trainNorm, y_train)
for i, coef in enumerate(model.coef_[0]):
    print("Attribute: ", cols[i], " Coef: ", coef)
model.coef_[0]



(2864, 26)
Attribute:  Year  Coef:  -0.9969282991945269
Attribute:  Infant_deaths  Coef:  0.05157569729562849
Attribute:  Under_five_deaths  Coef:  0.7386164550937252
Attribute:  Adult_mortality  Coef:  -2.113793564608071
Attribute:  Alcohol_consumption  Coef:  2.769857619424301
Attribute:  Hepatitis_B  Coef:  -1.2646668012257085
Attribute:  Measles  Coef:  -0.8516585785925759
Attribute:  BMI  Coef:  -3.601954397597585
Attribute:  Polio  Coef:  -0.009865246348470115
Attribute:  Diphtheria  Coef:  1.0024314698559147
Attribute:  Incidents_HIV  Coef:  0.6328468700777843
Attribute:  GDP_per_capita  Coef:  1.8753835672985366
Attribute:  Population_mln  Coef:  0.8836355528556551
Attribute:  Thinness_ten_nineteen_years  Coef:  -3.1161567032784343
Attribute:  Thinness_five_nine_years  Coef:  -3.15076560969885
Attribute:  Schooling  Coef:  3.102369545226979
Attribute:  Life_expectancy  Coef:  4.371179701285574
Attribute:  Region_Africa  Coef:  1.3783038636934397
Attribute:  Region_Asia  Coef:  

array([-0.9969283 ,  0.0515757 ,  0.73861646, -2.11379356,  2.76985762,
       -1.2646668 , -0.85165858, -3.6019544 , -0.00986525,  1.00243147,
        0.63284687,  1.87538357,  0.88363555, -3.1161567 , -3.15076561,
        3.10236955,  4.3711797 ,  1.37830386, -3.43507439, -1.8231258 ,
        4.15383881,  0.65907125, -0.01433473,  0.61107004, -0.99117699,
       -1.53687086])

In [14]:
lambdas = np.logspace(-3, 1, 40)
for l in lambda_interval:
    print(l, " and ", 1/l)
    

1e-08  and  100000000.0
2.6101572156825384e-08  and  38311868.49557285
6.812920690579608e-08  and  14677992.676220706
1.7782794100389227e-07  and  5623413.251903491
4.641588833612782e-07  and  2154434.6900318824
1.21152765862859e-06  and  825404.1852680173
3.162277660168379e-06  and  316227.76601683797
8.25404185268019e-06  and  121152.76586285875
2.1544346900318867e-05  and  46415.888336127726
5.623413251903491e-05  and  17782.79410038923
0.00014677992676220705  and  6812.920690579608
0.0003831186849557293  and  2610.157215682533
0.001  and  1000.0
0.0026101572156825388  and  383.1186849557285
0.006812920690579622  and  146.77992676220674
0.01778279410038923  and  56.2341325190349
0.04641588833612782  and  21.54434690031882
0.12115276586285902  and  8.254041852680173
0.31622776601683794  and  3.162277660168379
0.825404185268019  and  1.2115276586285877
2.1544346900318865  and  0.4641588833612773
5.623413251903491  and  0.17782794100389226
14.677992676220736  and  0.06812920690579594
3