# Validation

> Team Name: *S Legends*
>
> Team Members: Myles, Tani, Arjan, Archie

In [106]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import statsmodels.api as sm
import statsmodels.tools
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

**The validation method we use is K-fold cross-validation:**  

**What is K-fold cross-validation?**  
This splits up the whole dataset into a K number of subsets (folds), and then uses K-1 folds as the training set and validates on the remaining 1 fold.

**Why use K-fold cross_validation?**  
This means that every observation in the data is used to test one time and it can provide a better estimate of the accuracy of the model.

In [19]:
df = pd.read_csv('Life Expectancy Data.csv') # Load the original dataset

In [189]:
kf = KFold(n_splits = 5, shuffle=True, random_state = 42) # Initialise the KFold function, informing it how many splits to make

In [164]:
feature_cols = list(df.columns)
feature_cols.remove('Life_expectancy')

X = df[feature_cols]
y = df['Life_expectancy']

In [193]:
fold_indices = [] # Store the train-test indices as a tuple for each fold

for i, (train_indices, test_indices) in enumerate(kf.split(X)):
    fold_indices.append((train_indices, test_indices))

Feature Engineer using function made previously:

In [205]:
# Function created to feature engineer.
def feature_engineering(X):
    
    X = X.drop(columns = ["Country", "Region", "Year"])
    
    # Remove highly correlated features to reduce multicollinearity.
    corr_matrix = X.corr().abs() # Creates a correlation matrix for all the values in the X dataframe, making sure its absolute values (this prevents a division by zero error).
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k = 1).astype(bool)) # Takes the upper triangle of the correlation matrix, k = 1, is the upper diagonal, and changes the type to boolean.
    to_drop = [column for column in upper.columns if any(upper[column] > 0.9)] # Takes all the columns with correlation greater than 0.9 in one list.
    X = X.drop(columns=to_drop) # Drops all of the columns with the correlations > 0.9 - this is the feature selection step.

    # Remove features with low variance or high VIF
    X_scaled = StandardScaler().fit_transform(X) # Uses the StandardScaler() from sklearn and uses fit_transform on X to scale it using the mean and standard deviation.
    X_df = pd.DataFrame(X_scaled, columns = X.columns) # Converts the above X_scaled back into a dataframe - as it is an np array.
    
    vif_data = pd.DataFrame()
    vif_data["feature"] = X_df.columns # Adds a column called feature and fills it with the columns from X.
    vif_data["VIF"] = [variance_inflation_factor(X_df.values, i) for i in range(X_df.shape[1])] # Iterates through and calculates the VIF for each feature.
    X = X_df.drop(columns = vif_data[vif_data["VIF"] > 10]["feature"]) # Drops the columns with a VIF value > 10, as this suggests high multicollinearity.

    # Returns the feature engineered version of X.
    return X

**Validation:**

In [211]:
def validation(kf, fold_indices, X, y):
    rmse = []
    rsquared = []
    cond_no = []
    
    for fold in range(kf.get_n_splits(X)+1):

        # Training on the fold's training set
        train_index = fold_indices[fold][0]
        test_index = fold_indices[fold][1]
        
        X_fold_train = X.loc[train_index]
        y_fold_train = y.loc[train_index]
        
        X_fold_train_fe = feature_engineering(X_fold_train)
        y_fold_train = y_fold_train.loc[X_fold_train_fe.index] 

        model = sm.OLS(y_fold_train, X_fold_train_fe)
        results = model.fit()


        # Testing on the fold's testing set
        X_fold_test = X.loc[test_index]
        y_fold_test = y.loc[test_index]

        X_fold_test_fe = feature_engineering(X_fold_test)
        y_fold_test = y_fold_test.loc[X_fold_test_fe.index]
        y_pred = results.predict(X_fold_test_fe)
    
        R_squared1 = results.rsquared
        cond_no1 = results.condition_number
        rmse1 = statsmodels.tools.eval_measures.rmse(y_fold_test, y_pred)

        rmse.append(rmse1)
        rsquared.append(R_squared1)
        cond_no.append(cond_no1)

    mean_rmse = np.mean(rmse)
    mean_rsquared = np.mean(rsquared)
    mean_cond_no = np.mean(cond_no)

    return mean_rmse, mean_rsquared, mean_cond_no

In [213]:
(rmse, rsquared, mean_cond_no) =  validation(kf, fold_indices, X, y)

KeyError: '[29, 30, 32, 43, 44, 45, 51, 56, 63, 67, 70, 73, 80, 87, 93, 96, 102, 104, 108, 109, 111, 124, 134, 135, 141, 149, 152, 163, 168, 170, 173, 175, 178, 188, 194, 196, 199, 210, 211, 212, 218, 221, 226, 231, 233, 237, 239, 246, 251, 252, 254, 257, 259, 270, 272, 282, 289, 291, 296, 298, 309, 313, 314, 318, 321, 322, 324, 331, 332, 354, 358, 365, 367, 368, 381, 387, 393, 402, 407, 408, 410, 411, 414, 415, 420, 422, 430, 432, 433, 435, 436, 439, 443, 450, 456, 457, 460, 462, 463, 464, 471, 478, 479, 485, 486, 495, 498, 506, 507, 509, 511, 518, 521, 527, 528, 532, 533, 535, 542, 543, 544, 554, 557, 564, 565, 567, 568, 572, 581, 594, 598, 599, 602, 605, 610, 612, 642, 644, 650, 655, 662, 676, 677, 678, 679, 685, 691, 693, 695, 700, 705, 707, 727, 736, 741, 742, 744, 755, 759, 765, 767, 772, 783, 785, 787, 790, 794, 798, 802, 807, 809, 817, 819, 831, 834, 840, 841, 847, 857, 869, 879, 889, 900, 903, 907, 911, 912, 926, 927, 930, 937, 940, 941, 942, 958, 962, 965, 978, 986, 999, 1005, 1011, 1017, 1023, 1025, 1027, 1029, 1033, 1034, 1053, 1055, 1058, 1061, 1064, 1068, 1075, 1080, 1088, 1090, 1091, 1093, 1097, 1099, 1102, 1103, 1110, 1114, 1117, 1123, 1125, 1151, 1159, 1161, 1177, 1179, 1187, 1190, 1196, 1200, 1206, 1207, 1208, 1221, 1225, 1228, 1229, 1230, 1234, 1236, 1242, 1244, 1251, 1255, 1260, 1268, 1270, 1271, 1278, 1286, 1288, 1289, 1292, 1293, 1298, 1313, 1316, 1317, 1323, 1334, 1336, 1338, 1339, 1344, 1350, 1352, 1361, 1362, 1368, 1370, 1376, 1378, 1380, 1381, 1389, 1391, 1392, 1393, 1395, 1398, 1410, 1412, 1413, 1417, 1418, 1419, 1421, 1422, 1427, 1431, 1436, 1444, 1449, 1453, 1457, 1461, 1467, 1472, 1480, 1494, 1503, 1507, 1511, 1512, 1533, 1536, 1539, 1550, 1554, 1556, 1575, 1577, 1580, 1583, 1584, 1586, 1588, 1590, 1599, 1600, 1602, 1608, 1612, 1620, 1652, 1655, 1657, 1662, 1672, 1674, 1675, 1702, 1703, 1711, 1727, 1729, 1730, 1735, 1737, 1744, 1748, 1752, 1760, 1767, 1771, 1788, 1789, 1790, 1791, 1800, 1803, 1805, 1807, 1808, 1811, 1812, 1813, 1826, 1830, 1835, 1837, 1845, 1846, 1861, 1868, 1878, 1879, 1894, 1897, 1905, 1909, 1913, 1918, 1926, 1937, 1939, 1949, 1952, 1954, 1961, 1962, 1963, 1964, 1967, 1970, 1971, 1972, 1991, 1999, 2001, 2002, 2005, 2007, 2015, 2016, 2029, 2039, 2044, 2050, 2051, 2063, 2064, 2066, 2069, 2079, 2089, 2106, 2119, 2120, 2123, 2124, 2125, 2129, 2130, 2132, 2138, 2149, 2151, 2162, 2164, 2168, 2176, 2179, 2189, 2206, 2210, 2211, 2230, 2236, 2242, 2248, 2250, 2252, 2256, 2258, 2262, 2268, 2274, 2282, 2284, 2285] not in index'

In [201]:
train_index = fold_indices[0][0]

In [203]:
train_index

array([   0,    1,    2, ..., 2861, 2862, 2863])