# In [None]:  (notebook cell prompt left over from the export)
#Import packages and functions
from IPython.display import display
%matplotlib inline
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 5)
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("white")
import copy
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn import svm
import xgboost as xgb


# Import dataset and slicing
# The first 79 rows are the ones analysed (presumably one row per country --
# TODO confirm against dataset.csv); columns from position 4 onwards are
# treated as the food variables.
dataset = pd.read_csv('dataset.csv', sep=',')
countries = dataset.country_name[:79]
continents = dataset.continent[:79]
foods_ori = dataset.iloc[:79, 4:]

# Correlation analysis
# Absolute Spearman correlation between every pair of food columns.
fc = foods_ori.corr(method='spearman', min_periods=1).abs()
corr_matrix = fc
# Keep only the strict upper triangle so each pair is inspected once.
# NOTE: the builtin `bool` is used because the `np.bool` alias was removed
# from NumPy (1.24+).
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# A column is flagged when it correlates above 0.9 with any earlier column.
to_drop = [column for column in upper.columns if (upper[column] > 0.9).any()]

# From here on work with all rows again, not only the first 79.
foods_ori = dataset.iloc[:, 4:]
food_clean = foods_ori.drop(to_drop, axis=1)
to_drop_pair = []
for food_col in to_drop:
    # Partner = the earlier column with the highest correlation to food_col.
    # idxmax is taken on this single column (which is guaranteed to hold at
    # least one value > 0.9) instead of on the whole frame, whose first
    # column is entirely NaN.
    food_col_pair = upper[food_col].idxmax()
    to_drop_pair.append(food_col_pair)
    # Replace the correlated pair by the mean of both columns.
    food_clean[str(food_col) + '+' + str(food_col_pair)] = (
        foods_ori[food_col] + foods_ori[food_col_pair]
    ) / 2
# A partner may already be gone (when it is itself in to_drop) or may appear
# several times; de-duplicate and ignore labels that are no longer present.
food_clean = food_clean.drop(list(dict.fromkeys(to_drop_pair)), axis=1, errors='ignore')
print('Columns to remove: ')
print(to_drop, to_drop_pair)


# Data slicing after correlation analysis
# Same first-79-rows subset as used for the correlation analysis.
foods = food_clean.iloc[:79, :]
# `todo` was never defined; the target column is read from `dataset`, like
# country_name and continent above.
prevalencias = dataset.prevalence_lancet[:79]

# Standardisation
# Features: zero mean / unit variance per column via StandardScaler.
standard_foods = StandardScaler().fit(foods)
foods_norm = standard_foods.transform(foods)
# Target: z-score computed by hand (np.std uses the population formula).
prevalences = prevalencias.values.ravel()
prevalences = (prevalences - np.mean(prevalences)) / np.std(prevalences)

# Principal Component Analysis
# PCA cannot return more components than min(n_samples, n_features); the
# original requested n_features, which raises when features outnumber rows.
pca = PCA(n_components=min(foods_norm.shape))
pca_result = pca.fit_transform(foods_norm)
# Cumulative share of variance explained by the first k components.
suma = np.cumsum(pca.explained_variance_ratio_)


#LOOCV function
def LOOCV(X, y, model, n_countries=None):
    """Run leave-one-out cross-validation of ``model`` on ``(X, y)``.

    For each index ``i`` in ``range(n_countries)`` the model is refitted on
    every row except ``i`` and then asked to predict row ``i``.

    Parameters
    ----------
    X : array-like, one row per sample
        Feature matrix. DataFrames are accepted; the data is converted with
        ``np.asarray`` so positional row indexing works.
    y : array-like
        Target values, aligned with the rows of ``X``.
    model : object
        Estimator exposing ``fit(X, y)`` (which must return the fitted
        estimator) and ``predict(X)``. The same object is refitted for
        every fold.
    n_countries : int, optional
        Number of leading rows to hold out in turn. Defaults to ``len(X)``.

    Returns
    -------
    err : list
        Squared error of each held-out prediction (same shape as the
        value returned by ``model.predict`` for a single row).
    ytest : list
        The held-out target values.
    ypred_all : list
        The raw ``model.predict`` output for each held-out row.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if n_countries is None:
        n_countries = len(X)

    positions = np.arange(len(X))
    ytest = []
    ypred_all = []
    for i in range(n_countries):
        keep = positions != i  # boolean mask selecting every row but i
        fitted = model.fit(X[keep], y[keep])
        ytest.append(y[i])
        # predict expects a 2-D input, hence the (1, n_features) reshape.
        ypred_all.append(fitted.predict(X[i].reshape(1, -1)))

    err = [np.square(pred - actual) for pred, actual in zip(ypred_all, ytest)]

    return err, ytest, ypred_all

n_countries = 79

# Every model below is scored with the same leave-one-out procedure.
# X is the standardised ndarray `foods_norm` rather than the DataFrame
# `foods`: LOOCV indexes rows positionally, and the RBF-kernel SVR is
# scale-sensitive. NOTE(review): confirm the standardised features are the
# intended input for the tree models too.
# LOOCV must receive an *unfitted* estimator; the original chained
# `.fit(X_train, y_train)` onto the RF and XGBoost constructors, but those
# names only exist inside LOOCV.

# Support Vector Machine
svr_mod = LOOCV(X=foods_norm, y=prevalences,
                model=svm.SVR(C=0.1, gamma=0.01, kernel='rbf'),
                n_countries=n_countries)
err_svr = svr_mod[0]
print('SVR')
print(f'MSE1: {np.mean(err_svr)}')
print(f'RMSE: {np.sqrt(np.mean(err_svr))}')

# Random Forest
rf = LOOCV(X=foods_norm, y=prevalences,
           model=RandomForestRegressor(max_depth=9, n_estimators=100),
           n_countries=n_countries)
err_rf = rf[0]
print('RF')
print(f'MSE1: {np.mean(err_rf)}')
print(f'RMSE: {np.sqrt(np.mean(err_rf))}')

# XGBoost
xgboost = LOOCV(X=foods_norm, y=prevalences,
                model=xgb.XGBRegressor(colsample_bytree=0.35, max_depth=5, n_estimators=1250,
                                       min_child_weight=5, gamma=0, subsample=0.9,
                                       reg_alpha=0.0001, reg_lambda=1, learning_rate=0.1),
                n_countries=n_countries)
err_xgb = xgboost[0]
print('XGBr')
print(f'MSE1: {np.mean(err_xgb)}')
print(f'RMSE: {np.sqrt(np.mean(err_xgb))}')

# Variable Importance List
# Refit the same random forest 250 times on the full data and average the
# per-feature importances over all fits (no random_state is set, so each
# fit can differ).
rfReg = RandomForestRegressor(max_depth=9, n_estimators=1000, max_features=20)
n_repeats = 250
rf_vil = []
for _ in range(n_repeats):
    rf = rfReg.fit(foods, prevalences)
    rf_vil.append(rf.feature_importances_)
media_rf = np.mean(rf_vil, axis=0)
# One row per feature, ordered from least to most important.
importance_table = pd.DataFrame({'variables': food_clean.columns, 'RF': media_rf})
df_rf = importance_table.sort_values(by=['RF'], ascending=True)


# Prediction of Top 5,10,15,20
# df_rf is sorted by ascending importance, so the first (largo - k) rows are
# the variables that do NOT belong to the top k.
df_model = df_rf
largo = len(df_model)
# The *names* of the variables to discard are needed here. The original read
# column 1 ('RF', the importance values), which cannot be used as labels for
# drop(). max(..., 0) keeps the slice valid when there are fewer than k
# variables (nothing is discarded in that case).
no_rf_top5 = df_model['variables'].iloc[:max(largo - 5, 0)].values
no_rf_top10 = df_model['variables'].iloc[:max(largo - 10, 0)].values
no_rf_top15 = df_model['variables'].iloc[:max(largo - 15, 0)].values
no_rf_top20 = df_model['variables'].iloc[:max(largo - 20, 0)].values
foods_df = pd.DataFrame(foods)
foods_df.columns = food_clean.columns
top5 = foods_df.drop(no_rf_top5, axis=1)
top10 = foods_df.drop(no_rf_top10, axis=1)
top15 = foods_df.drop(no_rf_top15, axis=1)
top20 = foods_df.drop(no_rf_top20, axis=1)
# `scaler` was used without ever being created; each subset is standardised
# independently (the scaler is refitted for every subset).
scaler = StandardScaler()
top5_normalised = scaler.fit_transform(top5)
top10_normalised = scaler.fit_transform(top10)
top15_normalised = scaler.fit_transform(top15)
top20_normalised = scaler.fit_transform(top20)

# Leave-one-out evaluation of a random forest on each top-k feature subset.
# Every subset gets its own freshly constructed regressor with identical
# hyper-parameters; element 0 of each LOOCV result is the list of squared
# errors.
def _rf_loocv(features):
    """Score a new RandomForestRegressor on `features` with LOOCV."""
    forest = RandomForestRegressor(max_depth=9, n_estimators=100)
    return LOOCV(X=features, y=prevalences, model=forest, n_countries=79)


# Prediction Top5
top_5 = _rf_loocv(top5_normalised)
err5 = top_5[0]

# Prediction Top10
top_10 = _rf_loocv(top10_normalised)
err10 = top_10[0]

# Prediction Top15
top_15 = _rf_loocv(top15_normalised)
err15 = top_15[0]

# Prediction Top20
top_20 = _rf_loocv(top20_normalised)
err20 = top_20[0]