# Initializing

In [1]:
# Sanity check: the active conda environment should be aws_env
!conda info | grep 'active env'

     active environment : aws_env
    active env location : /home/hassan101/anaconda3/envs/aws_env


In [2]:
# Read the AWS access key ID and secret access key from environment variables
# so that no credentials are stored in the notebook itself.
import os

aws_akid, aws_sak = os.environ['AWS_KID'], os.environ['AWS_AK']

In [3]:
#Importing libraries
import boto3
import pandas as pd
import io
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Getting processed data from S3 bucket

In [4]:
# Authentication: create both a resource handle and a low-level client for S3
# from the same pair of credentials.
s3_credentials = {'aws_access_key_id': aws_akid, 'aws_secret_access_key': aws_sak}
res_s3 = boto3.resource('s3', **s3_credentials)
client_s3 = boto3.client('s3', **s3_credentials)

In [5]:
# List all objects in the bucket that holds the processed dataset
bucket_name = 'reg-dataset-processed'

response = client_s3.list_objects_v2(Bucket=bucket_name)
# 'Contents' is absent from the response when the bucket has no objects, so
# fall back to an empty list instead of raising KeyError.
# NOTE: a single list_objects_v2 call returns at most 1000 keys; use a
# paginator if the bucket ever grows beyond that.
for obj in response.get('Contents', []):
    print(obj)

{'Key': 'lifexp-processed.csv', 'LastModified': datetime.datetime(2023, 2, 9, 9, 16, 24, tzinfo=tzutc()), 'ETag': '"9e138a0af6f65a9b4338da770613527c"', 'Size': 98812, 'StorageClass': 'STANDARD'}


In [6]:
# Download the processed CSV from S3 and load it into a DataFrame.
response = client_s3.get_object(Bucket=bucket_name, Key="lifexp-processed.csv")

status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

# Fail fast: every later cell needs `df`, so stopping here with a clear
# message is better than a NameError further down the notebook.
if status != 200:
    raise RuntimeError(f"Unsuccessful S3 get_object response. Status - {status}")

print(f"Successful S3 get_object response. Status - {status}")
df = pd.read_csv(response.get("Body"))

Successful S3 get_object response. Status - 200


In [7]:
# Preview the first rows of the downloaded dataset
df.head(20)

Unnamed: 0,HIV/AIDS,Income composition of resources,Adult mortality,Under-five deaths,Thinness 5-9 years,Status,Polio,Total expenditure
0,0.1,0.479,263,83,17.3,Developing,6,8.16
1,0.1,0.476,271,86,17.5,Developing,58,8.18
2,0.1,0.47,268,89,17.7,Developing,62,8.13
3,0.1,0.463,272,93,18.0,Developing,67,8.52
4,0.1,0.454,275,97,18.2,Developing,68,7.87
5,0.1,0.448,279,102,18.4,Developing,66,9.2
6,0.1,0.434,281,106,18.7,Developing,63,9.42
7,0.1,0.433,287,110,18.9,Developing,64,8.33
8,0.1,0.415,295,113,19.1,Developing,63,6.73
9,0.1,0.405,295,116,19.3,Developing,58,7.43


In [8]:
# Inspect column dtypes; 'Status' is the only non-numeric (object) column
df.dtypes

HIV/AIDS                           float64
Income composition of resources    float64
Adult mortality                      int64
Under-five deaths                    int64
Thinness 5-9 years                 float64
Status                              object
Polio                                int64
Total expenditure                  float64
dtype: object

# Feature engineering

The following steps need to be performed:
- Separate the features from the target values
- One-hot encoding (OHE) of the categorical features
- Train/test split
- Pipeline:
    - Scaling
    - ML model development

In [16]:
# Separate features and labels: the last column is the target and every
# column before it is a feature.
X = df.iloc[:, :-1]
y = df.iloc[:, [-1]]  # list indexer keeps y as a single-column DataFrame
print(X.shape, y.shape, sep='\n')

(2556, 7)
(2556, 1)


In [17]:
# Perform one-hot encoding (OHE): pd.get_dummies expands the categorical
# 'Status' column into Status_Developed / Status_Developing indicator columns
X_ohe = pd.get_dummies(X)
print(X_ohe.shape)
X_ohe.head()

(2556, 8)


Unnamed: 0,HIV/AIDS,Income composition of resources,Adult mortality,Under-five deaths,Thinness 5-9 years,Polio,Status_Developed,Status_Developing
0,0.1,0.479,263,83,17.3,6,0,1
1,0.1,0.476,271,86,17.5,58,0,1
2,0.1,0.47,268,89,17.7,62,0,1
3,0.1,0.463,272,93,18.0,67,0,1
4,0.1,0.454,275,97,18.2,68,0,1


# Training via pipelines

In [None]:
# Train/test split: hold out 30% of the rows for testing; random_state fixes
# the shuffle so the split is reproducible
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_ohe, y, test_size=0.3, random_state=1)

In [None]:
# Set up the model-development pipelines: every candidate model is preceded
# by standard scaling of the features.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# The ensemble models are stochastic; seed them so that re-running the
# notebook reproduces the same fitted models and metrics.
mldev_pipeline = {
    'linreg': make_pipeline(StandardScaler(), LinearRegression()),
    'rf': make_pipeline(StandardScaler(), RandomForestRegressor(random_state=1)),
    'gb': make_pipeline(StandardScaler(), GradientBoostingRegressor(random_state=1))
}

for model_name, pipeline in mldev_pipeline.items():
    print(model_name)
    print(pipeline)

In [None]:
# The pipelines can also be viewed visually: as the last expression of the
# cell, the pipeline object is displayed by the notebook
mldev_pipeline['linreg']

In [None]:
# Inspect the default hyperparameters of each candidate estimator
default_estimators = (
    ('LinReg Params:', LinearRegression),
    ('RF Params:', RandomForestRegressor),
    ('GB Params:', GradientBoostingRegressor),
)
for label, estimator_cls in default_estimators:
    print(label, estimator_cls().get_params())

In [None]:
# Grid of hyperparameters to tune, keyed by pipeline name. Parameter names
# follow the '<pipeline step name>__<parameter>' convention used by
# GridSearchCV on pipelines built with make_pipeline.
hpgrid = {

    # Linear regression: nothing to tune, a single candidate is cross-validated
    'linreg':{
    },

    'rf':{
        'randomforestregressor__n_estimators':[100, 200, 300],
        # None (the Python object, not the string 'None') means unlimited
        # depth; a string is not a valid max_depth and makes those fits fail.
        'randomforestregressor__max_depth':[5, 10, None]
    },

    'gb':{
        'gradientboostingregressor__n_estimators':[100,200,300],
        'gradientboostingregressor__learning_rate':[0.1, 0.2],
    }
}

In [None]:
# Tune and train every pipeline with a 10-fold cross-validated grid search
from sklearn.model_selection import GridSearchCV

y_train_1d = y_train.values.ravel()  # flatten the single-column target frame
trained_models = {}
for model_name, pipeline in mldev_pipeline.items():
    print(f'Training model: {model_name}')
    model = GridSearchCV(
        estimator=pipeline,
        param_grid=hpgrid[model_name],
        n_jobs=-1,  # use all processors for parallel computing
        cv=10,
    )
    # fit() returns the fitted search object itself
    trained_models[model_name] = model.fit(X_train, y_train_1d)

trained_models

In [None]:
# Check metrics of every tuned model on the held-out test set
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score

for model_name, model in trained_models.items():
    y_hat = model.predict(X_test)
    # RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
    # is deprecated since scikit-learn 1.4 and scheduled for removal.
    rmse = mean_squared_error(y_test, y_hat) ** 0.5
    print(f'Metrics for {model_name}:',
    'MAE=', mean_absolute_error(y_test,y_hat),
    'RMSE=', rmse,
    'R2=', r2_score(y_test,y_hat)
    )

In [None]:
# Checking parameters of the grid-search object for the random-forest
# pipeline, which the notebook treats as the best model
trained_models['rf'].get_params()

In [None]:
# Saving best model: pickle the fitted grid-search object of the
# random-forest pipeline to bestmodel.pkl
import pickle
with open('bestmodel.pkl', 'wb') as f:
    pickle.dump(trained_models['rf'], f)

In [None]:
# If we need to reload the model in future, we can use this code.
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# come from a trusted source.
with open('bestmodel.pkl', 'rb') as f:
    reloaded_model = pickle.load(f)

reloaded_model

# Training from scratch

In [18]:
# Train/test split: hold out 30% of the rows for testing; random_state fixes
# the shuffle so the split is reproducible
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_ohe, y, test_size=0.3, random_state=1)

In [None]:
# Scaling: fit the standardisation on the training split only
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)  # fit() returns the fitted scaler
scaler

In [None]:
# Train a random forest on the scaled training data and score it with
# 10-fold cross-validation.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Scale once and reuse the result; flatten the single-column target frame.
X_train_sc = scaler.transform(X_train)
y_train_1d = y_train.values.ravel()

# Fixed seed so that the fitted model (and its metrics) are reproducible.
rfreg = RandomForestRegressor(random_state=1)
rfreg.fit(X_train_sc, y_train_1d)
# cross_val_score fits clones of the estimator, so `rfreg` itself stays
# fitted on the full training split.
scores = cross_val_score(rfreg, X_train_sc, y_train_1d, cv=10)
scores

In [None]:
# Check metrics of the random forest on the held-out test set
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score
y_hat = rfreg.predict(scaler.transform(X_test))
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which is
# deprecated since scikit-learn 1.4 and scheduled for removal.
rmse = mean_squared_error(y_test, y_hat) ** 0.5
print('MAE=', mean_absolute_error(y_test,y_hat),
'RMSE=', rmse,
'R2=', r2_score(y_test,y_hat)
)

In [None]:
# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1),
# with n = number of test predictions and p = number of feature columns
n_samples = len(y_hat)
n_features = len(X_test.columns)
r2_adj = 1 - (1 - r2_score(y_test, y_hat)) * (n_samples - 1) / (n_samples - n_features - 1)
r2_adj

In [None]:
# Show the hyperparameters of the fitted random forest
rfreg.get_params()

In [None]:
# Saving model: the context manager guarantees the file is flushed and
# closed once the pickle has been written.
import pickle

with open('model_fromscratch.pkl', 'wb') as f:
    pickle.dump(rfreg, f)

In [None]:
# Saving standardization params (the fitted scaler); the context manager
# closes the file once the pickle has been written.
with open('scaling.pkl', 'wb') as f:
    pickle.dump(scaler, f)

In [None]:
#If above training does not work, check optimal train test split by this:
# NOTE: the loop rebinds X_train, X_test, y_train, y_test, scaler, rfreg,
# scores and y_hat on every iteration, so after it has run those names refer
# to the last split (random_state=9).
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score

for i in range(0,10):
    print('----------------------------------------------------------')
    print('Random state for split=', i)
    X_train, X_test, y_train, y_test = train_test_split(X_ohe, y, test_size=0.3, random_state=i)

    scaler = StandardScaler()
    scaler.fit(X_train)

    rfreg = RandomForestRegressor()
    rfreg.fit(scaler.transform(X_train), y_train.values.ravel())
    scores = cross_val_score(rfreg, scaler.transform(X_train), y_train.values.ravel(), cv=10)
    print(scores)
    y_hat = rfreg.predict(scaler.transform(X_test))
    # RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
    # is deprecated since scikit-learn 1.4 and scheduled for removal.
    rmse = mean_squared_error(y_test, y_hat) ** 0.5
    print('MAE=', mean_absolute_error(y_test,y_hat),
    'RMSE=', rmse,
    'R2=', r2_score(y_test,y_hat)
    )

# Plotting the predictions

In [None]:
#Plotting parity plot
import matplotlib.pyplot as plt
%matplotlib inline

plt.scatter(y_test, y_hat)
plt.plot(y_test,y_test,'k-') 
plt.xlabel('True Life Expectancy')
plt.ylabel('Predicted Life Expectancy')

In [None]:
#Plotting residuals (true minus predicted values) as a kernel density estimate
import seaborn as sns

# Flatten the single-column y_test before subtracting the predictions
residuals = y_test.values.ravel() - y_hat
sns.displot(residuals, kind = 'kde')
plt.xlabel('Residuals')

In [None]:
# Plotting scatter plot for residuals and predictions
# Label the x axis with the actual name of the predicted target column.
target_name = y_test.columns[0]

plt.scatter(y_hat, residuals)
plt.xlabel(f'Predicted {target_name}')
plt.ylabel('Residuals');

# Testing for app API (SINGLE SAMPLE ONLY)

In [110]:
#Testing with sample data: the first training row, kept as a one-row frame
sample = X_train.iloc[0:1,:]

print('Actual label for data:', y_train.iloc[0]['Total expenditure'])
print(f'These are the features with data shape:{sample.shape}')
# The index is written too, so it can be restored with index_col=0 on reading
sample.to_csv('sample.csv', index=True)
sample.head()

Actual label for data: 2.37
These are the features with data shape:(1, 8)


Unnamed: 0,HIV/AIDS,Income composition of resources,Adult mortality,Under-five deaths,Thinness 5-9 years,Polio,Status_Developed,Status_Developing
1077,0.1,0.624,213,237,1.9,79,0,1


In [111]:
#Predicting with script (passing without additional header)
import pickle
import json
import numpy as np

# Load the persisted model and scaler; the context managers close the files
# right after unpickling. NOTE: only unpickle files from a trusted source.
with open('model_fromscratch.pkl', 'rb') as f:
    model = pickle.load(f)
with open('scaling.pkl', 'rb') as f:
    scaler = pickle.load(f)

#Importing sample data and preparing data in form of JSON
sample = pd.read_csv('sample.csv', index_col=0)
# 'list' orientation keeps only column -> values pairs (the index is dropped).
# To use the index as header for the JSON later, pass 'index' instead.
dic = sample.to_dict('list')
data_as_json=json.dumps(dic)

#Loading JSON data on the app, and transforming for predictions
data_as_dic = json.loads(data_as_json)
# Values are taken in key order, so the keys must be in the same order as
# the columns the scaler was fitted on.
data_as_array = np.array(list(data_as_dic.values())).reshape(1,-1)
print('Data shape:', data_as_array.shape)
data_sc=scaler.transform(data_as_array)
output=model.predict(data_sc)
print('Predicted value:', output[0])

Data shape: (1, 8)
Predicted value: 2.5392000000000023




In [70]:
#Predicting with script (passing without additional header) - Manual input
import pickle
import json
import numpy as np

# Load the persisted model and scaler; the context managers close the files
# right after unpickling. NOTE: only unpickle files from a trusted source.
with open('model_fromscratch.pkl', 'rb') as f:
    model = pickle.load(f)
with open('scaling.pkl', 'rb') as f:
    scaler = pickle.load(f)

#Preparing data in form of JSON
dic = {
    "HIV/AIDS": 0.1,
    "Income composition of resources": 0.624,
    "Adult mortality": 213.0,
    "Under-five deaths": 237.0,
    "Thinness 5-9 years": 1.9,
    "Polio": 79.0,
    "Status_Developed": 0.0,
    "Status_Developing": 1.0
}
data_as_json=json.dumps(dic)

#Loading JSON data on the app, and transforming for predictions
data_as_dic = json.loads(data_as_json)
# Values are taken in key order, so the keys must be in the same order as
# the columns the scaler was fitted on.
data_as_array = np.array(list(data_as_dic.values())).reshape(1,-1)
print('Data shape:', data_as_array.shape)
data_sc=scaler.transform(data_as_array)
output=model.predict(data_sc)
print('Predicted value:', output[0])

Data shape: (1, 8)
Predicted value: 2.5392000000000023




In [54]:
#Predicting with script (passing with additional "data" header) - Manual input
import pickle
import json
import numpy as np

# Load the persisted model and scaler; the context managers close the files
# right after unpickling. NOTE: only unpickle files from a trusted source.
with open('model_fromscratch.pkl', 'rb') as f:
    model = pickle.load(f)
with open('scaling.pkl', 'rb') as f:
    scaler = pickle.load(f)

#Preparing data in form of JSON
dic = {
    "data":{
        "HIV/AIDS": 0.1,
        "Income composition of resources": 0.624,
        "Adult mortality": 213.0,
        "Under-five deaths": 237.0,
        "Thinness 5-9 years": 1.9,
        "Polio": 79.0,
        "Status_Developed": 0.0,
        "Status_Developing": 1.0
    }
}
# Only the payload under the "data" header is serialised
data_as_json=json.dumps(dic['data'])

#Loading JSON data on the app, and transforming for predictions
data_as_dic = json.loads(data_as_json)
# Values are taken in key order, so the keys must be in the same order as
# the columns the scaler was fitted on.
data_as_array = np.array(list(data_as_dic.values())).reshape(1,-1)
print('Data shape:', data_as_array.shape)
data_sc=scaler.transform(data_as_array)
output=model.predict(data_sc)
print('Predicted value:', output[0])

Data shape: (1, 8)
Predicted value: 2.5392000000000023


