In this notebook we perform analysis on an obesity dataset and build several obesity prediction models

In [1]:
import pandas as pd

In [2]:
#Importing and previewing the dataset
ob = pd.read_csv(r'/Users/mjones/Desktop/Data Sets/Obesity prediction.csv')
ob.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,Obesity
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [3]:
#The models won't work using string values. I use the get_dummies function to convert the binary categories into 0 and 1 values
ob = pd.get_dummies(ob, columns=['Gender', 'family_history', 'SMOKE', 'SCC', 'FAVC'])

In [4]:
#Checking the newly formed columns
ob.columns

Index(['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CAEC', 'CH2O', 'FAF', 'TUE',
       'CALC', 'MTRANS', 'Obesity', 'Gender_Female', 'Gender_Male',
       'family_history_no', 'family_history_yes', 'SMOKE_no', 'SMOKE_yes',
       'SCC_no', 'SCC_yes', 'FAVC_no', 'FAVC_yes'],
      dtype='object')

In [5]:
#get_dummies() splits the columns into two. I now drop the additional ones that are no longer necessary
ob = ob.drop(columns=['Gender_Male', 'SMOKE_yes', 'family_history_yes', 'SCC_no', 'FAVC_no'])

In [6]:
#Previewing the dataset after the transformations
ob.head()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CAEC,CH2O,FAF,TUE,CALC,MTRANS,Obesity,Gender_Female,family_history_no,SMOKE_no,SCC_yes,FAVC_yes
0,21.0,1.62,64.0,2.0,3.0,Sometimes,2.0,0.0,1.0,no,Public_Transportation,Normal_Weight,1,0,1,0,0
1,21.0,1.52,56.0,3.0,3.0,Sometimes,3.0,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight,1,0,0,1,0
2,23.0,1.8,77.0,2.0,3.0,Sometimes,2.0,2.0,1.0,Frequently,Public_Transportation,Normal_Weight,0,0,1,0,0
3,27.0,1.8,87.0,3.0,3.0,Sometimes,2.0,2.0,0.0,Frequently,Walking,Overweight_Level_I,0,1,1,0,0
4,22.0,1.78,89.8,2.0,1.0,Sometimes,2.0,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II,0,1,1,0,0


Next I need to change the other string columns. Since they have several options, the get_dummies function won't work. I need to explicitly assign new values. I start by using the unique() function to see the values that I will need to change. Then using the map() function to change the values into numeric types

In [7]:
ob['CAEC'].unique()

array(['Sometimes', 'Frequently', 'Always', 'no'], dtype=object)

In [8]:
ob['CALC'].unique()

array(['no', 'Sometimes', 'Frequently', 'Always'], dtype=object)

In [9]:
ob['CAEC']= ob['CAEC'].map({'Sometimes':1, 'Frequently':2, 'Always':3, 'no':4})

In [10]:
ob['CALC']= ob['CALC'].map({'Sometimes':1, 'Frequently':2, 'Always':3, 'no':4})

In [11]:
ob['MTRANS'].unique()

array(['Public_Transportation', 'Walking', 'Automobile', 'Motorbike',
       'Bike'], dtype=object)

In [12]:
ob['MTRANS']= ob['MTRANS'].map({'Public_Transportation':1, 'Walking':2, 'Automobile':3, 'Motorbike':4, 'Bike':5})

In [13]:
ob['Obesity'] = ob['Obesity'].map({'Normal_Weight':1, 'Overweight_Level_I':2, 'Overweight_Level_II':3, 'Obesity_Type_I':4, 'Obesity_Type_II':5, 'Obesity_Type_III':6, 'Insufficient_Weight':7})

In [14]:
#Previewing the transformed dataset
ob.head()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CAEC,CH2O,FAF,TUE,CALC,MTRANS,Obesity,Gender_Female,family_history_no,SMOKE_no,SCC_yes,FAVC_yes
0,21.0,1.62,64.0,2.0,3.0,1,2.0,0.0,1.0,4,1,1,1,0,1,0,0
1,21.0,1.52,56.0,3.0,3.0,1,3.0,3.0,0.0,1,1,1,1,0,0,1,0
2,23.0,1.8,77.0,2.0,3.0,1,2.0,2.0,1.0,2,1,1,0,0,1,0,0
3,27.0,1.8,87.0,3.0,3.0,1,2.0,2.0,0.0,2,2,2,0,1,1,0,0
4,22.0,1.78,89.8,2.0,1.0,1,2.0,0.0,0.0,1,1,3,0,1,1,0,0


In [15]:
#I want the Obesity (the predictor) column at the end
column_to_move = ob.pop('Obesity')
ob.insert(16,'Obesity', column_to_move)

In [16]:
ob.head()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CAEC,CH2O,FAF,TUE,CALC,MTRANS,Gender_Female,family_history_no,SMOKE_no,SCC_yes,FAVC_yes,Obesity
0,21.0,1.62,64.0,2.0,3.0,1,2.0,0.0,1.0,4,1,1,0,1,0,0,1
1,21.0,1.52,56.0,3.0,3.0,1,3.0,3.0,0.0,1,1,1,0,0,1,0,1
2,23.0,1.8,77.0,2.0,3.0,1,2.0,2.0,1.0,2,1,0,0,1,0,0,1
3,27.0,1.8,87.0,3.0,3.0,1,2.0,2.0,0.0,2,2,0,1,1,0,0,2
4,22.0,1.78,89.8,2.0,1.0,1,2.0,0.0,0.0,1,1,0,1,1,0,0,3


In [17]:
#Previewing the columns in my dataset in preparation to assign the features and the predictor variables
ob.columns

Index(['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CAEC', 'CH2O', 'FAF', 'TUE',
       'CALC', 'MTRANS', 'Gender_Female', 'family_history_no', 'SMOKE_no',
       'SCC_yes', 'FAVC_yes', 'Obesity'],
      dtype='object')

Next I assign the features and the predictor variables

In [18]:
features = ['Age', 'Height', 'FCVC', 'NCP', 'CAEC', 'CH2O', 'FAF', 'TUE',
       'CALC', 'MTRANS', 'Gender_Female', 'family_history_no', 'SMOKE_no',
       'SCC_yes', 'FAVC_yes']

In [19]:
predictor = ['Obesity']

In [20]:
X = ob.loc[:, features].values

In [21]:
y = ob.loc[:, predictor].values

Next I import the standardscaler and principal component analysis packages to standardize my features as well as use PCA to minimize the number of features.

In [22]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [23]:
X = StandardScaler().fit_transform(X)

I elect to use 9 principal components

In [24]:
pca = PCA(n_components=9)
principalComponents = pca.fit_transform(X)
principalOb = pd.DataFrame(data=principalComponents, columns = ['pc1', 'pc2', 'pc3', 'pc4', 'pc5', 'pc6', 'pc7', 'pc8', 'pc9'])

In [25]:
principalOb.shape

(2111, 9)

I create a new dataframe called finalOB using the principal components and concatenating the obesity column onto the end

In [40]:
finalOb = pd.concat([principalOb, ob[['Obesity']]], axis=1)

In [27]:
finalOb.head()

Unnamed: 0,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,Obesity
0,1.625618,-0.052671,0.022598,-0.849863,-1.751855,0.379976,-0.192195,-0.216208,-2.233655,1
1,2.275378,-1.093413,3.084155,4.356404,-1.103146,5.234845,3.919024,0.879715,-0.248579,1
2,-0.827578,-1.622884,1.154246,-0.237181,-0.925342,0.356082,-0.399949,-0.738155,-0.911315,1
3,-0.120763,-0.509067,2.6507,1.289046,-0.341985,-0.116386,-1.141974,-1.748218,0.24527,2
4,1.093932,-0.256477,1.391925,-0.777077,1.600446,-0.609013,0.416862,-1.141189,0.182193,3


In [41]:
finalOb['Obesity'].unique()

array([1, 2, 3, 4, 7, 5, 6])

Next I import the models I will be using as well as the train_test_split and accuracy score modules.

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score

Below I create and fit the models

In [31]:
clf = DecisionTreeClassifier(random_state=0)
rfc = RandomForestClassifier(random_state=0)
ada = AdaBoostClassifier(random_state=0)

In [32]:
X = finalOb[['pc1', 'pc2', 'pc3', 'pc4', 'pc5', 'pc6', 'pc7', 'pc8', 'pc9']]
y = finalOb['Obesity']

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.3)

In [34]:
clf.fit(X_train, y_train.values.ravel())

In [35]:
rfc.fit(X_train, y_train.values.ravel())

In [36]:
ada.fit(X_train, y_train.values.ravel())

Below I add the predictive models to a list and then use a for-loop to iteratively run each model and display the accuracy score for each.

In [38]:
models = [clf, rfc, ada]

In [39]:
for model in models:
    model.fit(X_train, y_train.values.ravel())
    predictions = model.predict(X_test)
    accuracy_scores = accuracy_score(y_test, predictions)
    print(f"{model} score: {accuracy_scores}")

DecisionTreeClassifier(random_state=0) score: 0.6656151419558359
RandomForestClassifier(random_state=0) score: 0.7665615141955836
AdaBoostClassifier(random_state=0) score: 0.38801261829652994
