# Deep Learning on Obesity Classification

In [None]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np
import collections
from collections import Counter

import sklearn
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from keras import Model
from keras.layers import Input,LSTM, Dense

from matplotlib import pyplot

In [None]:
# Load the obesity dataset from a CSV file located in the working directory.
# NOTE(review): the recorded run of this cell ended with FileNotFoundError, so the
# CSV has to be placed next to the notebook (or the path adjusted) before running.
data = pd.read_csv("ObesityDataSet_raw_and_data_sinthetic.csv")
data

FileNotFoundError: ignored

In [None]:
# Give every column a descriptive, human-readable name (assigned by position)
data.columns = [
    'Gender',
    'Age',
    'Height',
    'Weight',
    'Family History with Overweight',
    'Frequent consumption of high caloric food',
    'Frequency of consumption of vegetables',
    'Number of main meals',
    'Consumption of food between meals',
    'Smoke',
    'Consumption of water daily',
    'Calories consumption monitoring',
    'Physical activity frequency',
    'Time using technology devices',
    'Consumption of alcohol',
    'Transportation used',
    'Obesity',
]

data


# data exploration

In [None]:
# Summary statistics (count, mean, std, quartiles) of the dataset
data.describe()

In [None]:
# Count how many rows fall into each value of the 'Obesity' column
obesity_levels= Counter(data['Obesity'])
print(obesity_levels)

In [None]:
# Pie chart of the overall class distribution
fig = plt.figure(figsize=(5, 5))
plt.pie(
    [float(count) for count in obesity_levels.values()],
    labels=[str(level) for level in obesity_levels.keys()],
    autopct=None,
)
plt.title('Weight Category')
plt.tight_layout()

In [None]:
# Boolean row masks for each gender
male = data['Gender'] == 'Male'
female = data['Gender'] == 'Female'

# Class counts restricted to each gender
male_obesity = Counter(data.loc[male, 'Obesity'])
female_obesity = Counter(data.loc[female, 'Obesity'])

In [None]:
# Side-by-side pie charts of the class distribution for males and females
fig = plt.figure(figsize=(20, 8))
for slot, (level_counts, caption) in enumerate(
    [
        (male_obesity, 'Weight Category of Male'),
        (female_obesity, 'Weight Category of Female'),
    ],
    start=1,
):
    plt.subplot(1, 2, slot)
    plt.pie(
        [float(count) for count in level_counts.values()],
        labels=[str(level) for level in level_counts.keys()],
        autopct=None,
    )
    plt.title(caption)
    plt.tight_layout()

### In the pie charts above, Obesity Type III makes up a large slice among females, while Obesity Type II is the most prevalent type of obesity among males. Interestingly, the proportion of Insufficient Weight is also higher among females than among males; this could be explained by heavier societal pressure on women to go on diets.

In [None]:
# Visualize the distribution of the numerical (float) features.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay and density
# normalisation draws the equivalent histogram + density curve.
for col in data.select_dtypes('float'):
    plt.figure()
    sns.histplot(data[col], kde=True, stat='density')

In [None]:
# Correlation matrix of the numeric features (this is not a confusion matrix).
# Only numeric columns are used: DataFrame.corr() raises on object columns
# in pandas >= 2.0 instead of silently dropping them.
sns.heatmap(data.select_dtypes('number').corr())

In [None]:
# Correlation of every numeric feature with 'Weight', in ascending order.
# Non-numeric columns are excluded explicitly because DataFrame.corr()
# raises on object columns in pandas >= 2.0.
data.select_dtypes('number').corr()['Weight'].sort_values()

# Encoding our data 

In [None]:
# Identify categorical variables (their dtype is 'object')
category = data.dtypes == object

print(category)

# Columns for which dtype == object is True
print(category[category])
category_labels = category[category].index
print('Categorical variables:', category_labels)

# Columns for which dtype == object is False
false = category[~category]
non_category = false.index
print('Non Categorical variables:', non_category)

In [None]:
# Find the categorical variables that take more than 2 distinct values
# (candidates for an integer encoding 0, 1, 2, ...)
col = list(category_labels)
multiple = [data[name].unique() for name in col]

multi_col = {
    name: values
    for name, values in zip(col, multiple)
    if len(values) > 2
}
print(multi_col)
print('\n')
print('Categorical variables with more than 2 values/answers:', multi_col.keys())

In [None]:
# Map each column name to its positional index
data.columns


def col_no(x):
    """Return a one-entry dict mapping the name of column ``x`` of ``data`` to ``x``."""
    return {data.columns[x]: x}


print([col_no(idx) for idx in range(len(data.columns))])

In [None]:
# Separate the target column from the predictors (every column except the last)
y = data['Obesity']
x = data.iloc[:, :-1]
       

In [None]:
# Encode the output (y label) as integers; there are 7 categories in total.
# The fitted encoder is kept so the integer codes can be mapped back to labels.
output_encoder = LabelEncoder()
y_encoded = output_encoder.fit_transform(y)
# Show each integer code together with the number of rows carrying it
print(np.unique(y_encoded, return_counts=True))

In [None]:
# Check for missing values: number of nulls per column
data.isnull().sum()

In [None]:
# Build the preprocessing pipeline: each group of columns gets its own
# imputation step followed by scaling or encoding.

# Continuous columns: median imputation, then standardisation.
Scale_features = ['Age', 'Height', 'Weight']
Scale_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('Scaling', StandardScaler())
])

# Ordered categorical columns: constant imputation, then integer codes.
# NOTE(review): OrdinalEncoder() with its default categories assigns codes by
# sorted value, which may not match the intended frequency order of the
# answers — consider passing an explicit `categories=` list. TODO confirm.
Ordi_features = ['Consumption of food between meals', 'Consumption of alcohol']
Ordi_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('Ordi', OrdinalEncoder())
])

# Unordered categorical columns: constant imputation, then one-hot encoding.
Noseq_features = ['Gender', 'Family History with Overweight', 'Frequent consumption of high caloric food', 'Smoke', 'Calories consumption monitoring', 'Transportation used']
Noseq__transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('Non-sequenced', OneHotEncoder())
])

# Columns not listed above are appended unchanged (remainder='passthrough').
Preprocessor = ColumnTransformer(transformers=[
    ('Scale', Scale_transformer, Scale_features),
    ('Ordinal', Ordi_transformer, Ordi_features),
    ('Non-sequenced', Noseq__transformer, Noseq_features)
], remainder = 'passthrough')
    
# The pipeline currently contains only the preprocessing step (no estimator).
clf = Pipeline(steps=[('preprocessor', Preprocessor)])

In [None]:
# Fit the preprocessing pipeline on the full feature table
clf.fit(x, y_encoded)

In [None]:
# Shapes of the raw feature table and of the encoded target
print(x.shape)
print(y_encoded.shape)

In [None]:
# After transformation, there are 25 features in total
# (fit_transform refits the pipeline on x before transforming it)
trans_x = clf.fit_transform(x)
print(trans_x.shape)

In [None]:
# Names of the columns handled by the first two transformers (scaled, then ordinal)

cols = [*Scale_features, *Ordi_features]
cols

In [None]:
# CATEGORICAL (NON-NUMERICAL) COLUMNS
# Column names produced by the OneHotEncoder step of the pipeline.
# `get_feature_names` was removed in scikit-learn 1.2; `get_feature_names_out`
# (available since 1.0) yields the same "<feature>_<category>" names.

ohe_cols = clf.named_steps['preprocessor'].transformers_[2][1]\
    .named_steps['Non-sequenced'].get_feature_names_out(Noseq_features)
# Convert the returned array into a plain list of Python strings
ohe_cols = ohe_cols.tolist()
ohe_cols

In [None]:
# NUMERICAL COLUMNS (those whose dtype is not object)
non_category

In [None]:
# Wrap the transformed array in a DataFrame with explicit column names.
# The names follow the ColumnTransformer output order: scaled columns, ordinal
# columns, one-hot columns, then the passthrough (remaining) columns.
transformed_x = pd.DataFrame(trans_x, columns= ['Age', 'Height',
 'Weight',
 'Consumption of food between meals',
 'Consumption of alcohol','Gender_Female',
 'Gender_Male',
 'Family History with Overweight_no',
 'Family History with Overweight_yes',
 'Frequent consumption of high caloric food_no',
 'Frequent consumption of high caloric food_yes',
 'Smoke_no',
 'Smoke_yes',
 'Calories consumption monitoring_no',
 'Calories consumption monitoring_yes',
 'Transportation used_Automobile',
 'Transportation used_Bike',
 'Transportation used_Motorbike',
 'Transportation used_Public Transportation',
 'Transportation used_Walking', 'Frequency of consumption of vegetables',
 'Number of main meals',
 'Consumption of water daily',
 'Physical activity frequency',
 'Time using technology devices'])

In [None]:
# Display the transformed feature table
transformed_x

In [None]:
#Train and test data split 80-20
# A fixed seed makes the split (and every downstream result) reproducible, and
# stratifying on the target keeps the 7 class proportions equal in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    transformed_x, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

In [None]:
# Report the shapes of the training and test feature tables
print("x_train size is", x_train.shape)
print("x_test size is", x_test.shape)

In [None]:
# Display the integer-encoded training labels
y_train

# feature selection

According to the feature selection below, Weight, Age, Height, and Frequency of consumption of vegetables are the four most important features.

In [None]:
# Feature selection using a tree-based method
from sklearn.ensemble import ExtraTreesClassifier

# Fixed seed so that the importance ranking is reproducible between runs
model = ExtraTreesClassifier(random_state=42)
model.fit(x_train, y_train)
print(model.feature_importances_)
# Bar chart of the feature importances (largest first) for easier comparison
feat_importances = pd.Series(model.feature_importances_, index=x_train.columns)
feat_importances.nlargest(25).plot(kind='bar')
plt.show()

In [None]:
# Keep only a subset of important features.
# The original cell had its continuation lines left uncommented, which made the
# whole cell a syntax error; the selection is written out as valid code here.
# Skip this cell to train on all transformed features instead.
selected_features = ['Age', 'Height', 'Weight', 'Frequency of consumption of vegetables',
                     'Time using technology devices', 'Number of main meals', 'Consumption of alcohol']
x_train = x_train[selected_features]
x_test = x_test[selected_features]

In [None]:
# Shape of the training feature table that will be fed to the model
print(x_train.shape)

# Model Selection

In [None]:
#importing necessary libraries in python
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix


In [None]:
# Convert the integer class labels into one-hot vectors for the network
y_train_model=to_categorical(y_train)
y_test_model=to_categorical(y_test)

In [None]:
# Display the one-hot encoded training targets
y_train_model

In [None]:
# Define Artificial Neural Networks (ANN)
model = Sequential()
model.add(Dense(64, input_dim=7, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(7, activation='relu'))

In [None]:
opt = keras.optimizers.Adam(learning_rate=0.0001)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

In [None]:
# Fit the model for 500 epochs; the test split is passed as validation data,
# so the 'val_*' entries in `history` are measured on the test set.
history=model.fit(x_train,y_train_model, validation_data=(x_test, y_test_model), epochs = 500)

In [None]:
# Evaluate the model on both splits.
# Despite their names, train_acc / test_acc hold the evaluation results as a
# sequence: index 0 is the loss and index 1 is the accuracy.
train_acc = model.evaluate(x_train, y_train_model, verbose=0)
test_acc = model.evaluate(x_test, y_test_model, verbose=0)
print('Train loss:',train_acc[0],'Train accuracy:',train_acc[1])
print('Test loss:',test_acc[0],'Test accuracy:',test_acc[1])

In [None]:
# Training vs. validation loss for every epoch
plt.figure(figsize=(10, 10))
plt.subplot(211)
plt.xlabel('Epochs')
plt.ylabel('Loss')
for history_key, legend_label in (('loss', 'train'), ('val_loss', 'test')):
    plt.plot(history.history[history_key], label=legend_label)
plt.legend()

In [None]:
# Training vs. validation accuracy for every epoch
plt.figure(figsize=(10, 10))
plt.subplot(212)
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
for history_key, legend_label in (('accuracy', 'train'), ('val_accuracy', 'test')):
    plt.plot(history.history[history_key], label=legend_label)
plt.legend()
plt.show()

In [None]:
# Prediction: `Sequential.predict_classes` was removed in TensorFlow 2.6, so the
# predicted class is taken as the index of the highest-scoring output unit.
yhat = np.argmax(model.predict(x_test, verbose=0), axis=-1)

In [None]:
# Display the predictions for the test set
yhat

In [None]:
# Display the true integer-encoded test labels for comparison
y_test

In [None]:
# Per-class precision, recall and F1 of the predictions against the true test labels
from sklearn.metrics import classification_report
print(classification_report(y_test, yhat))