# Obesity risk factors classification

In [None]:
# import the necessary libraries
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import naive_bayes
from sklearn import svm
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

In [None]:
# load the raw survey data from the CSV file into a DataFrame and preview it
file_path = 'ObesityDataSet_raw_and_data_sinthetic.csv'
data = pd.read_csv(filepath_or_buffer=file_path)
data.head()

In [None]:
# print the per-column dtypes first, then the (rows, columns) shape
for summary in (data.dtypes, data.shape):
    print(summary)

### Data cleaning and processing

In [None]:
# count the missing values in every column of the dataset
# (isna is the pandas alias of isnull)
data.isna().sum()

Since there are no null values, we can continue to data processing.

In [None]:
# randomizing the row order of the dataset (fixed seed for reproducibility)
# and rebuilding a clean 0..n-1 index afterwards
data = (
    data
    .sample(frac=1, random_state=45)
    .reset_index(drop=True)
)

In [None]:
# separating the data: every column except the last one is a feature,
# the last column holds the class label
variables, classes = data.iloc[:, :-1], data.iloc[:, -1]
variables.head()

In [None]:
# height and weight are dropped together so that the models have to rely on
# the remaining risk factors
variables = variables.drop(columns=['Height', 'Weight'])
variables.head()

In [None]:
# encoding the categorical variables as binary or ordinal integers
# yes/no answers become 1/0
yes_no = {'yes': 1, 'no': 0}
# frequency answers become an increasing 0..3 scale
frequency = {'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3}

var_encoding = {
    'Gender': {'Male': 0, 'Female': 1},
    'family_history_with_overweight': dict(yes_no),
    'FAVC': dict(yes_no),
    'CAEC': dict(frequency),
    'SMOKE': dict(yes_no),
    'SCC': dict(yes_no),
    'CALC': dict(frequency),
}
# nested mapping: outer key selects the column, inner dict maps old -> new value
variables = variables.replace(to_replace=var_encoding)
variables.head()

In [None]:
# MTRANS is one-hot encoded because its categories have no natural order
# that a single integer scale could represent
mtrans = pd.get_dummies(variables.loc[:, 'MTRANS'])
mtrans.head()

In [None]:
# swap the original MTRANS column for its one-hot encoded columns
variables = pd.concat(
    [variables.drop(columns='MTRANS'), mtrans],
    axis='columns',
)
variables.head()

In [None]:
# due to the limited data size, the overweight levels and obesity types are
# merged into one class each to make the task more manageable for the algorithms
overweight_levels = ['Overweight_Level_I', 'Overweight_Level_II']
obesity_types = ['Obesity_Type_I', 'Obesity_Type_II', 'Obesity_Type_III']
cla_encoding = {
    **dict.fromkeys(overweight_levels, 'Overweight'),
    **dict.fromkeys(obesity_types, 'Obesity'),
}
# labels that are not keys of the mapping are left untouched by replace
classes = classes.replace(to_replace=cla_encoding)
classes.head()

## Classification

In [None]:
# hold out 20% of the samples for testing; stratifying on the labels keeps
# the class proportions the same in both parts
X_train, X_test, y_train, y_test = train_test_split(
    variables,
    classes,
    test_size=0.2,
    random_state=0,
    stratify=classes,
)

### Naive Bayes classifier

In [None]:
# sweep the smoothing parameter alpha and record the train/test accuracy
alphas = range(0, 51)
training_accuracy, testing_accuracy = [], []
for alpha in alphas:
    classifier = naive_bayes.MultinomialNB(alpha=alpha, fit_prior=False)
    classifier.fit(X_train, y_train)
    train_pred = classifier.predict(X_train)
    test_pred = classifier.predict(X_test)
    training_accuracy.append(accuracy_score(y_train, train_pred))
    testing_accuracy.append(accuracy_score(y_test, test_pred))

# plot both accuracy curves against alpha on one figure
plt.figure()
for curve_label, curve in (('training accuracy', training_accuracy),
                           ('testing accuracy', testing_accuracy)):
    plt.plot(alphas, curve, label=curve_label)
plt.title('Testing Accuracy by Parameter')
plt.xlabel('alpha')
plt.ylabel('accuracy')
plt.legend()
plt.grid()

In [None]:
# creating the Naive Bayes classifier used in the following cells:
# multinomial model with alpha=1 smoothing and uniform class priors
# (fit_prior=False)
nb_classifier = naive_bayes.MultinomialNB(alpha=1, fit_prior=False)

In [None]:
# training the Naive Bayes model on the training split only
nb_classifier.fit(X_train, y_train)

In [None]:
# timing the training step; %timeit is an IPython magic that runs the
# statement repeatedly, so the model is re-fitted on the same data each run
%timeit nb_classifier.fit(X_train, y_train)

In [None]:
# in-sample (training split) and out-of-sample (test split) accuracy
# predictions for both splits first ...
nb_y_train, nb_y_test = (nb_classifier.predict(split)
                         for split in (X_train, X_test))
# ... then the share of correct predictions in each of them
nb_train_acc = accuracy_score(y_true=y_train, y_pred=nb_y_train)
nb_test_acc = accuracy_score(y_true=y_test, y_pred=nb_y_test)
print('The training accuracy is:', nb_train_acc)
print('The testing accuracy is:', nb_test_acc)

In [None]:
# 5-fold cross-validation of the Naive Bayes model on the complete dataset
nb_5_folds = cross_val_score(estimator=nb_classifier, X=variables, y=classes, cv=5)
# summary statistics over the five fold scores
nb_5_folds_mean = np.mean(nb_5_folds)
nb_5_folds_var = np.var(nb_5_folds)
print('the 5-folds crossvalidation accuracy scores are:', nb_5_folds)
print('the 5-folds crossvalidation mean score is:', nb_5_folds_mean)
print('the 5-folds crossvalidation varience is:', nb_5_folds_var)

In [None]:
# creating a confusion matrix for the Naive Bayes test predictions
# confusion_matrix orders its rows/columns by sorted class name, so the tick
# labels are derived from that same order instead of a hand-written list
# (a fixed list can silently attach the wrong names to the axes).
class_order = np.union1d(y_test, nb_y_test)
# shorter display names; assumes the raw labels are 'Insufficient_Weight' and
# 'Normal_Weight' - any label not listed here is shown unchanged
display_names = {'Insufficient_Weight': 'Underweight',
                 'Normal_Weight': 'Normalweight'}
labels = [display_names.get(name, name) for name in class_order]
nb_matrix = confusion_matrix(y_test, nb_y_test, labels=class_order)
# transposed so that columns (x axis) are true labels and rows (y axis)
# are predicted labels
sns.heatmap(nb_matrix.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=labels, yticklabels=labels)
plt.title('Confusion Matrix')
plt.xlabel('true label')
plt.ylabel('predicted label')

### Support Vector Machine Classifier

In [None]:
# mapping out the effect of the kernel on the cross-validation accuracy
kernels = ['linear', 'rbf', 'poly']
scores = [1, 2, 3, 4, 5]  # fold numbers, used as the row index
# one column of 5 fold scores per kernel; cross_val_score fits its own clones
# of the estimator, so no separate fit on the training split is needed
kernel_cv = pd.DataFrame(
    {
        kernel: cross_val_score(
            svm.SVC(kernel=kernel, random_state=45, gamma=0.1),
            variables, classes, cv=5)
        for kernel in kernels
    },
    index=scores,
    columns=kernels,
)

In [None]:
# heatmap of the fold scores: one column per kernel, one row per fold
kernel_ax = sns.heatmap(kernel_cv)
kernel_ax.set_title('CV score by kernal')
kernel_ax.set_xlabel('kernels')

In [None]:
# creating the SVM classifier: an RBF-kernel SVC whose C and gamma are
# chosen by an exhaustive grid search scored on accuracy
model = svm.SVC(kernel='rbf')
# NOTE(review): the original gamma list contained 0.1 twice; the duplicate is
# removed here - confirm whether a third value (e.g. 0.01) was intended
parameters = [{'C': [0.1, 1, 10, 100],
               'gamma': [0.05, 0.1]}]
svm_classifier = GridSearchCV(model, parameters, scoring='accuracy')

In [None]:
# training the model: fitting svm_classifier on the training split only
svm_classifier.fit(X_train, y_train)

In [None]:
# timing the training step; %timeit is an IPython magic that runs the
# statement repeatedly, so svm_classifier is re-fitted on each run
%timeit svm_classifier.fit(X_train, y_train)

In [None]:
# in-sample (training split) and out-of-sample (test split) accuracy
# predictions for both splits first ...
svm_y_train, svm_y_test = (svm_classifier.predict(split)
                           for split in (X_train, X_test))
# ... then the share of correct predictions in each of them
svm_train_acc = accuracy_score(y_true=y_train, y_pred=svm_y_train)
svm_test_acc = accuracy_score(y_true=y_test, y_pred=svm_y_test)
print('The training accuracy is:', svm_train_acc)
print('The testing sample accuracy is:', svm_test_acc)

In [None]:
# 5-fold cross-validation of svm_classifier on the complete dataset
svm_5_folds = cross_val_score(estimator=svm_classifier, X=variables, y=classes, cv=5)
# summary statistics over the five fold scores
svm_5_folds_mean = np.mean(svm_5_folds)
svm_5_folds_var = np.var(svm_5_folds)
print('the 5-folds crossvalidation accuracy scores are:', svm_5_folds)
print('the 5-folds crossvalidation mean score is:', svm_5_folds_mean)
print('the 5-folds crossvalidation varience is:', svm_5_folds_var)

In [None]:
# creating a confusion matrix for the SVM test predictions
# confusion_matrix orders its rows/columns by sorted class name, so the tick
# labels are derived from that same order instead of a hand-written list
# (a fixed list can silently attach the wrong names to the axes).
class_order = np.union1d(y_test, svm_y_test)
# shorter display names; assumes the raw labels are 'Insufficient_Weight' and
# 'Normal_Weight' - any label not listed here is shown unchanged
display_names = {'Insufficient_Weight': 'Underweight',
                 'Normal_Weight': 'Normalweight'}
labels = [display_names.get(name, name) for name in class_order]
svm_matrix = confusion_matrix(y_test, svm_y_test, labels=class_order)
# transposed so that columns (x axis) are true labels and rows (y axis)
# are predicted labels
sns.heatmap(svm_matrix.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=labels, yticklabels=labels)
plt.title('Confusion Matrix')
plt.xlabel('true label')
plt.ylabel('predicted label')

### Random Forest Classifier

In [None]:
# mapping out the effect of the number of trees and the tree depth on the
# mean 5-fold cross-validation accuracy
tree_counts = range(10, 201, 10)
tree_depths = range(10, 101, 10)
# rows: number of trees, columns: maximum depth; created as float so that no
# dtype conversion is needed after filling it
acc_df = pd.DataFrame(index=tree_counts, columns=tree_depths, dtype='float')

for i in tree_counts:
    for j in tree_depths:
        clf = ensemble.RandomForestClassifier(n_estimators=i, max_depth=j, random_state=30)
        # cross_val_score fits its own clones of clf, so fitting clf on the
        # training split beforehand would only add run time
        cv = cross_val_score(clf, variables, classes, cv=5)
        acc_df.loc[i, j] = cv.mean()

In [None]:
# heatmap of the sweep, transposed so that the number of trees runs along x
# and the tree depth along y; the colormap is centered at 0.845
accuracy_ax = sns.heatmap(acc_df.T, center=0.845)
accuracy_ax.set_title('Cross Validation Accuracy by Parameters')
accuracy_ax.set_xlabel('number of trees')
accuracy_ax.set_ylabel('tree depth')

In [None]:
# creating the random forest classifier
# candidate values for an optional exhaustive parameter search
parameters = [{'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
               'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}]
# The search has to wrap a random forest: svm.SVC does not accept
# n_estimators/max_depth. It is kept under its own name and is not fitted
# here, so it no longer gets overwritten by the classifier defined below.
rf_grid_search = GridSearchCV(ensemble.RandomForestClassifier(random_state=30),
                              parameters, scoring='accuracy')
# classifier used by the following cells, with fixed parameters
# (presumably picked from the heatmap of the parameter sweep - confirm)
rf_classifier = ensemble.RandomForestClassifier(n_estimators=140, max_depth=30, random_state=30)

In [None]:
# training the model: fitting rf_classifier on the training split only
rf_classifier.fit(X_train, y_train)

In [None]:
# timing the training step; %timeit is an IPython magic that runs the
# statement repeatedly, so rf_classifier is re-fitted on each run
%timeit rf_classifier.fit(X_train, y_train)

In [None]:
# in-sample (training split) and out-of-sample (test split) accuracy
rf_y_train = rf_classifier.predict(X_train)
rf_y_test = rf_classifier.predict(X_test)
rf_train_acc = accuracy_score(y_train, rf_y_train)
rf_test_acc = accuracy_score(y_test, rf_y_test)
# message now names the metric, matching the Naive Bayes and SVM cells
print('The training accuracy is:', rf_train_acc)
print('The testing accuracy is:', rf_test_acc)

In [None]:
# 5-fold cross-validation of rf_classifier on the complete dataset
rf_5_folds = cross_val_score(estimator=rf_classifier, X=variables, y=classes, cv=5)
# summary statistics over the five fold scores
rf_5_folds_mean = np.mean(rf_5_folds)
rf_5_folds_var = np.var(rf_5_folds)
print('the 5-folds crossvalidation accuracy scores are:', rf_5_folds)
print('the 5-folds crossvalidation mean score is:', rf_5_folds_mean)
print('the 5-folds crossvalidation varience is:', rf_5_folds_var)

In [None]:
# creating a confusion matrix for the random forest test predictions
# confusion_matrix orders its rows/columns by sorted class name, so the tick
# labels are derived from that same order instead of a hand-written list
# (a fixed list can silently attach the wrong names to the axes).
class_order = np.union1d(y_test, rf_y_test)
# shorter display names; assumes the raw labels are 'Insufficient_Weight' and
# 'Normal_Weight' - any label not listed here is shown unchanged
display_names = {'Insufficient_Weight': 'Underweight',
                 'Normal_Weight': 'Normalweight'}
labels = [display_names.get(name, name) for name in class_order]
rf_matrix = confusion_matrix(y_test, rf_y_test, labels=class_order)
# transposed so that columns (x axis) are true labels and rows (y axis)
# are predicted labels
sns.heatmap(rf_matrix.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=labels, yticklabels=labels)
plt.title('Confusion Matrix')
plt.xlabel('true label')
plt.ylabel('predicted label')