In [None]:
import numpy as np
import pandas as pd
import sklearn.preprocessing as pp
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# 1: Download and load the data
df = pd.read_csv('ObesityDataSet.csv')
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [None]:
# 2: Add intercept column with all values=1
df.insert(0, 'intercept', 1)
df.head()

Unnamed: 0,intercept,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,1,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,1,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,1,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,1,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [None]:
# 3: Feature transformation:
#  - Convert 'Gender' column to numbers where 'Female' is 1 and 'Male' is 0
#  - Convert yes/no columns ['family_history_with_overweight','FAVC','SMOKE','SCC'] to 1/0
#  - One-Hot encode 'MTRANS', and 'NObeyesdad' columns.
#  - Label encode 'CAEC', and 'CALC' columns

lb = pp.LabelEncoder() # making these models for subtasks c and d
oh = pp.OneHotEncoder()

# subtask a: manually labelencoding gender
gender = {'Female' : 1, 'Male' : 0}
df['Gender'] = [gender[i] for i in df['Gender']]

# subtask b: manually labelencoding yes/no
yes_no = {'yes' : 1, 'no' : 0}
for col in ['family_history_with_overweight','FAVC','SMOKE','SCC']:
    df[col] = [yes_no[i] for i in df[col]]

df.head()

Unnamed: 0,intercept,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,1,1,21.0,1.62,64.0,1,0,2.0,3.0,Sometimes,0,2.0,0,0.0,1.0,no,Public_Transportation,Normal_Weight
1,1,1,21.0,1.52,56.0,1,0,3.0,3.0,Sometimes,1,3.0,1,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,1,0,23.0,1.8,77.0,1,0,2.0,3.0,Sometimes,0,2.0,0,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,1,0,27.0,1.8,87.0,0,0,3.0,3.0,Sometimes,0,2.0,0,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,1,0,22.0,1.78,89.8,0,0,2.0,1.0,Sometimes,0,2.0,0,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [None]:
# subtask c: onehotencode MTRANS and NObeyesdad
def OneHotEncode(column: pd.Series):
    unilist = column.unique()
    encoded = pd.DataFrame(columns=unilist)
    for type, index in zip(column, range(len(column))):
        encoded.loc[index, type] = 1
    encoded.fillna(0, inplace=True)
    return encoded

# making a manual onehotencoder because sklearn is weird to figure out
y_df = OneHotEncode(df['NObeyesdad'])
y_lb = lb.fit_transform(df['NObeyesdad'])
m_trans_df = OneHotEncode(df['MTRANS'])

In [None]:
# subtask d: labelencoding ['CAEC', 'CALC']
for col in ['CAEC', 'CALC']:
    df[col] = lb.fit_transform(df[col])

In [None]:
# making a new data with onehotended m_trans and nobeyedad
X_df = pd.concat([df.iloc[:, :-2], m_trans_df], axis=1)
X_df.head()

Unnamed: 0,intercept,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,...,CH2O,SCC,FAF,TUE,CALC,Public_Transportation,Walking,Automobile,Motorbike,Bike
0,1,1,21.0,1.62,64.0,1,0,2.0,3.0,2,...,2.0,0,0.0,1.0,3,1,0,0,0,0
1,1,1,21.0,1.52,56.0,1,0,3.0,3.0,2,...,3.0,1,3.0,0.0,2,1,0,0,0,0
2,1,0,23.0,1.8,77.0,1,0,2.0,3.0,2,...,2.0,0,2.0,1.0,1,1,0,0,0,0
3,1,0,27.0,1.8,87.0,0,0,3.0,3.0,2,...,2.0,0,2.0,0.0,1,0,1,0,0,0
4,1,0,22.0,1.78,89.8,0,0,2.0,1.0,2,...,2.0,0,0.0,0.0,2,1,0,0,0,0


In [None]:
# 4: Since the features have relatively different ranges, normalize the dataset
ss = pp.StandardScaler
for column in X_df.columns:
    X_df[column] = pp.minmax_scale(X_df[column])
X_df.head()

Unnamed: 0,intercept,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,...,CH2O,SCC,FAF,TUE,CALC,Public_Transportation,Walking,Automobile,Motorbike,Bike
0,0.0,1.0,0.148936,0.320755,0.186567,1.0,0.0,0.5,0.666667,0.666667,...,0.5,0.0,0.0,0.5,1.0,1.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.148936,0.132075,0.126866,1.0,0.0,1.0,0.666667,0.666667,...,1.0,1.0,1.0,0.0,0.666667,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.191489,0.660377,0.283582,1.0,0.0,0.5,0.666667,0.666667,...,0.5,0.0,0.666667,0.5,0.333333,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.276596,0.660377,0.358209,0.0,0.0,1.0,0.666667,0.666667,...,0.5,0.0,0.666667,0.0,0.333333,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.170213,0.622642,0.379104,0.0,0.0,0.5,0.0,0.666667,...,0.5,0.0,0.0,0.0,0.666667,1.0,0.0,0.0,0.0,0.0


In [None]:
# 5: Define X matrix (independent features) and y matrix (target features) as numpy arrays
X_arr = np.array(X_df)
y_arr = np.array(y_df)
# 6: Print the shape and datatype of both X and y
print(f"Shape of X matrix: {X_arr.shape}, dtype: {X_arr.dtype}")
print(f"Shape of y matrix: {y_arr.shape}, dtype: {y_arr.dtype}")

Shape of X matrix: (2111, 21), dtype: float64
Shape of y matrix: (2111, 7), dtype: int64


In [None]:
# 7: Split the dataset into 80% for training and rest 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X_arr, y_lb, test_size=0.2,
                                                    random_state = 42)

In [None]:
model = LogisticRegression(multi_class='multinomial', verbose=1, max_iter=500)
model.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s finished


LogisticRegression(max_iter=500, multi_class='multinomial', verbose=1)

In [None]:
y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)

In [None]:
print(f"Accuracy on test data: {accuracy_score(y_test, y_pred_test)*100:.3f}% (up to 3 decimals)")
print(f"Accuracy on train data: {accuracy_score(y_train, y_pred_train)*100:.3f}% (up to 3 decimals)")

Accuracy on test data: 72.813% (up to 3 decimals)
Accuracy on train data: 73.815% (up to 3 decimals)


In [None]:
print(f"Confusion matrix for test: \n{confusion_matrix(y_test, y_pred_test)}\n\n") 
print(f"Confusion matrix for train: \n{confusion_matrix(y_train, y_pred_train)}")

Confusion matrix for test: 
[[52  2  0  0  0  2  0]
 [17 23  2  0  0 10 10]
 [ 0  0 54 13  3  4  4]
 [ 0  0  1 57  0  0  0]
 [ 0  0  0  0 63  0  0]
 [ 2  6  3  0  0 35 10]
 [ 0  1 13  6  1  5 24]]


Confusion matrix for train: 
[[195  19   0   0   0   1   1]
 [ 60  98   3   0   0  33  31]
 [  0   3 193  44  13   4  16]
 [  0   1   4 233   0   0   1]
 [  0   0   1   1 259   0   0]
 [  2  25  21   3   4 141  38]
 [  0  10  53  13   2  35 127]]


In [4]:
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.

