In [47]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer as knn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
data = pd.read_csv(r'train.csv')
data_pred=pd.read_csv(r'test.csv')

In [None]:
!pip install -U pandas-profiling

In [None]:
from ydata_profiling import ProfileReport
from ydata_profiling.utils.cache import cache_file

In [None]:
profile = ProfileReport(
    data, title="Obesity Dataset", html={"style": {"full_width": True}}, sort=None
)

In [None]:
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [None]:
data.drop('CAEC',axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop('CAEC',axis=1,inplace=True)


In [None]:
data

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,CALC,NObeyesdad
0,0.105699,-0.002828,-0.235713,-0.836279,0.314684,1.206594,-1.171141,0.597438,-0.471288,6
1,-1.027052,-1.606291,-1.170931,-0.836279,0.338364,-0.048349,0.021775,0.636513,1.639846,1
2,-1.027052,0.128451,-1.430012,-1.060332,-1.913423,-0.195644,-0.138022,1.755239,1.639846,0
3,-0.507929,0.120090,1.644770,1.039171,0.338364,-0.584035,0.579896,0.271455,-0.471288,4
4,1.371197,2.450367,0.224054,0.438397,-1.119801,-0.081469,1.176486,0.523111,-0.471288,6
...,...,...,...,...,...,...,...,...,...,...
20753,0.227725,0.760293,0.996987,0.888355,0.338364,0.201151,0.416056,-0.697686,-0.471288,3
20754,-1.027052,0.111729,-1.436296,1.039171,1.756085,-1.691863,1.214691,0.636513,-0.471288,0
20755,-0.657669,1.366537,0.670717,-0.071439,0.338364,-0.048349,0.210303,0.966092,1.639846,3
20756,1.760067,-0.002805,-0.165574,0.422594,-1.119801,0.189694,-1.171141,0.593055,1.639846,6


In [None]:
data.dtypes

id                                  int64
Gender                             object
Age                               float64
Height                            float64
Weight                            float64
family_history_with_overweight     object
FAVC                               object
FCVC                              float64
NCP                               float64
CAEC                               object
SMOKE                              object
CH2O                              float64
SCC                                object
FAF                               float64
TUE                               float64
CALC                               object
MTRANS                             object
NObeyesdad                         object
dtype: object

In [None]:
data.drop('family_history_with_overweight',axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop('family_history_with_overweight',axis=1,inplace=True)


In [None]:
data=data.drop_duplicates()

In [9]:
class Utils_Suite():
    def __init__(self,data):
        self.data=data
    def compute_correlation(self,threshold=0.3):
        matrix=self.data.corr(numeric_only=True)
        x=matrix[(matrix["NObeyesdad"]<threshold)&(matrix["NObeyesdad"]>-threshold)]["NObeyesdad"]
        return x
    def compute_mutual_information(self,thresh=0.1):
        enc = OrdinalEncoder()
        df_encoded = enc.fit_transform(self.data)
        mi_scores = mutual_info_regression(df_encoded, self.data['NObeyesdad'])
        mi_scores_df = pd.DataFrame(mi_scores, index=self.data.columns, columns=['Score'])
        return mi_scores_df[mi_scores_df['Score']<thresh]
    def compute_vif(self):
        x=self.data.iloc[:,:-1]
        y=self.data.iloc[:,-1]
        x=pd.DataFrame(x)

        x['intercept']=1
        vif=pd.DataFrame()
        vif['variable']=x.columns
        vif['vif']=[variance_inflation_factor(x.values,i)for i in range(x.shape[1])]
        return vif

In [48]:
data=data.drop('NCP',axis=1)

In [None]:
def one_hot_encoding(data):
        z=(data.dtypes=='object')
        k=pd.DataFrame(z)
        obj_list=list(k[k[0]==True].index)
        print(obj_list)
        for i in obj_list:
            dummy=pd.get_dummies(data[i],prefix=i,drop_first=True)
            #print(dummy)
            data=data.drop(i,axis=1)
            data=data.join(dummy)
        return data

In [None]:
data=one_hot_encoding(data)

['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS', 'NObeyesdad']


In [None]:
data.columns

Index(['id', 'Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE',
       'Gender_Male', 'family_history_with_overweight_yes', 'FAVC_yes',
       'CAEC_Frequently', 'CAEC_Sometimes', 'CAEC_no', 'SMOKE_yes', 'SCC_yes',
       'CALC_Sometimes', 'CALC_no', 'MTRANS_Bike', 'MTRANS_Motorbike',
       'MTRANS_Public_Transportation', 'MTRANS_Walking',
       'NObeyesdad_Normal_Weight', 'NObeyesdad_Obesity_Type_I',
       'NObeyesdad_Obesity_Type_II', 'NObeyesdad_Obesity_Type_III',
       'NObeyesdad_Overweight_Level_I', 'NObeyesdad_Overweight_Level_II'],
      dtype='object')

In [27]:
data

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,-1.731967,1.004152,0.105699,-0.002828,-0.235713,0.469099,0.30588,-0.836279,0.314684,0.337845,-0.109287,1.206594,-0.185009,-1.171141,0.597438,-0.471288,0.429319,6
1,-1.731800,-0.995866,-1.027052,-1.606291,-1.170931,0.469099,0.30588,-0.836279,0.338364,-1.889204,-0.109287,-0.048349,-0.185009,0.021775,0.636513,1.639846,-2.182324,1
2,-1.731634,-0.995866,-1.027052,0.128451,-1.430012,0.469099,0.30588,-1.060332,-1.913423,0.337845,-0.109287,-0.195644,-0.185009,-0.138022,1.755239,1.639846,0.429319,0
3,-1.731467,-0.995866,-0.507929,0.120090,1.644770,0.469099,0.30588,1.039171,0.338364,0.337845,-0.109287,-0.584035,-0.185009,0.579896,0.271455,-0.471288,0.429319,4
4,-1.731300,1.004152,1.371197,2.450367,0.224054,0.469099,0.30588,0.438397,-1.119801,0.337845,-0.109287,-0.081469,-0.185009,1.176486,0.523111,-0.471288,0.429319,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20753,1.731300,1.004152,0.227725,0.760293,0.996987,0.469099,0.30588,0.888355,0.338364,0.337845,-0.109287,0.201151,-0.185009,0.416056,-0.697686,-0.471288,0.429319,3
20754,1.731467,1.004152,-1.027052,0.111729,-1.436296,-2.131745,0.30588,1.039171,1.756085,-1.889204,-0.109287,-1.691863,-0.185009,1.214691,0.636513,-0.471288,0.429319,0
20755,1.731634,1.004152,-0.657669,1.366537,0.670717,0.469099,0.30588,-0.071439,0.338364,0.337845,-0.109287,-0.048349,-0.185009,0.210303,0.966092,1.639846,0.429319,3
20756,1.731800,1.004152,1.760067,-0.002805,-0.165574,0.469099,0.30588,0.422594,-1.119801,0.337845,-0.109287,0.189694,-0.185009,-1.171141,0.593055,1.639846,-2.182324,6


In [49]:
data.drop('Gender',axis=1,inplace=True)

In [15]:
def get_all_Null(data,dtype=""):
        x=data.isna().sum()>0
        l=[]
        for i in  list(x.index):
            thresh=data[i].isna().sum()/len(data)
            if(x[i]==True and (data[i].dtypes==dtype) ):
                print(i,data[i].isna().sum())
                l+=[i]
        return l

In [None]:
data.dtypes

id                                  int64
Gender                             object
Age                               float64
Height                            float64
Weight                            float64
family_history_with_overweight     object
FAVC                               object
FCVC                              float64
NCP                               float64
CAEC                               object
SMOKE                              object
CH2O                              float64
SCC                                object
FAF                               float64
TUE                               float64
CALC                               object
MTRANS                             object
NObeyesdad                         object
dtype: object

In [18]:
get_all_Null(data,'int64')

[]

In [None]:
data['Height'].var()

0.007623368876473514

In [None]:
data['FCVC'].var()

0.28432160024391373

In [None]:
data

Unnamed: 0,Age,Height,Weight,family_history_with_overweight,FCVC,NCP,CAEC,CH2O,FAF,TUE,CALC,NObeyesdad
0,0.105699,-0.002828,-0.235713,0.469099,-0.836279,0.314684,0.337845,1.206594,-1.171141,0.597438,-0.471288,6
1,-1.027052,-1.606291,-1.170931,0.469099,-0.836279,0.338364,-1.889204,-0.048349,0.021775,0.636513,1.639846,1
2,-1.027052,0.128451,-1.430012,0.469099,-1.060332,-1.913423,0.337845,-0.195644,-0.138022,1.755239,1.639846,0
3,-0.507929,0.120090,1.644770,0.469099,1.039171,0.338364,0.337845,-0.584035,0.579896,0.271455,-0.471288,4
4,1.371197,2.450367,0.224054,0.469099,0.438397,-1.119801,0.337845,-0.081469,1.176486,0.523111,-0.471288,6
...,...,...,...,...,...,...,...,...,...,...,...,...
20753,0.227725,0.760293,0.996987,0.469099,0.888355,0.338364,0.337845,0.201151,0.416056,-0.697686,-0.471288,3
20754,-1.027052,0.111729,-1.436296,-2.131745,1.039171,1.756085,-1.889204,-1.691863,1.214691,0.636513,-0.471288,0
20755,-0.657669,1.366537,0.670717,0.469099,-0.071439,0.338364,0.337845,-0.048349,0.210303,0.966092,1.639846,3
20756,1.760067,-0.002805,-0.165574,0.469099,0.422594,-1.119801,0.337845,0.189694,-1.171141,0.593055,1.639846,6


In [19]:
def Label_Encoding(data1):
        data=data1
        enc=[]
        x=pd.DataFrame(data.dtypes)
        ll=list(x[x[0]=="object"].index)
        print(ll)
        for i in ll:
            label_encoder = LabelEncoder()
            label_encoder.fit(data[i])
            data[i]= label_encoder.transform(data[i])
            enc.append(label_encoder)
        return (data,enc)

In [None]:
data['id'].unique()

array([    0,     1,     2, ..., 20755, 20756, 20757])

In [50]:
data.drop('id',axis=1,inplace=True)

In [None]:
def outlier_remove(data,col):

        q1=data[col].quantile(0.25)
        q3=data[col].quantile(0.75)
        iqr=q3-q1
        l_whis=q1-1.5*iqr
        u_whis=q3+1.5*iqr
        data= data[(data[col]>=l_whis)& (data[col]<=u_whis)]
        return data


In [None]:
data=outlier_remove(data,'SMOKE')

In [None]:
data=outlier_remove(data,'FAVC')

In [None]:
data=outlier_remove(data,'CAEC')

In [None]:
data=outlier_remove(data,'SCC')

In [51]:
le=Label_Encoding(data)

['family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS', 'NObeyesdad']


In [None]:
data.columns

Index(['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',
       'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE',
       'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')

In [7]:

def StdScale(data1):
        data=data1
        for i in data.columns:
            if data[i].dtypes!='object' and i!='NObeyesdad':
                scale = StandardScaler().fit(data[[i]])

                data[i] = scale.transform(data[[i]])
        return data

In [6]:
def drop_vif(data,thresh=3.5,col_Spare=['NObeyesdad','intercept']):


        vif=Utils_Suite(data).compute_vif()
        z1=vif[vif["vif"]>thresh]
        z1=z1.sort_values(by='vif', kind='mergesort',ascending=[False])
        while True:
            try:
                col=z1.iloc[0,0]
                if z1.empty:
                    break
                if col in col_Spare:
                    z1=z1.iloc[1:]
                    continue
                data=data.drop(col,axis=1)
                vif=Utils_Suite(data).compute_vif()
                z1=vif[vif["vif"]>thresh]
                z1=z1.sort_values(by='vif', kind='mergesort',ascending=[False])
            except IndexError:
                break
        return data

In [30]:
data

Unnamed: 0,id,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,-1.731967,0.105699,-0.002828,-0.235713,0.469099,0.30588,-0.836279,0.337845,-0.109287,1.206594,-0.185009,-1.171141,0.597438,-0.471288,0.429319,6
1,-1.731800,-1.027052,-1.606291,-1.170931,0.469099,0.30588,-0.836279,-1.889204,-0.109287,-0.048349,-0.185009,0.021775,0.636513,1.639846,-2.182324,1
2,-1.731634,-1.027052,0.128451,-1.430012,0.469099,0.30588,-1.060332,0.337845,-0.109287,-0.195644,-0.185009,-0.138022,1.755239,1.639846,0.429319,0
3,-1.731467,-0.507929,0.120090,1.644770,0.469099,0.30588,1.039171,0.337845,-0.109287,-0.584035,-0.185009,0.579896,0.271455,-0.471288,0.429319,4
4,-1.731300,1.371197,2.450367,0.224054,0.469099,0.30588,0.438397,0.337845,-0.109287,-0.081469,-0.185009,1.176486,0.523111,-0.471288,0.429319,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20753,1.731300,0.227725,0.760293,0.996987,0.469099,0.30588,0.888355,0.337845,-0.109287,0.201151,-0.185009,0.416056,-0.697686,-0.471288,0.429319,3
20754,1.731467,-1.027052,0.111729,-1.436296,-2.131745,0.30588,1.039171,-1.889204,-0.109287,-1.691863,-0.185009,1.214691,0.636513,-0.471288,0.429319,0
20755,1.731634,-0.657669,1.366537,0.670717,0.469099,0.30588,-0.071439,0.337845,-0.109287,-0.048349,-0.185009,0.210303,0.966092,1.639846,0.429319,3
20756,1.731800,1.760067,-0.002805,-0.165574,0.469099,0.30588,0.422594,0.337845,-0.109287,0.189694,-0.185009,-1.171141,0.593055,1.639846,-2.182324,6


In [None]:
Utils_Suite(data).compute_correlation()

Gender    0.046575
Age       0.283018
Height    0.060786
FCVC      0.041076
NCP      -0.091154
CAEC      0.297420
CH2O      0.187100
FAF      -0.096643
TUE      -0.076040
CALC     -0.168497
Name: NObeyesdad, dtype: float64

In [None]:
data.drop(columns=['FAVC','SMOKE','SCC','MTRANS'],inplace=True)

In [None]:
def drop_correlation(data1):
        data=data1
        k=Utils_Suite(data).compute_correlation(0.3)
        f=pd.DataFrame(k)
        m=list(f[(f['SalePrice']<0.1) & (f['SalePrice']>-0.1)].index)
        data=data.drop(columns=m)
        print(m)
        return data

In [None]:
 X_train1, X_test, y_train1, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

 X_train, X_val, y_train, y_val = train_test_split(X_train1, y_train1, test_size=0.25, random_state=1)

In [23]:
d1=drop_vif(data)

In [24]:
d1

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,-1.731967,1.004152,0.105699,-0.002828,-0.235713,0.469099,0.30588,-0.836279,0.314684,0.337845,-0.109287,1.206594,-0.185009,-1.171141,0.597438,-0.471288,0.429319,6
1,-1.731800,-0.995866,-1.027052,-1.606291,-1.170931,0.469099,0.30588,-0.836279,0.338364,-1.889204,-0.109287,-0.048349,-0.185009,0.021775,0.636513,1.639846,-2.182324,1
2,-1.731634,-0.995866,-1.027052,0.128451,-1.430012,0.469099,0.30588,-1.060332,-1.913423,0.337845,-0.109287,-0.195644,-0.185009,-0.138022,1.755239,1.639846,0.429319,0
3,-1.731467,-0.995866,-0.507929,0.120090,1.644770,0.469099,0.30588,1.039171,0.338364,0.337845,-0.109287,-0.584035,-0.185009,0.579896,0.271455,-0.471288,0.429319,4
4,-1.731300,1.004152,1.371197,2.450367,0.224054,0.469099,0.30588,0.438397,-1.119801,0.337845,-0.109287,-0.081469,-0.185009,1.176486,0.523111,-0.471288,0.429319,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20753,1.731300,1.004152,0.227725,0.760293,0.996987,0.469099,0.30588,0.888355,0.338364,0.337845,-0.109287,0.201151,-0.185009,0.416056,-0.697686,-0.471288,0.429319,3
20754,1.731467,1.004152,-1.027052,0.111729,-1.436296,-2.131745,0.30588,1.039171,1.756085,-1.889204,-0.109287,-1.691863,-0.185009,1.214691,0.636513,-0.471288,0.429319,0
20755,1.731634,1.004152,-0.657669,1.366537,0.670717,0.469099,0.30588,-0.071439,0.338364,0.337845,-0.109287,-0.048349,-0.185009,0.210303,0.966092,1.639846,0.429319,3
20756,1.731800,1.004152,1.760067,-0.002805,-0.165574,0.469099,0.30588,0.422594,-1.119801,0.337845,-0.109287,0.189694,-0.185009,-1.171141,0.593055,1.639846,-2.182324,6


In [26]:
data.drop('id')

KeyError: "['id'] not found in axis"

In [53]:
data

Unnamed: 0,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0.105699,-0.002828,-0.235713,0.469099,0.30588,-0.836279,0.337845,-0.109287,1.206594,-0.185009,-1.171141,0.597438,-0.471288,0.429319,6
1,-1.027052,-1.606291,-1.170931,0.469099,0.30588,-0.836279,-1.889204,-0.109287,-0.048349,-0.185009,0.021775,0.636513,1.639846,-2.182324,1
2,-1.027052,0.128451,-1.430012,0.469099,0.30588,-1.060332,0.337845,-0.109287,-0.195644,-0.185009,-0.138022,1.755239,1.639846,0.429319,0
3,-0.507929,0.120090,1.644770,0.469099,0.30588,1.039171,0.337845,-0.109287,-0.584035,-0.185009,0.579896,0.271455,-0.471288,0.429319,4
4,1.371197,2.450367,0.224054,0.469099,0.30588,0.438397,0.337845,-0.109287,-0.081469,-0.185009,1.176486,0.523111,-0.471288,0.429319,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20753,0.227725,0.760293,0.996987,0.469099,0.30588,0.888355,0.337845,-0.109287,0.201151,-0.185009,0.416056,-0.697686,-0.471288,0.429319,3
20754,-1.027052,0.111729,-1.436296,-2.131745,0.30588,1.039171,-1.889204,-0.109287,-1.691863,-0.185009,1.214691,0.636513,-0.471288,0.429319,0
20755,-0.657669,1.366537,0.670717,0.469099,0.30588,-0.071439,0.337845,-0.109287,-0.048349,-0.185009,0.210303,0.966092,1.639846,0.429319,3
20756,1.760067,-0.002805,-0.165574,0.469099,0.30588,0.422594,0.337845,-0.109287,0.189694,-0.185009,-1.171141,0.593055,1.639846,-2.182324,6


In [52]:
data=StdScale(data)

In [None]:
data.describe()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
count,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0
mean,1.3349630000000001e-17,1.889486e-16,-1.107677e-15,-1.29731e-16,2.8753050000000004e-17,-5.2371630000000003e-17,1.095354e-16,1.7628360000000002e-17,4.5525670000000007e-17,9.584351e-18,-2.931785e-16,3.5599020000000005e-17,3.9022e-17,1.328117e-16,1.266504e-16,-2.6014670000000003e-17,2.991473
std,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.893176
min,-0.9958656,-1.730295,-2.866172,-1.853297,-2.131745,-3.26926,-2.711729,-2.497077,-4.116253,-0.109287,-1.691863,-0.1850094,-1.171141,-1.024344,-2.582422,-2.182324,0.0
25%,-0.9958656,-0.6754304,-0.7832905,-0.8297482,0.4690992,0.3058796,-0.836279,0.3383641,0.3378445,-0.109287,-0.3901641,-0.1850094,-1.161583,-1.024344,-0.471288,0.4293186,1.0
50%,-0.9958656,-0.1804501,-0.002805357,-0.1449229,0.4690992,0.3058796,-0.0976573,0.3383641,0.3378445,-0.109287,-0.0483493,-0.1850094,0.02177483,-0.07119963,-0.471288,0.4293186,3.0
75%,1.004152,0.3794339,0.7174689,0.8989331,0.4690992,0.3058796,1.039171,0.3383641,0.3378445,-0.109287,0.854954,-0.1850094,0.722501,0.6365129,-0.471288,0.4293186,4.0
max,1.004152,6.53281,3.154492,2.925436,0.4690992,0.3058796,1.039171,1.756085,2.564893,9.15022,1.595165,5.40513,2.407607,2.297369,1.639846,1.299866,6.0


In [54]:
from sklearn.model_selection import train_test_split
x=data.iloc[:,:-1].values
y=data.iloc[:,-1].values
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=2024)

In [126]:
!pip install lazypredict

Collecting lazypredict
  Downloading lazypredict-0.2.12-py2.py3-none-any.whl (12 kB)
Installing collected packages: lazypredict
Successfully installed lazypredict-0.2.12


In [127]:
from lazypredict.Supervised import LazyClassifier

In [128]:
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(x_train, x_test, y_train, y_test)

 97%|█████████▋| 28/29 [01:18<00:01,  1.63s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001960 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1808
[LightGBM] [Info] Number of data points in the train set: 14530, number of used features: 14
[LightGBM] [Info] Start training from score -2.108065
[LightGBM] [Info] Start training from score -1.896432
[LightGBM] [Info] Start training from score -1.975100
[LightGBM] [Info] Start training from score -1.855115
[LightGBM] [Info] Start training from score -1.641964
[LightGBM] [Info] Start training from score -2.150895
[LightGBM] [Info] Start training from score -2.093441


100%|██████████| 29/29 [01:20<00:00,  2.77s/it]


In [130]:
hyperparameter_grid = {
    'n_estimators': [100, 400, 800],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.05, 0.1, 0.20],
    'min_child_weight': [1, 10, 100]
    }

In [135]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

In [136]:
cls_grid= xgb.XGBClassifier()
gr=GridSearchCV(cls_grid,param_grid=hyperparameter_grid,cv=5)  # class model,param,crossval=5
gr.fit(x_train,y_train)
gr.best_params_

KeyboardInterrupt: 

In [129]:
print(models)

                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
XGBClassifier                      0.90               0.89    None      0.90   
LGBMClassifier                     0.90               0.89    None      0.90   
RandomForestClassifier             0.89               0.88    None      0.89   
BaggingClassifier                  0.88               0.87    None      0.88   
ExtraTreesClassifier               0.87               0.85    None      0.87   
SVC                                0.85               0.84    None      0.85   
LogisticRegression                 0.85               0.83    None      0.85   
DecisionTreeClassifier             0.84               0.83    None      0.84   
LinearDiscriminantAnalysis         0.81               0.79    None      0.81   
NuSVC                              0.80               0.79    None      0.80   
ExtraTreeClassifier                0.75 

In [55]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=0)
clf.fit(x_train, y_train)

In [56]:
y_pred=clf.predict(x_test)

In [57]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.8919396274887604

In [None]:
y_predVal=clf.predict(X_train)
accuracy_score(y_train,y_predVal)

0.9980392156862745

In [None]:
y_predVal

array([3, 4, 5, ..., 5, 3, 1])

In [None]:
data.describe()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
count,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0
mean,-4.381418e-17,1.3349630000000001e-17,1.889486e-16,-1.107677e-15,-1.29731e-16,2.8753050000000004e-17,-5.2371630000000003e-17,1.095354e-16,1.7628360000000002e-17,4.5525670000000007e-17,9.584351e-18,-2.931785e-16,3.5599020000000005e-17,3.9022e-17,1.328117e-16,1.266504e-16,-2.6014670000000003e-17,2.991473
std,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.893176
min,-1.731967,-0.9958656,-1.730295,-2.866172,-1.853297,-2.131745,-3.26926,-2.711729,-2.497077,-4.116253,-0.109287,-1.691863,-0.1850094,-1.171141,-1.024344,-2.582422,-2.182324,0.0
25%,-0.8659837,-0.9958656,-0.6754304,-0.7832905,-0.8297482,0.4690992,0.3058796,-0.836279,0.3383641,0.3378445,-0.109287,-0.3901641,-0.1850094,-1.161583,-1.024344,-0.471288,0.4293186,1.0
50%,0.0,-0.9958656,-0.1804501,-0.002805357,-0.1449229,0.4690992,0.3058796,-0.0976573,0.3383641,0.3378445,-0.109287,-0.0483493,-0.1850094,0.02177483,-0.07119963,-0.471288,0.4293186,3.0
75%,0.8659837,1.004152,0.3794339,0.7174689,0.8989331,0.4690992,0.3058796,1.039171,0.3383641,0.3378445,-0.109287,0.854954,-0.1850094,0.722501,0.6365129,-0.471288,0.4293186,4.0
max,1.731967,1.004152,6.53281,3.154492,2.925436,0.4690992,0.3058796,1.039171,1.756085,2.564893,9.15022,1.595165,5.40513,2.407607,2.297369,1.639846,1.299866,6.0


In [None]:
data.columns

Index(['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',
       'FAVC', 'FCVC', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC',
       'MTRANS', 'NObeyesdad'],
      dtype='object')

In [108]:
data_pred=pd.read_csv(r'test.csv')

In [110]:
dd11=list(data.columns)
dd11.remove('NObeyesdad')

In [111]:
data_pred=data_pred[dd11]

In [40]:
data_pred

Unnamed: 0,id,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,20758,26.899886,1.848294,120.644178,yes,yes,2.938616,Sometimes,no,2.825629,no,0.855400,0.000000,Sometimes,Public_Transportation
1,20759,21.000000,1.600000,66.000000,yes,yes,2.000000,Sometimes,no,3.000000,no,1.000000,0.000000,Sometimes,Public_Transportation
2,20760,26.000000,1.643355,111.600553,yes,yes,3.000000,Sometimes,no,2.621877,no,0.000000,0.250502,Sometimes,Public_Transportation
3,20761,20.979254,1.553127,103.669116,yes,yes,2.000000,Sometimes,no,2.786417,no,0.094851,0.000000,Sometimes,Public_Transportation
4,20762,26.000000,1.627396,104.835346,yes,yes,3.000000,Sometimes,no,2.653531,no,0.000000,0.741069,Sometimes,Public_Transportation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13835,34593,23.327836,1.721384,78.030383,yes,no,2.813234,Sometimes,no,1.000000,no,0.807076,0.778632,Sometimes,Public_Transportation
13836,34594,29.000000,1.590000,62.000000,no,yes,3.000000,Sometimes,no,2.000000,no,0.000000,0.000000,Sometimes,Public_Transportation
13837,34595,22.935612,1.585547,44.376637,no,yes,3.000000,Frequently,no,2.000000,no,1.949840,1.000000,Sometimes,Public_Transportation
13838,34596,21.000000,1.620000,53.000000,yes,yes,2.000000,Sometimes,no,2.000000,no,3.000000,2.000000,no,Public_Transportation


In [None]:
data_pred.dtypes

Age       float64
Height    float64
Weight    float64
FCVC      float64
NCP       float64
CH2O      float64
FAF       float64
TUE       float64
CALC       object
dtype: object

In [None]:
data.dtypes

Age           float64
Height        float64
Weight        float64
FCVC          float64
NCP           float64
CH2O          float64
FAF           float64
TUE           float64
CALC          float64
NObeyesdad      int64
dtype: object

In [None]:
le[1][-3].classes_

array(['Frequently', 'Sometimes', 'no'], dtype=object)

In [None]:
len(le[1])

9

In [None]:
le[1]

[LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder()]

In [112]:
ll=le[1].copy()

In [81]:
ll

[LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder()]

In [113]:
ll.pop(-1)

In [114]:
ll1=ll.copy()


In [None]:
ll.pop(-1)

In [None]:
le[1]

[LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder()]

In [None]:
le[-1][-1].classes_

array(['Automobile', 'Bike', 'Motorbike', 'Public_Transportation',
       'Walking'], dtype=object)

In [None]:
x=pd.DataFrame(data_pred.dtypes)
ll=list(x[x[0]=="object"].index)
ll['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS', 'NObeyesdad']


['Gender',
 'family_history_with_overweight',
 'FAVC',
 'CAEC',
 'SMOKE',
 'SCC',
 'CALC',
 'MTRANS']

In [None]:
le[-1][-2].classes_

array(['Frequently', 'Sometimes', 'no'], dtype=object)

In [94]:
id=data_pred['id']

In [93]:
data_pred=data_pred.drop(data_pred[data_pred['CALC']=='Always'].index)

In [109]:
data_pred['CALC'] = np.where(data_pred['CALC'] == 'Always', 'Frequently', data_pred['CALC'])


In [None]:
data_pred['CALC'].unique()

array(['Sometimes', 'no', 'Frequently', 'Always'], dtype=object)

In [None]:
label_encoder=le[1][-3]
data_pred['CALC']= label_encoder.transform(data_pred['CALC'])

In [115]:
label_encoder = ll1
x=pd.DataFrame(data_pred.dtypes)
ll=list(x[x[0]=="object"].index)
j=0
for i in ll:
    print(i)
    data_pred[i]= label_encoder[j].transform(data_pred[i])
    j+=1

family_history_with_overweight
FAVC
CAEC
SMOKE
SCC
CALC
MTRANS


In [None]:
data['NObeyesdad'].unique()

array([6, 1, 0, 4, 3, 5, 2])

In [None]:
id                                  int64
Gender                             object
Age                               float64
Height                            float64
Weight                            float64
family_history_with_overweight     object
FAVC                               object
FCVC                              float64
NCP                               float64
CAEC                               object
SMOKE                              object
CH2O                              float64
SCC                                object
FAF                               float64
TUE                               float64
CALC                               object
MTRANS                             object
NObeyesdad                         object

In [None]:
data_pred.drop('NCP',axis=1,inplace=True)

In [None]:
le[-1]

[LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder(),
 LabelEncoder()]

In [74]:
data_pred1=pd.read_csv(r'test.csv')

In [67]:
data_pred

Unnamed: 0,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,26.899886,1.848294,120.644178,1,1,2.938616,2,0,2.825629,0,0.855400,0.000000,1,3
1,21.000000,1.600000,66.000000,1,1,2.000000,2,0,3.000000,0,1.000000,0.000000,1,3
2,26.000000,1.643355,111.600553,1,1,3.000000,2,0,2.621877,0,0.000000,0.250502,1,3
3,20.979254,1.553127,103.669116,1,1,2.000000,2,0,2.786417,0,0.094851,0.000000,1,3
4,26.000000,1.627396,104.835346,1,1,3.000000,2,0,2.653531,0,0.000000,0.741069,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13835,23.327836,1.721384,78.030383,1,0,2.813234,2,0,1.000000,0,0.807076,0.778632,1,3
13836,29.000000,1.590000,62.000000,0,1,3.000000,2,0,2.000000,0,0.000000,0.000000,1,3
13837,22.935612,1.585547,44.376637,0,1,3.000000,1,0,2.000000,0,1.949840,1.000000,1,3
13838,21.000000,1.620000,53.000000,1,1,2.000000,2,0,2.000000,0,3.000000,2.000000,2,3


In [116]:
data_pred=StdScale(data_pred)

In [117]:
d1=data_pred

In [118]:
y_pred=clf.predict(d1.values)

In [119]:
y_pred=le[-1][-1].inverse_transform(y_pred)

In [120]:
y_pred

array(['Obesity_Type_II', 'Overweight_Level_I', 'Obesity_Type_III', ...,
       'Insufficient_Weight', 'Normal_Weight', 'Obesity_Type_II'],
      dtype=object)

In [122]:
#y_pred=np.ndarray.flatten(y_pred)
y_pred=pd.Series(y_pred)
ds=pd.concat([pd.Series(data_pred1['id'].values),y_pred],axis=1)
ds.columns=['id','NObeyesdad']
ds.to_csv('randomforest_fixed_final_revision.csv',index=False)

In [None]:
data_pred1=pd.read_csv('test.csv')

In [None]:
from catboost import CatBoostClassifier


In [None]:
! pip install catboost

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


In [None]:
from sklearn.model_selection import cross_val_score, KFold

In [None]:
cat_boost_classifier = CatBoostClassifier(verbose=0)
cat_boost_classifier.fit(x_train, y_train)
kf = KFold(n_splits = 5)
scores = cross_val_score(cat_boost_classifier, x_train, y_train, scoring= 'accuracy', cv= kf)

In [None]:
scores.mean()


0.9033109612026161

In [None]:
pred_cat_boost_classifier = le[-1][-1].inverse_transform(cat_boost_classifier.predict(data_pred).ravel())


In [None]:
pred_cat_boost_classifier==y_pred

array([ True,  True,  True, ...,  True,  True,  True])

TypeError: ufunc 'isfinite' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [None]:
y_pred

0            Obesity_Type_II
1         Overweight_Level_I
2           Obesity_Type_III
3             Obesity_Type_I
4           Obesity_Type_III
                ...         
13833    Overweight_Level_II
13834          Normal_Weight
13835    Insufficient_Weight
13836    Insufficient_Weight
13837        Obesity_Type_II
Length: 13838, dtype: object