## Import Libraries and Configure Notebook

In [1]:
# %pip install lazypredict 

In [2]:
import gc
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.express as px
import plotly.graph_objects as go


from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder,StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
# import lazypredict
# from lazypredict.Supervised import LazyClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from lightgbm import early_stopping, log_evaluation
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold, cross_val_score
import optuna


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Constants
RANDOM = 1992

/kaggle/input/playground-series-s4e2/sample_submission.csv
/kaggle/input/playground-series-s4e2/train.csv
/kaggle/input/playground-series-s4e2/test.csv


In [3]:
# Color Palettes
colors = {
    "sage"     : "#53808D",
    "orange"   : "#FC4C02",
    "blue"     : "#1B99D6",
    "yellow"   : "#FFF57B",
    "darkgrey" : "#585858",
    "lightgrey": "#D5DADD",
    "lightsage": '#E9F2EF'
}
palette_discete = sns.color_palette([v for k,v in colors.items()])
palette_continuous = sns.color_palette(f"blend:{colors['sage']},{colors['orange']}", as_cmap=True) 

# custom theme
rc = {
    'axes.facecolor':f"{colors['lightsage']}",
    "axes.edgecolor":f"{colors['sage']}",
    "text.color":f"{colors['sage']}",
    "axes.labelcolor":f"{colors['sage']}",
    "ytick.color":f"{colors['sage']}",
    "xtick.color":f"{colors['sage']}",
}
sns.set_style('whitegrid',rc =rc)

# set context elements
sns.set_context('notebook',rc={'grid.linewidth':'1.1'})

# set custom palette
sns.set_palette(palette_discete)

palette_discete



## Data Descriptions

The attributes related with eating habits are: 
+ Frequent consumption of high caloric food (FAVC)
+ Frequency of consumption of vegetables (FCVC)
+ Number of main meals (NCP)
+ Consumption of food between meals (CAEC) => Ordinal  (always, frequently, sometimes, no=never)
+ Consumption of water daily (CH20)
+ Consumption of alcohol (CALC) => Ordinal (frequently, sometimes, no=never)

The attributes related with the physical condition are: 
+ Calories consumption monitoring (SCC)
+ Physical activity frequency (FAF)
+ Time using technology devices (TUE)
+ Transportation used (MTRANS)

variables obtained
+ Gender
+ Age
+ Height
+ Weight

NObesity values are

+ Insufficient_Weight Less than 18.5
+ Normal_Weight 18.5 to 24.9
+ Overweight 25.0 to 29.9
+ Obesity_Type_I 30.0 to 34.9
+ Obesity_Type_II 35.0 to 39.9
+ Obesity_Type_III Higher than 40

In [4]:
def get_data():
    """Read data from the csv, adjust data types"""
    train= pd.read_csv("/kaggle/input/playground-series-s4e2/train.csv")
    test = pd.read_csv("/kaggle/input/playground-series-s4e2/test.csv")
    
    # category features 
    cols = ['Gender','CAEC','CALC','MTRANS','NObeyesdad']
    for col in cols:
        if col == 'NObeyesdad':
            # test data doesn't have NObeyesdad
            train[col] = train[col].apply(lambda x: x.strip()).astype('category')
        else:
            train[col] = train[col].apply(lambda x: x.lower().strip()).astype('category')
            test[col] = test[col].apply(lambda x: x.lower().strip()).astype('category')
      
            
    # bindary features
    cols = ['family_history_with_overweight','FAVC','SMOKE','SCC']
    for col in cols:
        train[col] = train[col].map({'yes':1,'no':0}).astype('uint8')
        test[col] = test[col].map({'yes':1,'no':0}).astype('uint8')
        
    # rename columns
    train.rename(columns = {'family_history_with_overweight':'FamiliyHistory'},inplace=True)
    test.rename(columns = {'family_history_with_overweight':'FamiliyHistory'},inplace=True)
    
    return train, test


# get the raw data
train, test = get_data()
gc.collect()
train.head()

Unnamed: 0,id,Gender,Age,Height,Weight,FamiliyHistory,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,male,24.443011,1.699998,81.66995,1,1,2.0,2.983297,sometimes,0,2.763573,0,0.0,0.976473,sometimes,public_transportation,Overweight_Level_II
1,1,female,18.0,1.56,57.0,1,1,2.0,3.0,frequently,0,2.0,0,1.0,1.0,no,automobile,Normal_Weight
2,2,female,18.0,1.71146,50.165754,1,1,1.880534,1.411685,sometimes,0,1.910378,0,0.866045,1.673584,no,public_transportation,Insufficient_Weight
3,3,female,20.952737,1.71073,131.274851,1,1,3.0,3.0,sometimes,0,1.674061,0,1.467863,0.780199,sometimes,public_transportation,Obesity_Type_III
4,4,male,31.641081,1.914186,93.798055,1,1,2.679664,1.971472,sometimes,0,1.979848,0,1.967973,0.931721,sometimes,public_transportation,Overweight_Level_II


In [5]:
def add_features(df):
    """Add additional features"""
    dff = df.copy()
    
    # body-mass-index
    dff['BMI'] = dff['Weight'] / dff['Height']**2
    
    # physical transportation
    dff['TransPhysical'] = dff['MTRANS'].apply(lambda x: 1 if  x=='walking' or x=='bike' else 0)
    
    return dff

In [6]:
def encode_target_variable(df,direction='encode'):
    """
    Encode the target variable to ordinal integers for classification
    Decode the target variable from ordinal integers back to string for submission
    """
    if direction=='encode':
        encode = {
            'Insufficient_Weight': 0,
            'Normal_Weight'      : 1,
            'Overweight_Level_I' : 2,
            'Overweight_Level_II': 3,
            'Obesity_Type_I'     : 4,
            'Obesity_Type_II'    : 5,
            'Obesity_Type_III'   : 6
        }
        encoded = df.NObeyesdad.map(encode).to_frame('Target')
        dff = pd.concat([df,encoded],axis=1)
    else:
        decode = {
            0:'Insufficient_Weight',
            1:'Normal_Weight'      ,
            2:'Overweight_Level_I' ,
            3:'Overweight_Level_II',
            4:'Obesity_Type_I'     ,
            5:'Obesity_Type_II'    ,
            6:'Obesity_Type_III'   
        }
        decoded = df.Target.map(decode).to_frame('NObeyesdad')
        dff = pd.concat([df,decoded],axis=1).drop(columns=['Target'])
    return dff

In [7]:
def describe_train_test(y_train, y_test):
    """"""
        
    tn = pd.DataFrame(y_train.value_counts()).reset_index()
    tn['train_pct'] = round((tn['count']/tn['count'].sum())*100,2)
    tn.drop(columns=['count'],inplace=True)
    
    ts = pd.DataFrame(y_test.value_counts()).reset_index()
    ts['test_pct'] = round((ts['count']/ts['count'].sum())*100,2)
    ts.drop(columns=['count'],inplace=True)
    
    return tn.merge(ts, on='Target',how='inner').set_index('Target')    

In [8]:
# encode the target variable
train = encode_target_variable(train,'encode')

# add features
train = add_features(train)
test = add_features(test)

# test data is used for submission (does not include the Exited(y) variable)
X_submit_ids = test.id  # keep ids for later
X_submit = test.drop(columns = ['id'])

# train data
X = train.drop(columns=['Target','id','NObeyesdad'])
y = train.Target

# split train data into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=RANDOM,stratify=y)

# remove train/test
# del train
# del test
# gc.collect()

# describe train/test split
describe_train_test(y_train,y_test)

Unnamed: 0_level_0,train_pct,test_pct
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
6,19.49,19.48
5,15.64,15.66
1,14.84,14.86
4,14.02,14.02
0,12.15,12.16
3,12.15,12.14
2,11.69,11.68


In [9]:
def convert_to_df (transformed, processor):
    """Convert from array to dataframe after preprocessing the data"""
    df = pd.DataFrame(transformed)
    df.columns = processor.get_feature_names_out()
    return df
    

## Pipelines and Transformers
https://medium.com/analytics-vidhya/how-to-apply-preprocessing-steps-in-a-pipeline-only-to-specific-features-4e91fe45dfb8

In [10]:
# numeric transformers
numeric_features = ['Age','Height','Weight','FCVC','NCP','CH2O','FAF','TUE','BMI']
numeric_transformer = Pipeline(
    steps = [('scaler',StandardScaler())]
)

# categorical encoders
category_features =  ['Gender','MTRANS']
category_transformer = Pipeline(
    steps =[('onehot_encoder',OneHotEncoder())]
)

# ordinal features
ordinal_features = ['CAEC','CALC']
ordinal_feature_order = [['always','frequently','sometimes','no'],['always','frequently','sometimes','no']]
ordinal_transformer = Pipeline(
    steps =[('ordinal_encoder',OrdinalEncoder(categories = ordinal_feature_order))]
)

# apply the column transforms
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numeric_transformer,numeric_features),
        ('cat',category_transformer,category_features),
        ('ord',ordinal_transformer,ordinal_features),
    ],
    remainder = 'passthrough'
)

# prepare data for classification
X_array = preprocessor.fit_transform(X)
X_train_array = preprocessor.fit_transform(X_train)
X_test_array = preprocessor.fit_transform(X_test)
X_submit_array = preprocessor.fit_transform(X_submit)


# convert to dataframes
X = convert_to_df(X_array,preprocessor)
X_train = convert_to_df(X_train_array,preprocessor)
X_test =  convert_to_df(X_test_array,preprocessor)
X_submit = convert_to_df(X_submit_array,preprocessor)


# clear memmory
gc.collect()


108

## Exploratory Data Analysis

In [11]:
def plot_correlations(X_train, y_train, y_label, only_vs_target = True):
    """ Plot the correlations between features"""
    
    df_y = pd.DataFrame(y_train)
    df_y.columns = [y_label]
    
    # combine with input features
    df = pd.concat([X_train,df_y],axis=1)
    
    # calc the correlations
    c = df.corr().round(decimals=2)
    
    # subset to only show correlations with the target variable
    if only_vs_target:
        c = pd.DataFrame(c[y_label].sort_values()).T
        w = 1200
        h = 500
        title = "<b>Correlation of Features to Target</b>"
    else:
        w = 1000
        h = 1000
        title = "<b>Correlation of Features</b>"
    
    # plot the correlation matrix
    fig = px.imshow(
        c, 
        text_auto=True, 
        color_continuous_scale = 'RdYlbu',
        title = title,
        template = 'plotly_white',
        width = w,
        height= h,
    )
    fig.show()

In [12]:
plot_correlations(X_train, y_train, 'Target',False)

## Model Training
### Plotting & Evaluation Functions

In [13]:
def plot_training_metrics(model, model_type, metric='auc'):
    """
    Plot the evalution metric for train/validation datasets against each epoch
    Supports xgb, catboost and light gbm models
    """
    # extract train/validation results from the model
    if model_type == 'xgb':
        d = model.evals_result_
        t = d['validation_0'][metric]
        v = d['validation_1'][metric]
    elif model_type == "cat":
        d = model.evals_result_
        t = d["validation_0"][str.upper(metric)]
        v = d["validation_1"][str.upper(metric)]
    elif model_type == 'lgb':
        d = model.evals_result_
        t = d['training'][metric]
        v = d['valid_1'][metric]
        
    # convert to dataframe
    df_eval = pd.DataFrame(list(zip(t, v)), columns=["train", "val"])
    
    # create the plot
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=df_eval.index,
            y=df_eval.train,
            name="Training",
            line=dict(color=colors["sage"], width=2),
        )
    )
    fig.add_trace(
        go.Scatter(
            x=df_eval.index,
            y=df_eval.val,
            name="Validation",
            line=dict(color=colors["orange"], width=2),
        )
    )
    fig.update_layout(
        template="plotly_white",
        title=f"<b>Training Metrics: {metric.upper()}</b>",
        width=800,
        height=400,
    )
    fig.show()

In [14]:
def get_feature_importance(model, model_type, importance_type, normalize=True):
    """
    Extract the feature importance from a trained model
    model: trained  model
    model_type: model name (xgb, lgbm, cat)
    importance_type: weight, gain, cover (only applies to xgb)
    """

    # extract feature importance for each cv
    if model_type == "xgb":
        scores = model.get_booster().get_score(importance_type=importance_type)
        df = pd.DataFrame(dict(feature=scores.keys(),score=scores.values()))
    elif model_type == "lgb":
        # importance_type is set during model training=> importance_type='gain','split'
        names = model.feature_name_
        scores = model.feature_importances_
        df = pd.DataFrame(dict(feature=names,score=scores))
    elif model_type == "cat":
        # only weight is available. Gain requires that training dataset and is calculated post-training
        df = (
             model.get_feature_importance(prettified=True)
            .rename(columns={'Feature Id':'feature','Importances':'score'})
        )
    else:
        print(f"Model not recognized:{model_type} (get_feature_importances)")
        df = None
    
    # normalize the score out of 100%
    if normalize:
        df['score'] = df["score"] / df["score"].sum()
        
    # sort
    df.sort_values(by="score", ascending=True, inplace=True)
    
    return df

In [15]:
def plot_feature_importance(model, model_type, importance_type, normalize=True, top_n=None):
    """
    Plot a bar chart of feature importance
    Params:
        df_importance: a dataframe with the output from calc_feature_importance
    """
    # get feature importance
    df = get_feature_importance(model, model_type, importance_type, normalize)

    # filter by the top-n features & resort
    if top_n is not None:
        df = df.nlargest(top_n, "score")
        df.sort_values(by="score", ascending=True, inplace=True)

    # create plot
    fig = go.Figure()

    # create the plot of feature importance
    fig.add_trace(
        go.Bar(
            name="Importance",
            x=df.score,
            y=df.feature,
            orientation="h",
            marker=dict(color=colors["sage"]),
        )
    )
    fig.update_layout(
        title=f"<b>Feature Importance for the {model_type} model</b><br>Calculated using feature {importance_type}",
        template="plotly_white",
        width=500,
        height=500,
        yaxis=dict(type="category", tickmode="linear"),
    )
    fig.show()

### XGBoost
#### XGB Hyperparameter Tuning

In [16]:
def objective(trial):
    #XGB params to tune
    params = {
        'n_estimators'     : 500,
        'objective'        : 'multi:softprob',
        'tree_method'      :'auto',
        'eval_metric'      :'mlogloss',
        'random_state'     : RANDOM,
        'learning_rate'    : trial.suggest_float('learning_rate', 0.03, 0.1),  
        'max_depth'        : trial.suggest_int('max_depth',3, 12),
        'min_child_weight' : trial.suggest_int('min_child_weight',3,12),
        'gamma'            : trial.suggest_float('gamma', 0, 0.5), 
        'subsample'        : trial.suggest_float('subsample', 0.5, 0.9), 
        'colsample_bytree' : trial.suggest_float('colsample_bytree', 0.5, 0.9), 
        'lambda'           : trial.suggest_float('lambda', 0.5, 0.9),
        'alpha'            : trial.suggest_float('alpha', 0.5, 0.9),
    }
    # model
    xgb = XGBClassifier(**params)

    # use stratified kfold to evalute each set of model weights
    skf= StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM)

    # Calculate the auc for each set of model weights
    acc = abs(cross_val_score(xgb, X_train, y_train, cv=skf, scoring='accuracy').mean())
    
    return acc


# Optimize Hyperparameters
# =====================================================================================================
# # create the study and run the optimization
# study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM))
# study.optimize(objective, n_trials=50)

# # get the best weights from the optimization
# xgb_best = study.best_params
# print(study.best_value)
# print(xgb_best)

# # clear some memmory
# gc.collect()

In [17]:
xgb_best = {
    'learning_rate'   : 0.050609513550763256, 
    'max_depth'       : 4, 
    'min_child_weight': 8, 
    'gamma'           : 0.000577280415637595, 
    'subsample'       : 0.7309115689873373, 
    'colsample_bytree': 0.7157383187006658, 
    'lambda'          : 0.8253860980908846, 
    'alpha'           : 0.505765671772347,
    'objective'       : 'multi:softprob',
    'tree_method'     :'auto',
    'eval_metric'     :'mlogloss',
    'random_state'    : RANDOM,
}
xgb_params = {
    'n_estimators'          : 5000,
    'early_stopping_rounds' :75,
}
# train params
xgb_params.update(xgb_best)

# train
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train,  eval_set =[(X_train,y_train),(X_test,y_test)],verbose=0)

# plot training metrics
plot_training_metrics(xgb_model, 'xgb', 'mlogloss')

# test on train data 
y_pred = xgb_model.predict(X_test)

# plot feature importance
plot_feature_importance(xgb_model,'xgb','gain',True,15)

# accuracy
acc = accuracy_score(y_test,y_pred)
print(f"Accuracy Score:{acc}")

# clear some memmory
gc.collect()

Accuracy Score:0.8754816955684007


1614

## Light GBM
### LGBM Hyperparameter Tuning

In [18]:
def objective(trial):
    #LGB params to tune
    params = {
        'objective'           : 'multiclass',
        'boosting_type'       : 'gbdt',
        'random_state'        : RANDOM,
        'n_estimators'        : 500,
        'metric'              : "multi_logloss",
        "verbosity"         : -1,
        'n_jobs'              : -1,
        'learning_rate'       : trial.suggest_float('learning_rate', 0.01, 0.1),  
        'max_depth'           : trial.suggest_int('max_depth',3, 12),
        'min_child_samples'   : trial.suggest_int('min_child_samples',10,60),
        'num_leaves'          : trial.suggest_int('num_leaves', 30, 60), 
        'subsample'           : trial.suggest_float('subsample', 0.3, 0.9), 
        'colsample_bytree'    : trial.suggest_float('colsample_bytree', 0.3, 0.9), 
        'reg_lambda'          : trial.suggest_float('reg_lambda', 0.1, 5),
        'reg_alpha'           : trial.suggest_float('reg_alpha', 0.1, 5),
    }

    # model
    lgb = LGBMClassifier(**params)

    # use stratified kfold to evalute each set of model weights
    skf= StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM)

    # Calculate the auc for each set of model weights
    acc = cross_val_score(lgb, X_train, y_train, cv=skf, scoring='accuracy').mean()
    
    return acc


# Optimize Hyperparameters
# =====================================================================================================
# # create the study and run the optimization
# study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM))
# study.optimize(objective, n_trials=50)

# # get the best weights from the optimization
# lgb_best = study.best_params
# print(study.best_value)
# print(lgb_best)

# # clear some memmory
# gc.collect()

In [19]:
lgb_best = {
    "objective"         : "multiclass",
    "metric"            : "multi_logloss",
    "verbosity"         : -1,
    "boosting_type"     : "gbdt",
    "random_state"      : RANDOM,
    'feature_pre_filter': False,
    'learning_rate'     : 0.02652139516618378, 
    'max_depth'         : 5, 
    'min_child_samples' : 12, 
    'num_leaves'        : 42, 
    'subsample'         : 0.8237381030243406, 
    'colsample_bytree'  : 0.5284799300520185, 
    'reg_lambda'        : 4.303870659102397, 
    'reg_alpha'         : 2.0590307690968483,
    'bagging_freq'      : 4,
}

# lgb_best = {
#     "objective"         : "multiclass",
#     "metric"            : "multi_logloss",
#     "verbosity"         : -1,
#     "boosting_type"     : "gbdt",
#     "random_state"      : RANDOM,
#     "num_class"         : 7,
#     "learning_rate"     :  0.01386432121252535,
#     'n_estimators'      : 494,
#     'feature_pre_filter': False,
#     'lambda_l1'         : 1.2149501037669967e-07,
#     'lambda_l2'         : 0.9230890143196759,
#     'num_leaves'        : 31,
#     'feature_fraction'  : 0.5,
#     'bagging_fraction'  : 0.5523862448863431,
#     'bagging_freq'      : 4,
#     'min_child_samples' : 20
# }
lgb_params = {
    'n_estimators'          : 5000,
    'early_stopping_rounds' :100
}
lgb_params.update(lgb_best)


lgb_model = LGBMClassifier(**lgb_params)
lgb_model.fit(X_train, y_train,  eval_set =[(X_train,y_train),(X_test,y_test)])

# plot training metrics
plot_training_metrics(lgb_model, 'lgb', 'multi_logloss')

# test on train data 
y_pred = lgb_model.predict(X_test)

# plot feature importance
plot_feature_importance(lgb_model,'lgb','weight',True, 15)

# accuracy
acc = accuracy_score(y_test,y_pred)
print(f"Accuracy Score:{acc}")

# clear some memmory
gc.collect()



Accuracy Score:0.8723506743737958


1522

## Submit the Model
### Single XGB Model

In [20]:
xgb_best = {
    'learning_rate'   : 0.050609513550763256, 
    'max_depth'       : 4, 
    'min_child_weight': 8, 
    'gamma'           : 0.000577280415637595, 
    'subsample'       : 0.7309115689873373, 
    'colsample_bytree': 0.7157383187006658, 
    'lambda'          : 0.8253860980908846, 
    'alpha'           : 0.505765671772347,
    'objective'       : 'multi:softprob',
    'tree_method'     :'auto',
    'eval_metric'     :'mlogloss',
    'random_state'    : RANDOM,
}
xgb_params = {
    'n_estimators'          : 250,
}
# train params
xgb_params.update(xgb_best)


# train the model on all available data
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(X,y,verbose=0)

# predict on the test set
y_pred = xgb_model.predict(X_submit)

# assemble the submission data, decode the target variable back to strings
df_submission = pd.DataFrame(zip(X_submit_ids,y_pred),columns = ['id','Target'])
df_submission = encode_target_variable(df_submission,'decode')
df_submission.to_csv("/kaggle/working/submission.csv",index=False)
df_submission.head()

Unnamed: 0,id,NObeyesdad
0,20758,Obesity_Type_II
1,20759,Overweight_Level_I
2,20760,Obesity_Type_III
3,20761,Obesity_Type_I
4,20762,Obesity_Type_III


### Single Light GBM Model

In [21]:
lgb_best = {
    "objective"         : "multiclass",
    "metric"            : "multi_logloss",
    "verbosity"         : -1,
    "boosting_type"     : "gbdt",
    "random_state"      : RANDOM,
    "num_class"         : 7,
    "learning_rate"     :  0.013,
    'feature_pre_filter': False,
    'lambda_l1'         : 1.2149e-07,
    'lambda_l2'         : 0.9230,
    'num_leaves'        : 31,
    'feature_fraction'  : 0.5,
    'bagging_fraction'  : 0.55231,
    'bagging_freq'      : 4,
    'min_child_samples' : 20
}
lgb_params = {
    'n_estimators'          : 500,
}
lgb_params.update(lgb_best)


# train the model on all available data
lgb_model = LGBMClassifier(**lgb_params)
lgb_model.fit(X,y)

# predict on the test set
y_pred = lgb_model.predict(X_submit)

# assemble the submission data, decode the target variable back to strings
df_submission = pd.DataFrame(zip(X_submit_ids,y_pred),columns = ['id','Target'])
df_submission = encode_target_variable(df_submission,'decode')
df_submission.to_csv("/kaggle/working/submission.csv",index=False)
df_submission.head()

Unnamed: 0,id,NObeyesdad
0,20758,Obesity_Type_II
1,20759,Overweight_Level_I
2,20760,Obesity_Type_III
3,20761,Obesity_Type_I
4,20762,Obesity_Type_III


## Lazy Classifer 

In [22]:

# lazy classifer 
# clf = LazyClassifier(verbose=0, ignore_warnings=True, predictions=True, custom_metric=None,random_state = RANDOM)
# models, predictions = clf.fit(X_train,X_test, y_train, y_test)

# gc.collect()

In [23]:
# models

In [24]:

# sns.relplot(data=train, kind='scatter',hue='Gender',y='Height',x='Weight',col='Target')

# sns.boxplot(data=train,x='Age',y='Target')
# sns.boxplot(data=train,x='Height',y='Target')
# sns.boxplot(data=train,x='Weight',y='Target')
# sns.boxplot(data=train,x='BMI',y='Target')

# sns.pairplot(data=train[cols_numeric], hue='Target')


In [25]:
# from ydata_profiling import ProfileReport
# ProfileReport(train)