In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split , KFold
from sklearn.preprocessing import MinMaxScaler , LabelEncoder
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam , SGD
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from sklearn.model_selection import GridSearchCV
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Read Dataset**

In [None]:
# Load the competition train/test tables and report their (rows, columns) shapes.
train_path = '/kaggle/input/playground-series-s3e16/train.csv'
test_path = '/kaggle/input/playground-series-s3e16/test.csv'
dftrain, dftest = pd.read_csv(train_path), pd.read_csv(test_path)

for label, frame in (
    ('The dimension of dftrain', dftrain),
    ('The dimension of dftest', dftest),
):
    print(label, frame.shape)

In [None]:
# Print the column names, dtypes and non-null counts of the training frame.
dftrain.info()

In [None]:
# Preview the first rows of the training frame.
dftrain.head()

# **Data Cleaning**

In [None]:
# Count missing values per column in both frames and print them one after the other.
missingtrain, missingtest = (frame.isnull().sum() for frame in (dftrain, dftest))
print(missingtrain, "---------------------", missingtest, sep="\n")

In [None]:
# Summary statistics of the training columns (used to spot implausible values).
dftrain.describe()

In [None]:
# A Height of exactly 0 is treated as a bad measurement: report how many
# training rows have it, then substitute a fixed fill value.
# NOTE(review): 0.348089 is presumably the mean Height shown by describe() — confirm.
height_fill = 0.348089
zero_height_mask = dftrain['Height'] == 0
print("number of zero in train['Height'] :", int(zero_height_mask.sum()))
dftrain['Height'] = dftrain['Height'].replace(0, height_fill)
dftrain.describe()

In [None]:
# Display the training frame.
dftrain

In [None]:
# Encode the categorical 'Sex' column as integers.
# The encoder is fitted on the training data only and then *applied* to the
# test data, so both frames share one category -> integer mapping. Fitting a
# second time on the test frame could assign different integers to the same
# category whenever the two frames do not contain the same set of labels.
le = LabelEncoder()
dftrain['Sex'] = le.fit_transform(dftrain['Sex'])
# transform() raises ValueError on a label not seen during fit, which is
# preferable to silently producing an inconsistent encoding.
dftest['Sex'] = le.transform(dftest['Sex'])
dftrain.head()

In [None]:
# Pairwise correlation of the training columns, drawn as an annotated heatmap.
# The matrix is computed once and reused for the plot.
corr = dftrain.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, annot=True, cmap='magma', fmt=".2f", ax=ax)
ax.set_title("Correlation Coefficients")
plt.show()

# **Identify : X_Train & Y_Train**

In [None]:
# Two feature sets are prepared:
#   * a four-column subset (X_Train / Test)
#   * all eight predictors (X_Train1 / Test1)
# Both share the same target column, 'Age'.
subset_columns = ['Shell Weight','Height','Diameter','Length']
full_columns = ['Sex','Length','Diameter','Height','Weight','Shucked Weight','Viscera Weight','Shell Weight']
target_columns = ['Age']

X_Train, Test = dftrain[subset_columns], dftest[subset_columns]
X_Train1, Test1 = dftrain[full_columns], dftest[full_columns]

Y_Train = dftrain[target_columns]
Y_Train1 = dftrain[target_columns]

# **Normalize Data**

In [None]:
# Normalise the four-feature subset so every value lies between 0 and 1.
# The scaler learns min/max from the training data only; the test data is then
# mapped with those same statistics. Re-fitting on the test frame would scale
# it differently from the data the model was trained on.
scaler = MinMaxScaler()
X_Train = pd.DataFrame(scaler.fit_transform(X_Train), index=X_Train.index, columns=X_Train.columns)
# Test values outside the training range may fall slightly outside [0, 1].
Test = pd.DataFrame(scaler.transform(Test), index=Test.index, columns=Test.columns)

In [None]:
# Display the scaled four-feature training frame.
X_Train

# **Model Training & Evaluation**

In [None]:
# Hyper-parameter grids handed to GridSearchCV, one per model family.
LR_parameters = dict(fit_intercept=[True, False])
LGB_parameters = dict(
    n_estimators=[100, 500, 1000],
    learning_rate=[0.01, 0.001],
)
XGB_parameters = dict(
    n_estimators=[100, 500, 1000],
    learning_rate=[0.01, 0.001],
)
RF_parameters = dict(
    n_estimators=[100, 500, 1000],
    max_depth=[3, 10],
)

def _build_mlp(n_features):
    """Build and compile a fresh, untrained MLP regressor.

    Parameters
    ----------
    n_features : int
        Number of input columns fed to the network.

    Returns
    -------
    Sequential
        Two sigmoid hidden layers (128 and 64 units) and one linear output,
        compiled with Adam (learning rate 0.001) and mean-squared-error loss.
    """
    model = Sequential()
    model.add(Dense(128, activation='sigmoid', input_shape=(n_features,)))
    model.add(Dense(64, activation='sigmoid'))
    model.add(Dense(1))
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
    return model


# Per-fold mean absolute error of each model, filled by the loops below.
# The values are plain MAE on rounded predictions (no square root), so they
# match the "MAE" wording of the printed messages.
maeLRarr = []
maeLGBarr = []
maeXGBarr = []
maeRFarr = []
maeMLParr = []

# ------------------------- Linear Regression (scaled four-feature subset) ---------------------------------------
cv = KFold(n_splits=10)
for i, (train_idx, test_idx) in enumerate(cv.split(X_Train, Y_Train)):
    xxTrain, xxTest = X_Train.iloc[train_idx], X_Train.iloc[test_idx]
    yyTrain, yyTest = Y_Train.iloc[train_idx], Y_Train.iloc[test_idx]

    lr_grid_search = GridSearchCV(LinearRegression(), LR_parameters)
    lr_grid_search.fit(xxTrain, yyTrain)
    best_params = lr_grid_search.best_params_

    # GridSearchCV (refit=True by default) has already refitted the winning
    # configuration on the whole training fold, so no second fit is needed.
    lr = lr_grid_search.best_estimator_

    pred = lr.predict(xxTest)
    maeLR = mean_absolute_error(yyTest, np.round(pred))
    maeLRarr.append(maeLR)
    print('Fold', i, '==> LinearRegression of MAE is ==>', maeLR)

# ------------------------- Models trained on all eight features -------------------------------------------------
cv = KFold(n_splits=10)
for k, (train_idx1, test_idx1) in enumerate(cv.split(X_Train1, Y_Train1)):
    xxTrain1, xxTest1 = X_Train1.iloc[train_idx1], X_Train1.iloc[test_idx1]
    yyTrain1, yyTest1 = Y_Train1.iloc[train_idx1], Y_Train1.iloc[test_idx1]

    # NOTE(review): for LGBM and XGBoost the held-out fold is used both for
    # early stopping and for scoring, which makes their fold scores optimistic.
    # NOTE(review): both grid searches tune with an explicit objective that the
    # refitted models below do not receive — confirm which objective is intended.

#     ------------------------- LGBM Regressor -------------------------------------------------------------------

    lgb_grid_search = GridSearchCV(LGBMRegressor(objective='mae'), LGB_parameters, scoring='neg_mean_absolute_error')
    lgb_grid_search.fit(xxTrain1, yyTrain1)

    lgb_best_params = lgb_grid_search.best_params_

    lgb = LGBMRegressor(**lgb_best_params, early_stopping_rounds=500)
    lgb.fit(xxTrain1, yyTrain1, eval_set=[(xxTrain1, yyTrain1), (xxTest1, yyTest1)], verbose=0)

    predLGB = lgb.predict(xxTest1)
    maeLGB = mean_absolute_error(yyTest1, np.round(predLGB))
    maeLGBarr.append(maeLGB)

#     ------------------------- XGB Regressor --------------------------------------------------------------------

    xgb_grid_search = GridSearchCV(XGBRegressor(objective="reg:pseudohubererror"), XGB_parameters, scoring='neg_mean_absolute_error')
    xgb_grid_search.fit(xxTrain1, yyTrain1)

    xgb_best_params = xgb_grid_search.best_params_

    xgb = XGBRegressor(**xgb_best_params, early_stopping_rounds=500)
    xgb.fit(xxTrain1, yyTrain1, eval_set=[(xxTrain1, yyTrain1), (xxTest1, yyTest1)], verbose=0)

    predXGB = xgb.predict(xxTest1)
    maeXGB = mean_absolute_error(yyTest1, np.round(predXGB))
    maeXGBarr.append(maeXGB)

#     ------------------------- Random Forest --------------------------------------------------------------------

    rf_grid_search = GridSearchCV(RandomForestRegressor(), RF_parameters, scoring='neg_mean_absolute_error')
    # The forest expects a 1-D target, so the single-column frame is flattened.
    yyTrain1_flattened = np.ravel(yyTrain1)
    rf_grid_search.fit(xxTrain1, yyTrain1_flattened)

    rf_best_params = rf_grid_search.best_params_

    # Reuse the estimator GridSearchCV already refitted on the training fold
    # instead of training the same forest a second time.
    rf = rf_grid_search.best_estimator_
    predRF = rf.predict(xxTest1)
    maeRF = mean_absolute_error(yyTest1, np.round(predRF))
    maeRFarr.append(maeRF)

#     ------------------------- MLP ------------------------------------------------------------------------------

    # A new network per fold: reusing one instance would keep the weights
    # learned in earlier folds, whose training rows include this fold's
    # held-out rows, and so leak them into the score.
    mlp = _build_mlp(xxTrain1.shape[1])
    mlp.fit(xxTrain1, yyTrain1, batch_size=32, epochs=50, verbose=0)
    predMLP = mlp.predict(xxTest1)
    maeMLP = mean_absolute_error(yyTest1, np.round(predMLP))
    maeMLParr.append(maeMLP)

#     ------------------------------------------------------------------------------------------------------------
    print('Fold', k, '==> LGBM of MAE is ==>', maeLGB)
    print('Fold', k, '==> XGBoost of MAE is ==>', maeXGB)
    print('Fold', k, '==> MLP of MAE is ==>', maeMLP)
    print('Fold', k, '==> RF of MAE is ==>', maeRF)

In [None]:
# Show the hyper-parameters chosen in the last cross-validation fold
# (order: LGBM, linear regression, XGBoost, random forest).
for chosen_params in (lgb_best_params, best_params, xgb_best_params, rf_best_params):
    print(chosen_params)

In [None]:
# Average each model's per-fold scores and compare them in a horizontal bar chart.
LR_score, LGB_score, XGB_score, RF_score, MLP_score = (
    np.mean(fold_scores)
    for fold_scores in (maeLRarr, maeLGBarr, maeXGBarr, maeRFarr, maeMLParr)
)

model_names = ['Linear Regression' ,'XGBRegressor','LGBMRegressor', 'Random Forrest', 'Multi Layer Perceptron']
mean_scores = [LR_score, XGB_score, LGB_score, RF_score, MLP_score]
model_perf = pd.DataFrame({'Model': model_names, 'mae-score': mean_scores})

plt.figure(figsize=(8, 8))
ax = sns.barplot(y='Model', x='mae-score', data=model_perf)
# Annotate every bar with its value; the trailing ';' hides the return value.
ax.bar_label(ax.containers[0]);

# **Predict Submission & Create Pickle File**

In [None]:
# Predict the test set with every model left over from the last CV fold and
# write one submission file per model. Predictions are rounded to whole ages.
test_ids = dftest['id']  # named so the builtin id() is not shadowed

final1 = np.round(lr.predict(Test))
final2 = np.round(xgb.predict(Test1))
final3 = np.round(lgb.predict(Test1))
final4 = np.round(mlp.predict(Test1))
final5 = np.round(rf.predict(Test1))

# The linear model and the MLP return column vectors; flatten them to 1-D.
# reshape(-1) infers the length, so this works for any number of test rows.
print(final1.shape)
final1 = final1.reshape(-1)
print(final1.shape)

print(final4.shape)
final4 = final4.reshape(-1)
print(final4.shape)

output1 = pd.DataFrame({'id': test_ids, 'Age': final1.astype(int)})
output2 = pd.DataFrame({'id': test_ids, 'Age': final2.astype(int)})
output3 = pd.DataFrame({'id': test_ids, 'Age': final3.astype(int)})
output4 = pd.DataFrame({'id': test_ids, 'Age': final4.astype(int)})
output5 = pd.DataFrame({'id': test_ids, 'Age': final5.astype(int)})

output1.to_csv('LinearRegression_submission.csv', index=False)
output2.to_csv('XGB_submission.csv', index=False)
output3.to_csv('LGB_submission.csv', index=False)
output4.to_csv('MLP_submission.csv', index=False)
output5.to_csv('RF_submission.csv', index=False)

In [None]:
# Serialise every fitted model to its own pickle file in the working directory.
# NOTE(review): whether a Keras model can be pickled depends on the installed
# Keras version; model.save() is the documented way to persist it — confirm.
pickle_targets = (
    ("LinearRegression.pkl", lr),
    ("XGBRegression.pkl", xgb),
    ("LGBMRegression.pkl", lgb),
    ("Random Forrest.pkl", rf),
    ("Multi Layer Perceptron.pkl", mlp),
)
for pickle_name, fitted_model in pickle_targets:
    with open(pickle_name, 'wb') as file:
        pickle.dump(fitted_model, file)