In [1]:
import warnings
# Suppress every warning category for the rest of the session. This keeps the
# cell outputs short, but it also hides deprecation and convergence warnings
# raised by the libraries used in later cells.
warnings.filterwarnings("ignore")

In [138]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import LearningRateScheduler
import tensorflow.keras.backend as K

In [139]:
# Read the training and test CSV files from the working directory.
original_df, main_test_df = (
    pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')
)

In [140]:
# Keep only the rows that sit at or below every per-column cap; a row above
# any single cap is treated as an outlier and dropped.
outlier_caps = {
    'Height': 0.3,
    'Whole weight': 2.25,
    'Whole weight.1': 1,
    'Whole weight.2': 0.5,
    'Shell weight': 0.6,
}
within_caps = np.logical_and.reduce(
    [original_df[column_name] <= cap for column_name, cap in outlier_caps.items()]
)
no_outlier_df = original_df[within_caps]

In [141]:
def generate_features(data):
    """Return a copy of ``data`` extended with engineered features.

    The input frame is left untouched. The function reads the columns
    ``Sex``, ``Length``, ``Diameter``, ``Height``, ``Whole weight``,
    ``Whole weight.1``, ``Whole weight.2`` and ``Shell weight`` and appends,
    in this order: capped component weights, area / density / BMI proxies,
    measurement and weight ratios, one-hot ``Sex`` indicators and a
    ``water_loss`` estimate.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the raw columns listed above.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the additional feature columns.
    """
    df = data.copy()
    eps = 1e-5  # added to every denominator so zero-valued measurements stay finite
    whole = df['Whole weight']

    # Cap each component weight at the whole weight: a part heavier than the
    # whole animal is treated as a recording error.
    # NOTE(review): 'Whole weight.1' is stored as viscera and 'Whole weight.2'
    # as shucked weight; the headers alone do not say which is which, so
    # confirm this mapping against the dataset description.
    df['Shell_weight'] = np.where(df['Shell weight'] > whole, whole, df['Shell weight'])
    df['Viscera_weight'] = np.where(df['Whole weight.1'] > whole, whole, df['Whole weight.1'])
    df['Shucked_weight'] = np.where(df['Whole weight.2'] > whole, whole, df['Whole weight.2'])
    shell = df['Shell_weight']
    viscera = df['Viscera_weight']
    shucked = df['Shucked_weight']

    # Surface-area proxies built from the three linear measurements.
    df['surface_area'] = df['Length'] * df['Diameter']
    df['total_area'] = 2 * (
        df['surface_area'] + df['Height'] * df['Diameter'] + df['Length'] * df['Height']
    )

    # Density and BMI-style proxies.
    df['approx_density'] = whole / (df['surface_area'] * df['Height'] + eps)
    df['bmi'] = whole / (df['Height'] ** 2 + eps)

    # Ratios between measurements and between component weights.
    df['length_dia_ratio'] = df['Length'] / (df['Diameter'] + eps)
    df['length_height_ratio'] = df['Length'] / (df['Height'] + eps)
    df['shell_shuck_ratio'] = shell / (shucked + eps)
    df['shell_viscera_ratio'] = shell / (viscera + eps)

    df['viscera_tot_ratio'] = viscera / (whole + eps)
    df['shell_tot_ratio'] = shell / (whole + eps)
    df['shuck_tot_ratio'] = shucked / (whole + eps)
    df['shell_body_ratio'] = shell / (shell + whole + eps)
    df['flesh_ratio'] = shucked / (whole + shucked + eps)

    df['inv_viscera_tot'] = whole / (viscera + eps)
    df['inv_shell_tot'] = whole / (shell + eps)
    df['inv_shuck_tot'] = whole / (shucked + eps)

    # One-hot encode Sex with a vectorized comparison instead of a per-row lambda.
    for column_name, sex_code in (('Is Male', 'M'), ('Is Female', 'F'), ('Is Infant', 'I')):
        df[column_name] = df['Sex'].eq(sex_code).astype('int64')

    # Weight not accounted for by the three components. Negative residuals are
    # replaced by the smallest component weight found in this same frame, so
    # the replacement value depends on the data passed in.
    residual = whole - shucked - viscera - shell
    component_floor = min(shucked.min(), viscera.min(), shell.min())
    df['water_loss'] = np.where(residual < 0, component_floor, residual)
    return df

In [142]:
# generate_features() copies its argument before adding columns, so the
# filtered training frame and the raw test frame are passed in directly;
# copying them again here would only duplicate the data a second time.
df = generate_features(no_outlier_df)
test_df = generate_features(main_test_df)

In [143]:
# Continuous columns that get min-max scaled; the three one-hot Sex
# indicators are not in this list and stay as 0/1.
numerical_features = [
    'Length', 'Diameter', 'Height',
    'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight',
    'Shell_weight', 'Viscera_weight', 'Shucked_weight',
    'surface_area', 'total_area', 'approx_density', 'bmi',
    'length_dia_ratio', 'length_height_ratio',
    'shell_shuck_ratio', 'shell_viscera_ratio',
    'viscera_tot_ratio', 'shell_tot_ratio', 'shuck_tot_ratio',
    'shell_body_ratio', 'flesh_ratio',
    'inv_viscera_tot', 'inv_shell_tot', 'inv_shuck_tot',
    'water_loss',
]

# NOTE(review): the scaler is fitted on all of `df`, i.e. before the
# train/validation split, so validation rows influence the scaling range.
# Confirm this is acceptable for the reported validation scores.
scaler = MinMaxScaler().fit(df[numerical_features])
df[numerical_features] = scaler.transform(df[numerical_features])
test_df[numerical_features] = scaler.transform(test_df[numerical_features])

In [144]:
# Hold out 15% of the rows for validation, with a fixed seed for a repeatable split.
train_df, valid_df = train_test_split(df, random_state=42, test_size=0.15)

# Model inputs are the one-hot Sex indicators followed by the scaled columns.
target = ['Rings']
features = ['Is Male', 'Is Female', 'Is Infant', *numerical_features]

In [162]:
# Separate model inputs from the target for the training and validation
# splits; only inputs are taken from the test frame.
X_train = train_df[features]
y_train = train_df[target]
X_valid = valid_df[features]
y_valid = valid_df[target]
X_test = test_df[features]

In [163]:
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error built from Keras backend ops.

    Both tensors are clipped from below at ``K.epsilon()`` before
    ``log(x + 1)`` is taken, so non-positive values cannot reach the logarithm.
    Returns the square root of the mean squared difference of the two logs.
    """
    lower_bound = K.epsilon()
    log_pred = K.log(K.clip(y_pred, lower_bound, None) + 1.)
    log_true = K.log(K.clip(y_true, lower_bound, None) + 1.)
    squared_gap = K.square(log_pred - log_true)
    return K.sqrt(K.mean(squared_gap))

def lr_schedule(epoch):
    """Return the learning rate for a given epoch index.

    The rate is 0.0001 while ``epoch`` is 5 or lower and is multiplied by
    0.07 for every later epoch.
    """
    base_lr = 0.0001
    return base_lr * 0.07 if epoch > 5 else base_lr

# Callback that applies lr_schedule during training; it is passed to fit()
# for the first neural network.
lr_scheduler = LearningRateScheduler(lr_schedule)

# SimpleNN Model

In [164]:
# Fully connected regressor: three 64-unit ReLU layers feeding one ReLU output unit.
nn_layers = [
    Dense(units=64, activation='relu', input_dim=len(features)),
    Dense(units=64, activation='relu'),
    Dense(units=64, activation='relu'),
    Dense(units=1, activation='relu'),
]
nn_model = Sequential(nn_layers)

# No optimizer is passed to compile(), so the Keras default is used; the
# learning rate for each epoch comes from the lr_scheduler callback.
nn_model.compile(loss=rmsle)
nn_history = nn_model.fit(
    X_train,
    y_train,
    epochs=8,
    batch_size=8,
    validation_data=(X_valid, y_valid),
    callbacks=[lr_scheduler],
)

Epoch 1/8
[1m9543/9543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 407us/step - loss: 0.3632 - val_loss: 0.1490 - learning_rate: 1.0000e-04
Epoch 2/8
[1m9543/9543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 402us/step - loss: 0.1498 - val_loss: 0.1463 - learning_rate: 1.0000e-04
Epoch 3/8
[1m9543/9543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 402us/step - loss: 0.1459 - val_loss: 0.1430 - learning_rate: 1.0000e-04
Epoch 4/8
[1m9543/9543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 408us/step - loss: 0.1447 - val_loss: 0.1425 - learning_rate: 1.0000e-04
Epoch 5/8
[1m9543/9543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 404us/step - loss: 0.1445 - val_loss: 0.1418 - learning_rate: 1.0000e-04
Epoch 6/8
[1m9543/9543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 405us/step - loss: 0.1440 - val_loss: 0.1418 - learning_rate: 1.0000e-04
Epoch 7/8
[1m9543/9543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 400us/step - los

In [165]:
# Report the validation loss recorded for the last training epoch.
nn_loss_list = nn_history.history['val_loss']
final_nn_val_loss = nn_loss_list[-1]
print('NN Validation Loss:', final_nn_val_loss)

NN Validation Loss: 0.14142972230911255


# CNN model (as built below: Flatten + Dense layers only, no convolutional layers)

In [149]:
# Second dense network: a Flatten input layer, two 64-unit ReLU layers and a
# linear output unit, trained with Adam at a fixed learning rate.
cnn_layers = [
    Flatten(input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='linear'),
]
cnn_model = Sequential(cnn_layers)

cnn_optimizer = Adam(learning_rate=0.0001)
cnn_model.compile(optimizer=cnn_optimizer, loss=rmsle)

In [150]:
# Train for 20 epochs in batches of 32, tracking the loss on the validation split.
cnn_history = cnn_model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_valid, y_valid),
)

Epoch 1/20
[1m2386/2386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 421us/step - loss: 0.8073 - val_loss: 0.1793
Epoch 2/20
[1m2386/2386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 402us/step - loss: 0.1758 - val_loss: 0.1605
Epoch 3/20
[1m2386/2386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 399us/step - loss: 0.1602 - val_loss: 0.1553
Epoch 4/20
[1m2386/2386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 399us/step - loss: 0.1571 - val_loss: 0.1532
Epoch 5/20
[1m2386/2386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 396us/step - loss: 0.1545 - val_loss: 0.1521
Epoch 6/20
[1m2386/2386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 397us/step - loss: 0.1552 - val_loss: 0.1514
Epoch 7/20
[1m2386/2386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 398us/step - loss: 0.1538 - val_loss: 0.1518
Epoch 8/20
[1m2386/2386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 405us/step - loss: 0.1543 - val_loss: 0.1517
Epoch 9/

In [151]:
# Report the validation loss recorded for the last training epoch.
cnn_loss_list = cnn_history.history['val_loss']
final_cnn_val_loss = cnn_loss_list[-1]
print('CNN Validation Loss:', final_cnn_val_loss)

CNN Validation Loss: 0.1488444060087204


# RandomForest

In [152]:
# Fit a 100-tree random forest. The single-column target frame is flattened
# to a 1-D array before it is handed to scikit-learn.
rf_target = y_train.values.ravel()
rf_regressor = RandomForestRegressor(random_state=42, n_estimators=100)
rf_regressor.fit(X_train, rf_target)

In [153]:
# Score the random forest on the validation split with RMSLE.
y_pred = rf_regressor.predict(X_valid)

# The score is stored as rf_rmsle rather than rmsle: `rmsle` is the Keras loss
# function passed to compile() by the neural-network cells, and rebinding that
# name to a float would break those cells whenever they are re-run.
rf_rmsle = mean_squared_log_error(y_valid, y_pred) ** 0.5

print("Root Mean Squared Logarithmic Error:", rf_rmsle)

Root Mean Squared Logarithmic Error: 0.15226448964975275


# XGBoost

In [154]:
# Full (unsplit) feature matrix and target, also as NumPy arrays so the
# K-fold loop can index rows by position.
X = df[features]
y = df[target]
X_np, y_np = X.to_numpy(), y.to_numpy()

In [155]:
# Hyper-parameters forwarded unchanged to xgb.XGBRegressor for every fold.
xgb_params = dict(
    n_estimators=1000,
    max_depth=8,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.6,
    objective='reg:squarederror',
    random_state=42,
    gamma=0.4,
    min_child_weight=10,
)

# 5-fold cross-validation of XGBoost on the full training frame, scored with RMSLE.
k_folds = 5
kf = KFold(n_splits=k_folds, shuffle=True, random_state=32)
xgb_rmsle_scores = []

# Fold-local names are used on purpose. Reusing X_train / y_train / X_test or
# rmsle inside this loop would overwrite the hold-out split, the submission
# inputs and the Keras loss function that other cells rely on.
for fold_train_idx, fold_valid_idx in kf.split(df):
    X_fold_train, X_fold_valid = X_np[fold_train_idx], X_np[fold_valid_idx]
    y_fold_train, y_fold_valid = y_np[fold_train_idx], y_np[fold_valid_idx]

    xgb_model = xgb.XGBRegressor(**xgb_params)
    xgb_model.fit(X_fold_train, y_fold_train)

    fold_pred = xgb_model.predict(X_fold_valid)
    fold_rmsle = mean_squared_log_error(y_fold_valid, fold_pred) ** 0.5
    xgb_rmsle_scores.append(fold_rmsle)

# NOTE(review): after the loop, xgb_model is the estimator fitted in the last
# fold only (4/5 of the rows); any later xgb_model.predict() call uses that model.

In [156]:
# Mean of the per-fold RMSLE scores. The variable keeps its existing name
# even though the values it averages are RMSLE, not RMSE.
average_rmse = np.asarray(xgb_rmsle_scores).mean()
print('XGBoost Validation Loss:', average_rmse)

XGBoost Validation Loss: 0.14945106954925463


# EnsembleModeling

In [166]:
# Select the model input columns from the engineered test frame; every model
# in the ensemble predicts on this matrix.
X_test = test_df[features]

In [167]:
# Empty frame that collects the test ids and one prediction column per model.
sub_df = pd.DataFrame()

In [168]:
# One prediction column per model, keyed by the test-set id.
sub_df['id'] = test_df['id']
# Both Keras models end in a single-unit Dense layer, so predict() returns an
# (n_samples, 1) array. It is flattened explicitly so the column assignment
# does not depend on pandas accepting a 2-D single-column array.
sub_df['NN'] = nn_model.predict(X_test).ravel()
sub_df['CNN'] = cnn_model.predict(X_test).ravel()
sub_df['RF'] = rf_regressor.predict(X_test)
sub_df['XGB'] = xgb_model.predict(X_test)

[1m1888/1888[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 322us/step
[1m1888/1888[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 304us/step


In [169]:
# Show the per-model predictions side by side for a quick sanity check.
sub_df

Unnamed: 0,id,NN,CNN,RF,XGB
0,90615,9.464512,9.045436,10.84,10.168097
1,90616,9.707209,9.738278,9.74,9.574944
2,90617,10.008175,10.265508,9.89,10.138338
3,90618,9.880308,10.084214,10.49,10.712008
4,90619,7.500868,7.556702,7.86,7.645997
...,...,...,...,...,...
60406,151021,6.362882,6.289547,6.67,6.412579
60407,151022,9.063062,9.336911,8.91,9.057267
60408,151023,11.453331,11.766652,13.51,12.689516
60409,151024,12.553601,12.768542,12.73,13.128807


In [172]:
# Equal-weight ensemble: average the four model predictions for every test
# id, then write the submission file.
prediction_sum = sub_df["NN"] + sub_df["CNN"] + sub_df["XGB"] + sub_df["RF"]
submission = pd.DataFrame({'id': test_df['id'], 'Rings': prediction_sum / 4})
submission.to_csv('submission.csv', index=False)

In [173]:
# Show the final id / Rings table that was written to submission.csv.
submission

Unnamed: 0,id,Rings
0,90615,9.879512
1,90616,9.690108
2,90617,10.075505
3,90618,10.291633
4,90619,7.640892
...,...,...
60406,151021,6.433752
60407,151022,9.091810
60408,151023,12.354875
60409,151024,12.795238
