#Data Description
Sex: Sex (M: Male, F: Female, I: Infant)

Length: Longest Shell measurement (millimetres - mm)

Diameter: Diameter - perpendicular to length (mm)

Height: Height - with meat in shell (mm)

Whole weight: Weight of whole abalone (grams)

Shucked weight: Weight of meat (grams)

Viscera weight: Gut weight after bleeding (grams)

Shell weight: Shell weight - after being dried (grams)

Rings: Rings - value + 1.5 gives age in years (eg. 4 = 5.5 years)

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [5]:
df = pd.read_csv("https://raw.githubusercontent.com/dphi-official/Datasets/master/abalone_data/training_set_label.csv" )
test_df = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/abalone_data/testing_set_label.csv')

In [6]:
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,F,0.525,0.4,0.135,0.714,0.318,0.138,0.208,10
1,F,0.445,0.325,0.125,0.455,0.1785,0.1125,0.14,9
2,M,0.71,0.54,0.165,1.959,0.7665,0.261,0.78,18
3,F,0.68,0.58,0.2,1.787,0.585,0.453,0.6,19
4,M,0.605,0.47,0.16,1.1735,0.4975,0.2405,0.345,12


In [7]:
df.tail()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
3127,M,0.415,0.315,0.12,0.4015,0.199,0.087,0.097,8
3128,I,0.325,0.24,0.07,0.152,0.0565,0.0305,0.054,8
3129,M,0.565,0.455,0.155,0.9355,0.421,0.183,0.26,11
3130,M,0.61,0.485,0.145,1.3305,0.783,0.2255,0.2865,9
3131,F,0.655,0.505,0.19,1.3485,0.5935,0.2745,0.425,12


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3132 entries, 0 to 3131
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             3132 non-null   object 
 1   Length          3132 non-null   float64
 2   Diameter        3132 non-null   float64
 3   Height          3132 non-null   float64
 4   Whole weight    3132 non-null   float64
 5   Shucked weight  3132 non-null   float64
 6   Viscera weight  3132 non-null   float64
 7   Shell weight    3132 non-null   float64
 8   Rings           3132 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 220.3+ KB


No missing values

In [9]:
df.describe()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
count,3132.0,3132.0,3132.0,3132.0,3132.0,3132.0,3132.0,3132.0
mean,0.523707,0.407952,0.139277,0.82867,0.359229,0.180731,0.239159,9.947957
std,0.119891,0.099336,0.038991,0.49063,0.221946,0.109924,0.13904,3.230252
min,0.11,0.09,0.0,0.008,0.0025,0.0005,0.003,2.0
25%,0.45,0.35,0.115,0.4415,0.185375,0.093375,0.13,8.0
50%,0.545,0.425,0.14,0.7995,0.3355,0.17,0.235,10.0
75%,0.615,0.48,0.165,1.153,0.4995,0.2525,0.33,11.0
max,0.815,0.65,0.515,2.8255,1.488,0.76,1.005,29.0


### Visualize Data

In [10]:
df.corr()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
Length,1.0,0.986523,0.890674,0.924719,0.898828,0.901679,0.900571,0.550276
Diameter,0.986523,1.0,0.897447,0.925138,0.894484,0.898299,0.908248,0.567615
Height,0.890674,0.897447,1.0,0.881028,0.834576,0.859091,0.882061,0.589325
Whole weight,0.924719,0.925138,0.881028,1.0,0.971052,0.965639,0.9571,0.534093
Shucked weight,0.898828,0.894484,0.834576,0.971052,1.0,0.933097,0.887878,0.415403
Viscera weight,0.901679,0.898299,0.859091,0.965639,0.933097,1.0,0.908051,0.496904
Shell weight,0.900571,0.908248,0.882061,0.9571,0.887878,0.908051,1.0,0.622351
Rings,0.550276,0.567615,0.589325,0.534093,0.415403,0.496904,0.622351,1.0


### Data Preparation

Label Encode

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
enc = LabelEncoder()
df['Sex'] = enc.fit_transform(df['Sex'])
test_df['Sex'] = enc.transform(test_df['Sex'])

Train / Validation Split

In [13]:
X = df.drop(['Rings'], axis=1).values
y = df['Rings'].values

In [14]:
X_test_pred = test_df.values

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

Scale data

In [17]:
from sklearn.preprocessing import StandardScaler

In [18]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_test_pred = scaler.transform(X_test_pred)

### Model train

In [19]:
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from xgboost import XGBRegressor, plot_tree, plot_importance

In [29]:
dt = LinearRegression()
svc = SVR()
ada_dt = AdaBoostRegressor(base_estimator=dt)
ada_svc = AdaBoostRegressor(base_estimator=svc, n_estimators=10, random_state=0, learning_rate=0.3)
gbm = GradientBoostingRegressor()
xgb = XGBRegressor(objective='reg:squarederror')
rf = RandomForestRegressor(max_leaf_nodes=20, oob_score=True, n_jobs=-1)

In [30]:
dt.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [31]:
svc.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [32]:
ada_dt.fit(X_train, y_train)

AdaBoostRegressor(base_estimator=LinearRegression(copy_X=True,
                                                  fit_intercept=True,
                                                  n_jobs=None,
                                                  normalize=False),
                  learning_rate=1.0, loss='linear', n_estimators=50,
                  random_state=None)

In [33]:
ada_svc.fit(X_train, y_train)

AdaBoostRegressor(base_estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                                     epsilon=0.1, gamma='scale', kernel='rbf',
                                     max_iter=-1, shrinking=True, tol=0.001,
                                     verbose=False),
                  learning_rate=0.3, loss='linear', n_estimators=10,
                  random_state=0)

In [34]:
gbm.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [35]:
xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [36]:
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=20,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=True,
                      random_state=None, verbose=0, warm_start=False)

### Model Evaluation

In [42]:
from sklearn.metrics import mean_squared_error

In [38]:
y_pred_dt = dt.predict(X_test)
y_pred_svc = svc.predict(X_test)
y_pred_ada_dt = ada_dt.predict(X_test)
y_pred_ada_svc = ada_svc.predict(X_test)
y_pred_gbm = gbm.predict(X_test)
y_pred_xgb = xgb.predict(X_test)
y_pred_rf = rf.predict(X_test)

In [50]:
def print_rmse(y_test, y_pred, title):
    print(f'{title} rmse: {mean_squared_error(y_test, y_pred, squared=False):.3f}')

In [51]:
print_rmse(y_test, y_pred_dt, 'Linear Regression')
print_rmse(y_test, y_pred_svc, 'SVC')
print_rmse(y_test, y_pred_ada_dt, 'AdaBoost LR')
print_rmse(y_test, y_pred_ada_svc, 'AdaBoost SVR')
print_rmse(y_test, y_pred_gbm, 'GBM')
print_rmse(y_test, y_pred_xgb, 'XGBoost')
print_rmse(y_test, y_pred_rf, 'Random Forest')

Linear Regression rmse: 2.313
SVC rmse: 2.279
AdaBoost LR rmse: 2.330
AdaBoost SVR rmse: 2.190
GBM rmse: 2.261
XGBoost rmse: 2.232
Random Forest rmse: 2.287


### Make Predictions

In [48]:
pred = ada_svc.predict(X_test_pred)

In [49]:
df_pred = pd.DataFrame({
    'prediction': pred
})

from google.colab import files
df_pred.to_csv('abalone_pred.csv', index=False)
files.download('abalone_pred.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>