In [None]:
# Import Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew
%matplotlib inline

from sklearn.preprocessing import  StandardScaler
from sklearn.model_selection import  train_test_split, cross_val_score
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import r2_score, mean_squared_error

from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import  RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import  GradientBoostingRegressor
from sklearn.linear_model import  Ridge
from sklearn.svm import SVR
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
# Read the dataset
# 'abalone.csv' is resolved relative to the notebook's working directory.
data = pd.read_csv('abalone.csv')

From the problem statement and feature description, let's first compute the target variable of the problem, 'Age', and add it to the dataset: Age = Rings + 1.5

In [None]:
 # Derive the regression target 'age' as Rings + 1.5, then remove 'Rings'
 # so it no longer appears as a column of `data`.
 data['age'] = 1.5 + data['Rings']
 data.drop(columns='Rings', inplace=True)

Univariate analysis

Understanding feature wise statistics using various inbuilt tools

In [None]:
# Report the frame's (rows, columns) shape; the column count includes the 'age' column.
print('This dataset has {} observations with {} features.'.format(*data.shape))

In [None]:
# Display the column labels of the frame.
data.columns

In [None]:
# Print per-column dtype and non-null counts.
data.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()

In [None]:
# Count missing values per column.
data.isnull().sum()

No Missing Values

In [None]:
# Histogram of each numeric column, 30 bins each, arranged on a 2x4 grid of subplots.
data.hist(figsize=(20,10), grid=False, layout=(2, 4), bins = 30)

In [None]:
# Split the column labels by dtype: numeric columns vs. object (string) columns.
num_col = data.select_dtypes(include=[np.number]).columns
# `np.object` was only an alias of the builtin `object`; the alias was deprecated
# in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
cat_col = data.select_dtypes(include=[object]).columns

In [None]:
# Display the numeric column labels.
num_col

In [None]:
# Display the object-dtype (categorical) column labels.
cat_col

In [None]:
# Sample skewness of every numeric column (NaNs omitted), tabulated and shown
# from the largest skewness value to the smallest.
skew_values = skew(data[num_col], nan_policy='omit')
dummy = pd.DataFrame({
    'Features': list(num_col),
    'Skewness degree': list(skew_values),
})
dummy.sort_values(by='Skewness degree', ascending=False)

For normally distributed data, the skewness should be about 0. For unimodal continuous distributions, a skewness value > 0 means that there is more weight in the right tail of the distribution. 

In [None]:
# Number of rows per 'Sex' category.
sns.countplot(x = 'Sex', data = data)

In [None]:
# Age distribution per 'Sex' category: a swarm plot and a violin plot are both
# issued against the same 20x7 figure.
plt.figure(figsize = (20,7))
sns.swarmplot(x = 'Sex', y = 'age', data = data, hue = 'Sex')
sns.violinplot(x = 'Sex', y = 'age', data = data)

Male : age majority lies in between 7.5 years to 19 years

Female: age majority lies in between 8 years to 19 years

Immature: age majority lies in between 6 years to < 10 years

In [None]:
# Per-'Sex' mean of each listed measurement and of age, ordered by ascending mean age.
measure_cols = [
    'Length', 'Diameter', 'Height',
    'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight',
    'age',
]
data.groupby('Sex')[measure_cols].mean().sort_values('age')

# Bivariate analysis


Bivariate analysis is a vital part of the data analysis process because it gives a clear picture of how each feature behaves in the presence of the other features.
It also helps us identify significant features, detect multicollinearity and inter-dependency between features, and thus provides insight into hidden noise patterns in the data.

In [None]:
# Pairwise scatter plots (with per-column distributions on the diagonal) for the numeric columns.
sns.pairplot(data[num_col])

Length is linearly correlated with diameter, while it has a non-linear relationship with height, whole weight, shucked weight, viscera weight and shell weight.

In [None]:
# Annotated heatmap of the pairwise correlation matrix of the numeric columns.
plt.figure(figsize=(20,7))
sns.heatmap(data[num_col].corr(), annot=True)

# Outlier Handling

In [None]:
# One-hot encode the non-numeric columns; `data` is rebound to the encoded frame.
data = pd.get_dummies(data)
# Independent copy of the encoded frame, taken before the following cells drop
# rows from `data` in place.
dummy_data = data.copy()

In [None]:
# Box plot per column, with x tick labels rotated 90 degrees.
data.boxplot( rot = 90, figsize=(20,5))

In [None]:
# Scatter of viscera weight against age for the outlier-handling step.
var = 'Viscera weight'
plt.scatter(x = data[var], y = data['age'],)
# Label both axes so the figure can be read on its own.
plt.xlabel(var)
plt.ylabel('age')
plt.grid(True)

In [None]:
# Outlier removal on 'Viscera weight' vs. age: drop rows that are heavy (> 0.5)
# with age < 20, and rows that are light (< 0.5) with age > 25.
outlier_mask = (
    ((data['Viscera weight'] > 0.5) & (data['age'] < 20))
    | ((data['Viscera weight'] < 0.5) & (data['age'] > 25))
)
data.drop(index=data.loc[outlier_mask].index, inplace=True)

In [None]:
# Scatter of shell weight against age for the outlier-handling step.
var = 'Shell weight'
plt.scatter(x = data[var], y = data['age'],)
# Label both axes so the figure can be read on its own.
plt.xlabel(var)
plt.ylabel('age')
plt.grid(True)

In [None]:
# Outlier removal on 'Shell weight' vs. age: drop rows with shell weight > 0.6
# and age < 25, and rows with shell weight < 0.8 and age > 25.
outlier_mask = (
    ((data['Shell weight'] > 0.6) & (data['age'] < 25))
    | ((data['Shell weight'] < 0.8) & (data['age'] > 25))
)
data.drop(index=data.loc[outlier_mask].index, inplace=True)

In [None]:
# Scatter of shucked weight against age for the outlier-handling step.
var = 'Shucked weight'
plt.scatter(x = data[var], y = data['age'],)
# Label both axes so the figure can be read on its own.
plt.xlabel(var)
plt.ylabel('age')
plt.grid(True)

In [None]:
# Outlier removal on 'Shucked weight' vs. age: drop rows with shucked weight >= 1
# and age < 20, and rows with shucked weight < 1 and age > 20.
outlier_mask = (
    ((data['Shucked weight'] >= 1) & (data['age'] < 20))
    | ((data['Shucked weight'] < 1) & (data['age'] > 20))
)
data.drop(index=data.loc[outlier_mask].index, inplace=True)

In [None]:
# Scatter of whole weight against age for the outlier-handling step.
var = 'Whole weight'
plt.scatter(x = data[var], y = data['age'],)
# Label both axes so the figure can be read on its own.
plt.xlabel(var)
plt.ylabel('age')
plt.grid(True)

In [None]:
# Outlier removal on 'Whole weight' vs. age: drop rows with whole weight >= 2.5
# and age < 25, and rows with whole weight < 2.5 and age > 25.
outlier_mask = (
    ((data['Whole weight'] >= 2.5) & (data['age'] < 25))
    | ((data['Whole weight'] < 2.5) & (data['age'] > 25))
)
data.drop(index=data.loc[outlier_mask].index, inplace=True)

In [None]:
# Scatter of diameter against age for the outlier-handling step.
var = 'Diameter'
plt.scatter(x = data[var], y = data['age'],)
# Label both axes so the figure can be read on its own.
plt.xlabel(var)
plt.ylabel('age')
plt.grid(True)

In [None]:
# Outlier removal on 'Diameter' vs. age: drop rows with diameter < 0.1 and age < 5,
# rows with diameter < 0.6 and age > 25, and rows with diameter >= 0.6 and age < 25.
outlier_mask = (
    ((data['Diameter'] < 0.1) & (data['age'] < 5))
    | ((data['Diameter'] < 0.6) & (data['age'] > 25))
    | ((data['Diameter'] >= 0.6) & (data['age'] < 25))
)
data.drop(index=data.loc[outlier_mask].index, inplace=True)

In [None]:
# Scatter of height against age for the outlier-handling step.
var = 'Height'
plt.scatter(x = data[var], y = data['age'],)
# Label both axes so the figure can be read on its own.
plt.xlabel(var)
plt.ylabel('age')
plt.grid(True)

In [None]:
# Outlier removal on 'Height' vs. age: drop rows with height > 0.4 and age < 15,
# and rows with height < 0.4 and age > 25.
outlier_mask = (
    ((data['Height'] > 0.4) & (data['age'] < 15))
    | ((data['Height'] < 0.4) & (data['age'] > 25))
)
data.drop(index=data.loc[outlier_mask].index, inplace=True)

In [None]:
# Scatter of length against age for the outlier-handling step.
var = 'Length'
plt.scatter(x = data[var], y = data['age'],)
# Label both axes so the figure can be read on its own.
plt.xlabel(var)
plt.ylabel('age')
plt.grid(True)

In [None]:
# Outlier removal on 'Length' vs. age: drop rows with length < 0.1 and age < 5,
# rows with length < 0.8 and age > 25, and rows with length >= 0.8 and age < 25.
outlier_mask = (
    ((data['Length'] < 0.1) & (data['age'] < 5))
    | ((data['Length'] < 0.8) & (data['age'] > 25))
    | ((data['Length'] >= 0.8) & (data['age'] < 25))
)
data.drop(index=data.loc[outlier_mask].index, inplace=True)

## Preprocessing

In [None]:
# Separate the target ('age') from the predictor columns.
y = data['age']
X = data.drop(columns='age')

In [None]:
# Fit a StandardScaler on X and display the standardized values.
# The transformed array is only shown as the cell output; it is not assigned,
# so X itself is left unchanged.
standardScale = StandardScaler().fit(X)
standardScale.transform(X)

In [None]:
# Preprocessing pipeline: split -> standardize -> univariate feature selection.
#
# Fixes relative to the previous version of this cell:
#   * the output of `fit_transform` was discarded, so scaling was never applied;
#   * `SelectKBest()` defaults to `f_classif`, a classification score, whereas
#     'age' is a continuous target, so `f_regression` is used instead;
#   * scaler and selector were fitted on all rows before the split, letting
#     test rows influence the preprocessing; they are now fitted on the
#     training rows only;
#   * the split had no `random_state`, so results changed on every run.
from sklearn.feature_selection import f_regression

# Hold out the test set first.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=25
)

# Standardize using statistics learned from the training rows only.
standardScale = StandardScaler()
X_train_scaled = standardScale.fit_transform(X_train_raw)
X_test_scaled = standardScale.transform(X_test_raw)

# Keep the k best features by univariate F-test against the continuous target.
# k is left at the library default (10).
# NOTE(review): if X has 10 or fewer columns, this keeps every feature — confirm
# the intended k.
selectkBest = SelectKBest(score_func=f_regression)
X_train = selectkBest.fit_transform(X_train_scaled, y_train)
X_test = selectkBest.transform(X_test_scaled)

# Full matrix pushed through the train-fitted pipeline; kept under the original
# name `X_new` for any cell that still refers to it.
X_new = selectkBest.transform(standardScale.transform(X))

## Modelling

In [None]:
# Seed NumPy's global RNG so estimators created without `random_state` are repeatable.
np.random.seed(25)

def rmse_cv(model, X_train, y):
    """Return the per-fold RMSE of `model` under 5-fold cross-validation.

    Parameters
    ----------
    model : estimator passed to `cross_val_score`.
    X_train : feature matrix used for cross-validation.
    y : target values aligned with `X_train`.

    Returns
    -------
    Array with one RMSE value per fold.
    """
    # The scorer returns *negated* MSE; flip the sign and take the square root
    # to obtain RMSE (the previous version returned MSE * 100).
    neg_mse = cross_val_score(model, X_train, y, scoring='neg_mean_squared_error', cv=5)
    return np.sqrt(-neg_mse)

models = [LinearRegression(),
          Ridge(),
          SVR(),
          RandomForestRegressor(),
          GradientBoostingRegressor(),
          KNeighborsRegressor(n_neighbors = 4),]

# One label per model, in the same order as `models`. The previous list had a
# stray 'GNB' entry, which shifted the labels of RF, GB and KNN by one.
names = ['LR', 'Ridge', 'svm', 'RF', 'GB', 'KNN']
assert len(models) == len(names), "every model needs exactly one label"

# Report mean and standard deviation of the fold RMSEs for each model.
for model, name in zip(models, names):
    score = rmse_cv(model, X_train, y_train)
    print("{}    : {:.6f}, {:.6f}".format(name, score.mean(), score.std()))

In [None]:
# Baseline: Ridge is chosen as the simplest model given the small number of
# variables; report RMSE and R2 on the held-out test split.

ridge = Ridge(alpha = 0.1,random_state=10)
ridge.fit(X_train,y_train)
pred_ridge = ridge.predict(X_test)
print ("\nModel Report")
# mean_squared_error returns MSE; take the square root so the printed value
# matches its "RMSE" label.
print( "RMSE : %.4g" % np.sqrt(mean_squared_error(y_test, pred_ridge)))
print( "R2 Score : %f" % r2_score(y_test, pred_ridge))

In [None]:
# Hyperparameter tuning with GridSearchCV: exhaustive search over Ridge's
# `alpha` and `solver`, scored by 5-fold cross-validated R2 on the training split.
from sklearn.model_selection import GridSearchCV

param = {
    'alpha': [0.01, 0.1, 1, 10, 100],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
}
glrm0 = GridSearchCV(
    estimator=Ridge(random_state=10),
    param_grid=param,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
glrm0.fit(X_train, y_train)
# Show the winning parameter combination and its mean cross-validated R2.
glrm0.best_params_, glrm0.best_score_

In [None]:
# Refit Ridge with the hyperparameters selected by the grid search (`glrm0`)
# and evaluate on the held-out test split. The previous version hard-coded
# alpha=0.1, which made this model identical to the baseline.
ridge2 = Ridge(random_state=10, **glrm0.best_params_)
ridge2.fit(X_train,y_train)
pred_ridge = ridge2.predict(X_test)
print ("\nModel Report")
# mean_squared_error returns MSE; take the square root so the printed value
# matches its "RMSE" label.
print( "RMSE : %.4g" % np.sqrt(mean_squared_error(y_test, pred_ridge)))
print( "R2 Score : %f" % r2_score(y_test, pred_ridge))

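The RMSE score has improved slightly while the R2 score has decreased, suggesting that the base model was overfitted. Note that both reports are computed on a single random train/test split, so small differences between them may be within noise. Using the above process, multiple options can be tried to arrive at a more robust model.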