In [1]:
# Download the UCI abalone dataset into the working directory.
# `-N` turns on wget's timestamping, so the file is only re-fetched when the
# remote copy is newer than an existing local one.
! wget -N http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data

--2018-05-25 05:54:57--  http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... failed: Temporary failure in name resolution.
wget: unable to resolve host address ‘archive.ics.uci.edu’


In [72]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

In [3]:
# Load the raw measurements. Column names are passed explicitly, so the first
# line of the file is read as data rather than as a header.
data = pd.read_csv(
    'abalone.data',
    names=[
        'Sex',
        'Length',
        'Diameter',
        'Height',
        'Whole weight',
        'Shucked weight',
        'Viscera weight',
        'Shell weight',
        'Rings',
    ],
)
data.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


Now let's convert the categorical feature 'Sex' into numeric indicator columns via one-hot encoding.

In [5]:
# One-hot encode the only categorical column ('Sex' -> Sex_F / Sex_I / Sex_M).
# dtype=int keeps the indicators as 0/1 integers on every pandas version:
# pandas >= 2.0 would otherwise produce bool columns, which describe() leaves
# out of its numeric summary.
data = pd.get_dummies(data, dtype=int)
data.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_F,Sex_I,Sex_M
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,0,0,1
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,0,0,1
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,1,0,0
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,0,0,1
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,0,1,0


## Analysis

In [6]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) as a
# quick sanity check on value ranges.
data.describe()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_F,Sex_I,Sex_M
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831,9.933684,0.312904,0.321283,0.365813
std,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139203,3.224169,0.463731,0.467025,0.481715
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0,0.0,0.0,0.0
25%,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0,0.0,0.0,0.0
50%,0.545,0.425,0.14,0.7995,0.336,0.171,0.234,9.0,0.0,0.0,0.0
75%,0.615,0.48,0.165,1.153,0.502,0.253,0.329,11.0,1.0,1.0,1.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0,1.0,1.0,1.0


In [77]:
# Pairwise correlation matrix over all columns, including the target 'Rings'
# and the one-hot 'Sex_*' indicators.
data.corr()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_F,Sex_I,Sex_M
Length,1.0,0.986812,0.827554,0.925261,0.897914,0.903018,0.897706,0.55672,0.309666,-0.551465,0.236543
Diameter,0.986812,1.0,0.833684,0.925452,0.893162,0.899724,0.90533,0.57466,0.318626,-0.564315,0.240376
Height,0.827554,0.833684,1.0,0.819221,0.774972,0.798319,0.817338,0.557467,0.298421,-0.518552,0.215459
Whole weight,0.925261,0.925452,0.819221,1.0,0.969405,0.966375,0.955355,0.54039,0.299741,-0.557592,0.252038
Shucked weight,0.897914,0.893162,0.774972,0.969405,1.0,0.931961,0.882617,0.420884,0.263991,-0.521842,0.251793
Viscera weight,0.903018,0.899724,0.798319,0.966375,0.931961,1.0,0.907656,0.503819,0.308444,-0.556081,0.242194
Shell weight,0.897706,0.90533,0.817338,0.955355,0.882617,0.907656,1.0,0.627574,0.306319,-0.546953,0.235391
Rings,0.55672,0.57466,0.557467,0.54039,0.420884,0.503819,0.627574,1.0,0.250279,-0.436063,0.181831
Sex_F,0.309666,0.318626,0.298421,0.299741,0.263991,0.308444,0.306319,0.250279,1.0,-0.464298,-0.512528
Sex_I,-0.551465,-0.564315,-0.518552,-0.557592,-0.521842,-0.556081,-0.546953,-0.436063,-0.464298,1.0,-0.522541


In [7]:
# Imported in this cell because it runs before the notebook's main
# scikit-learn import cell; without it a fresh "Restart & Run All" fails here
# with NameError.
from sklearn.preprocessing import StandardScaler

# Features: every column except the target, standardised column by column.
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split, so test-set statistics leak into the preprocessing. Consider fitting
# it on the training rows only.
X = data.drop(columns=['Rings'])
X = StandardScaler().fit_transform(X)
# Target: the ring count.
y = data['Rings']

In [8]:
# Imported in this cell because it runs before the notebook's main
# scikit-learn import cell; without it a fresh "Restart & Run All" fails here
# with NameError.
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=17)

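## Classification

Each distinct `Rings` value is treated as a separate class label here, so the scores reported in this section are mean accuracy.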

In [71]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [79]:
def score(model):
    """Fit ``model`` on the training split and print its train and test scores.

    The data comes from the notebook-level names ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``.

    Parameters
    ----------
    model : object
        Any estimator exposing ``fit(X, y)`` and ``score(X, y)``. It is fitted
        in place.

    Returns
    -------
    None
        The two scores are printed, not returned.
    """
    model.fit(X_train, y_train)

    # Evaluate and report the training split first, then the held-out split.
    train_result = model.score(X_train, y_train)
    print('Train score: {}'.format(train_result))

    test_result = model.score(X_test, y_test)
    print('Test score: {}'.format(test_result))

## K-Neighbors

In [80]:
# k-nearest-neighbours classifier voting over the 29 closest training samples.
score(KNeighborsClassifier(n_neighbors=29))

Train score: 0.3273766976411723
Test score: 0.24945612762871647


## SVM + linear kernel

In [81]:
# Support-vector classifier with a linear kernel; all other settings are left
# at the library defaults.
score(SVC(kernel='linear'))

Train score: 0.27662616154395997
Test score: 0.25598259608411894


## Decision tree

In [82]:
# Single decision tree limited to depth 4.
# random_state (same seed as the train/test split) fixes how ties between
# equally good splits are broken, so the printed scores are reproducible.
score(DecisionTreeClassifier(max_depth=4, random_state=17))

Train score: 0.2948534667619728
Test score: 0.2574329224075417


## Random forest

In [83]:
# Small random forest: 10 trees of depth <= 4, each split choosing among
# 2 randomly drawn features.
# random_state (same seed as the train/test split) pins the bootstrap and
# feature sampling; without it every run prints different scores.
score(RandomForestClassifier(max_depth=4, n_estimators=10, max_features=2, random_state=17))

Train score: 0.3012866333095068
Test score: 0.25670775924583034


## Multi-layer perceptron

In [84]:
# Multi-layer perceptron with default architecture and L2 penalty alpha=1.
# random_state (same seed as the train/test split) fixes the weight
# initialisation and batch shuffling; without it every run prints different
# scores.
score(MLPClassifier(alpha=1, random_state=17))

Train score: 0.29592566118656183
Test score: 0.2625090645395214


## AdaBoost

In [85]:
# AdaBoost with the library's default base estimator and settings.
# random_state (same seed as the train/test split) seeds the base estimators
# so the printed scores are reproducible.
score(AdaBoostClassifier(random_state=17))

Train score: 0.21300929235167976
Test score: 0.2189992748368383


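## Regression

Here `Rings` is treated as a continuous target. The scores reported in this section are therefore $R^2$ values, not accuracies, and are not directly comparable with the classification scores above.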

In [89]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

## Linear regression

In [86]:
# Ordinary least-squares baseline with default settings.
score(LinearRegression())

Train score: 0.5346804750082439
Test score: 0.5423371485898663


## SVM + RBF kernel

In [87]:
# Support-vector regressor with an RBF kernel; C and gamma are set by hand here.
score(SVR(kernel='rbf', C=1e3, gamma=0.1))

Train score: 0.6513960441280235
Test score: 0.4730994087829582


## SVM + polynomial kernel

In [88]:
# Support-vector regressor with a degree-2 polynomial kernel; C is set by hand here.
score(SVR(kernel='poly', C=1e3, degree=2))

Train score: 0.3880144594339444
Test score: 0.2779446024994301
