In [2]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv('abalone.csv')
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [4]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
X.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055


In [5]:
X = pd.get_dummies(X, prefix_sep='_', drop_first=True)
X

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Sex_I,Sex_M
0,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,0,1
1,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,0,1
2,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,0,0
3,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,0,1
4,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,1,0
...,...,...,...,...,...,...,...,...,...
4172,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,0,0
4173,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,0,1
4174,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,0,1
4175,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,0,0


# Hold out Method

### Train size: 75%  Test size: 25%

In [37]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

### Decision Tree

In [38]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=42)
dtc = dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

In [39]:
from sklearn.metrics import accuracy_score
hld75Tree = accuracy_score(y_test, y_pred)
hld75Tree

0.20191387559808613

### KNN

In [40]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=5)
neigh = neigh.fit(X_train, y_train)
y_pred = neigh.predict(X_test)

In [41]:
hld75Knn = accuracy_score(y_test, y_pred)
hld75Tree

0.20191387559808613

### Naive Bayes

In [42]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb = gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

In [43]:
hld75GNB = accuracy_score(y_test, y_pred)
hld75GNB

0.10334928229665072

## Random Subsampling

In [44]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

### Decision Tree

In [45]:
dtc = DecisionTreeClassifier(random_state=42)
dtc = dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

In [46]:
rnd75Tree = accuracy_score(y_test, y_pred)
rnd75Tree

0.19425837320574163

### KNN 

In [47]:
neigh = KNeighborsClassifier(n_neighbors=5)
neigh = neigh.fit(X_train, y_train)
y_pred = neigh.predict(X_test)

In [48]:
rnd75Knn = accuracy_score(y_test, y_pred)
rnd75Knn

0.21722488038277513

### Naive Bayes

In [49]:
gnb = GaussianNB()
gnb = gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

In [50]:
rnd75Gnb = accuracy_score(y_test, y_pred)
rnd75Gnb

0.11674641148325358

# Cross-Validation.

### Decision Tree

In [51]:
from sklearn.model_selection import cross_val_predict
cvTree = DecisionTreeClassifier()
y_pred = cross_val_predict(cvTree, X, y , cv=4)
cv75Tree = accuracy_score(y, y_pred)
cv75Tree

0.19367967440746947

### KNN

In [52]:
cvKnn = KNeighborsClassifier()
y_pred = cross_val_predict(cvKnn, X, y , cv=4)
cv75Knn = accuracy_score(y, y_pred)
cv75Knn

0.2291118027292315

### Naive Bayes

In [53]:
gnb = GaussianNB()
y_pred = cross_val_predict(gnb, X, y , cv=4)
cv75Gnb = accuracy_score(y, y_pred)
cv75Gnb

0.11491501077328226