In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

%matplotlib inline

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Introduction
In this notebook, we will demonstrate:
- The basic steps to apply the KNN technique for regression on the abalone dataset
- How to normalise the data
- The result (error) we get on the regression task
- How to make simple predictions

### Import the dataset

This is the file 'abalone.csv'. An **abalone** is a sea snail that looks like:

![Abalone](abalone.png )

The features of this dataset are: 

- Sex (M, F, I)
- Length 
- Diameter
- Height
- Whole weight
- Shucked weight
- Viscera weight
- Shell weight
- Rings

Note that rings is an integer, so each value can be considered a class, or we can also use it for regression purposes. 
It has been reported elsewhere that a classification accuracy of around 16% and an RMSE of around 3.2 rings can be obtained for the prediction of the number of rings. 

Note that sex has the category I, which stands for infant. 

In [2]:
# Column names, in file order, matching the feature list above.
# The CSV is read with header=None, so these names are supplied here.
columns = [
    'sex',
    'length',
    'diameter',
    'height',
    'weight',
    'iweight',
    'vweight',
    'sweight',
    'rings',
]
abalone_data = pd.read_csv('abalone.csv', names=columns, header=None)

# Show the last few rows as a quick sanity check of the load.
abalone_data.tail()

Unnamed: 0,sex,length,diameter,height,weight,iweight,vweight,sweight,rings
4172,F,0.565,0.45,0.165,0.887,0.37,0.239,0.249,11
4173,M,0.59,0.44,0.135,0.966,0.439,0.2145,0.2605,10
4174,M,0.6,0.475,0.205,1.176,0.5255,0.2875,0.308,9
4175,F,0.625,0.485,0.15,1.0945,0.531,0.261,0.296,10
4176,M,0.71,0.555,0.195,1.9485,0.9455,0.3765,0.495,12


In [3]:
# Encode sex numerically: male -> 1, female -> 0, infant -> 0.5.
# The guard makes this cell safe to re-run: once the column is numeric the
# mapping is skipped, instead of a second .map() turning every value into NaN.
sex_codes = {'M': 1, 'F': 0, 'I': 0.5}
if not pd.api.types.is_numeric_dtype(abalone_data['sex']):
    abalone_data['sex'] = abalone_data['sex'].map(sex_codes)
abalone_data.tail()

Unnamed: 0,sex,length,diameter,height,weight,iweight,vweight,sweight,rings
4172,0.0,0.565,0.45,0.165,0.887,0.37,0.239,0.249,11
4173,1.0,0.59,0.44,0.135,0.966,0.439,0.2145,0.2605,10
4174,1.0,0.6,0.475,0.205,1.176,0.5255,0.2875,0.308,9
4175,0.0,0.625,0.485,0.15,1.0945,0.531,0.261,0.296,10
4176,1.0,0.71,0.555,0.195,1.9485,0.9455,0.3765,0.495,12


In [4]:
# Minimum and maximum of the target column (rings)
rings = abalone_data['rings']
print("Min number of rings:", rings.min())
print("Max number of rings:", rings.max())

Min number of rings: 1
Max number of rings: 29


### Scaling the data to lie between 0 and 1, per feature

In [5]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-column min/max and rescale every column (features and the
# rings target alike) in a single step.
scaler = MinMaxScaler()
datan = scaler.fit_transform(abalone_data)

# 'rings' is the last column; keep its original range so that predictions
# can be converted back from the scaled target to ring counts later on.
ymin, ymax = scaler.data_min_[-1], scaler.data_max_[-1]

print("Min number of rings:", ymin)
print("Max number of rings:", ymax)

Min number of rings: 1.0
Max number of rings: 29.0


### Splitting the normalised data into features and targets

In [6]:
# Number of samples (rows) in the dataset.
n = len(abalone_data)

# Every column but the last is a feature; the last column (scaled rings)
# is the regression target, kept as a 1-D array.
X, y = datan[:, :-1], datan[:, -1]

### Create a regressor, using cross validation

In [7]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score

# Number of nearest neighbours the regressor uses for each prediction.
K_NEIGHBORS = 5
knnr = KNeighborsRegressor(n_neighbors=K_NEIGHBORS)

### Perform the regression

In [8]:
# 10-fold cross-validation of the KNN regressor.
# The 'neg_mean_squared_error' scorer returns the MSE negated, and because y
# was min-max scaled the value is in squared *normalised* units.
# NOTE: X and y were scaled with min/max taken from the full dataset before
# splitting, so the validation folds are not completely unseen by the scaler.
scores = cross_val_score(knnr, X, y, scoring = 'neg_mean_squared_error', cv=10)
print ('Mean square error is (neg): {0:2.5f}'.format(scores.mean()))

# Convert to an RMSE in ring units so it can be compared with published
# figures: undo the sign flip, take the root, and rescale by the ring range.
rmse_rings = np.sqrt(-scores.mean()) * (ymax - ymin)
print ('RMSE in rings: {0:2.2f}'.format(rmse_rings))

Mean square error is (neg): -0.00701


### Predicting a particular ring

Here we will look at a single instance of an abalone. We de-normalise its target to see how many rings it has.
We then fit the regressor on all the data and estimate the number of rings based on the five closest neighbors. 


In [9]:
# Row index of the abalone to inspect; any valid index can be used, and the
# messages below report whichever index is chosen.
position = 1000

# De-normalise the target (last column) to recover the true ring count.
a1000 = datan[position]
r1000 = a1000[-1]*(ymax-ymin) + ymin
print ("Abalon at position {} has {:1.1f} rings".format(position, r1000))

# Fit on all the data, then predict from this row's features only.
# NOTE: the queried row is itself part of the training data, so it is one of
# its own nearest neighbours and this is an in-sample prediction.
knnr.fit(X, y)
pr = knnr.predict([a1000[0:-1]])

# Map the scaled prediction back to ring units.
pr1000 = pr[0]*(ymax-ymin) + ymin
print ("Abalon at position {} has {:1.1f} predicted rings".format(position, pr1000))

Abalon at position 1000 has 11.0 rings
Abalon at position 1000 has 9.6 predicted rings


## Ideas to try

- Remove the infants from the dataset and repeat only for male and female abalone