In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [2]:
path = r'C:\Users\utilizador\Documents\a_hjoaquim\a_semestre2-5oano\PSn\abalone.csv'
df = pd.read_csv(path)

In [3]:
df

Unnamed: 0,Sex,Length(mm),Diameter(mm),Height(mm),Whole weight (g),Shucked weight (g),Viscera weight (g),Shell weight (g),Rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [4]:
### Main Goal: Predict the number of Rings based on the other fields.
### Age = Rings + 1.5
### "Rings" field will be our class, meaning, the thing we want to predict. 
### This is a continuous variable and a supervised learning problem.

In [5]:
### Number of examples by class
df['Rings'].value_counts().sort_index()

1       1
2       1
3      15
4      57
5     115
6     259
7     391
8     568
9     689
10    634
11    487
12    267
13    203
14    126
15    103
16     67
17     58
18     42
19     32
20     26
21     14
22      6
23      9
24      2
25      1
26      1
27      2
29      1
Name: Rings, dtype: int64

In [6]:
### We pretend to make every column numeric
### As we can see, the only one left is Sex
### Let's change it, assuming: M ==> 1 and F ==> 0

In [7]:
df['Sex'] = df['Sex'].apply(lambda x: 1 if x == 'M' else 0)
df

Unnamed: 0,Sex,Length(mm),Diameter(mm),Height(mm),Whole weight (g),Shucked weight (g),Viscera weight (g),Shell weight (g),Rings
0,1,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,1,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,0,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,1,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,0,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,0,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,1,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,1,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,0,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [8]:
### Let's merge classes 1,2,3,4 and 5
### And the same for 20 to 29
### This way we can make a more even distribution

In [9]:
df['Rings'] = df['Rings'].apply(lambda x: 5 if x <= 5 else x)
df['Rings'] = df['Rings'].apply(lambda x: 20 if x >= 20 else x)
df['Rings'].value_counts().sort_index()

5     189
6     259
7     391
8     568
9     689
10    634
11    487
12    267
13    203
14    126
15    103
16     67
17     58
18     42
19     32
20     62
Name: Rings, dtype: int64

In [10]:
### Let's devide our dataset in predictors and class
predictors = df.drop(['Rings'], axis=1)
ringsClass = df['Rings']

In [11]:
### Let's split between training-set and test-set
X_train,X_test,y_train,y_test = train_test_split(predictors,ringsClass,random_state=0)

In [12]:
### Now, let's make our Knn Classifier
knn=KNeighborsClassifier(n_neighbors=5) # KNN com K=5
knn.fit(X_train,y_train)                # train the classifier
knn.score(X_test,y_test)                # test the result

0.23732057416267943

In [13]:
### As we can see, we got a pretty low score, since we're trying to predict in between 16 classes.
### Let's do as the example's literature suggest and try the following distribution:
### Class 1: 1...8
### Class 2: 10...10
### Class 3: 11...29
df['Rings'] = df['Rings'].apply(lambda x: 1 if x <= 8 else x)
df['Rings'] = df['Rings'].apply(lambda x: 2 if x >= 9 and x <= 10 else x)
df['Rings'] = df['Rings'].apply(lambda x: 3 if x >= 11 else x)
df['Rings'].value_counts().sort_index()

1    1407
2    1323
3    1447
Name: Rings, dtype: int64

In [14]:
### Since we're not talking about 'Rings' anymore, let's change our Class name
df.rename(columns = {'Rings':'Class'}, inplace = True)
df

Unnamed: 0,Sex,Length(mm),Diameter(mm),Height(mm),Whole weight (g),Shucked weight (g),Viscera weight (g),Shell weight (g),Class
0,1,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,3
1,1,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,1
2,0,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,2
3,1,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,2
4,0,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,1
...,...,...,...,...,...,...,...,...,...
4172,0,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,3
4173,1,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,2
4174,1,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,2
4175,0,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,2


In [15]:
### Let's try our Knn classification again
predictors = df.drop(['Class'], axis=1)
Class = df['Class']
X_train,X_test,y_train,y_test = train_test_split(predictors,Class,random_state=0)

In [16]:
knn=KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train,y_train)
knn.score(X_test,y_test)

0.6267942583732058