# CS316 Lab 2: Preprocessing and cleaning the abalone dataset

### Author:

- Name: Huu Khang Nguyen
- Student Number: 7402909
- Python version: 3.9.15

### Import relevant libraries

In [1]:
import pandas as pd

### Load and initial exploration of the abalone dataset


In [2]:
# Column names for the abalone data, listed in the order the fields appear in the file.
columns_name = [
    "Sex", "Length", "Diameter", "Height",
    "Whole weight", "Shucked weight", "Viscera weight", "Shell weight",
    "Rings",
]

# The names are passed explicitly through `names=`, so pandas reads the file's
# first line as data rather than as a header row.
abalone_dataset = pd.read_csv('./data/abalone.data', names=columns_name)

In [3]:
# Preview the first five rows to check that the columns were parsed as expected.
abalone_dataset.head(n=5)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [4]:
# (number of rows, number of columns) of the loaded frame.
abalone_dataset.shape

(4177, 9)

### (1) Z-score normalization for `Length`

In [5]:
# Summary statistics of the raw `Length` column; `mean` and `std` are reused by
# the normalisation cell that follows.
length_col = abalone_dataset['Length']
mean, std, var = length_col.mean(), length_col.std(), length_col.var()

for template, value in (
    ("Mean: {}", mean),
    ("Standard deviation: {}", std),
    ("Variance: {}", var),
):
    print(template.format(value))

Mean: 0.5239920995930094
Standard deviation: 0.12009291256479956
Variance: 0.014422307648296592


In [6]:
# Z-score normalisation: centre `Length` on its mean, then scale by its
# standard deviation (`mean` and `std` come from the statistics cell above).
abalone_dataset['Normalized Length'] = abalone_dataset['Length'].sub(mean).div(std)


In [7]:
# Peek at the first five normalised values.
abalone_dataset['Normalized Length'].head(n=5)

0   -0.574489
1   -1.448812
2    0.050027
3   -0.699393
4   -1.615350
Name: Normalized Length, dtype: float64

In [8]:
# Sanity check on the normalised column: report its mean, standard deviation
# and variance.
normalized_length = abalone_dataset['Normalized Length']

for template, value in (
    ("Normalized Length Mean: {}", normalized_length.mean()),
    ("Normalized Length Standard deviation: {}", normalized_length.std()),
    ("Normalized Length Variance: {}", normalized_length.var()),
):
    print(template.format(value))

Normalized Length Mean: -5.919771894769329e-16
Normalized Length Standard deviation: 1.0
Normalized Length Variance: 1.0


### (2) Create five bins for the attribute `Diameter`

Using `qcut()` so that each bin holds approximately the same number of samples; the number of bins (the `q` parameter) is set to 5.

In [9]:
# Labels for the quantile bins, ordered from the smallest to the largest diameters.
binned_labels = ['xs', 's', 'md', 'lg', 'xl']

# Quantile-based binning of `Diameter`. The label list is reused for `labels=`
# and the bin count is derived from it, so the two can never drift apart.
# `retbins=True` additionally returns the bin edges.
binned_diameter, bins = pd.qcut(
    abalone_dataset['Diameter'],
    q=len(binned_labels),
    labels=binned_labels,
    retbins=True,
)

# Number of samples that fell into each bin.
binned_diameter.value_counts()

md    902
xs    863
s     820
lg    803
xl    789
Name: Diameter, dtype: int64

In [10]:
# Store each row's bin label as a new column on the dataset.
abalone_dataset['Diameter Binned'] = binned_diameter

In [11]:
# Pair every label with its lower and upper edge, i.e. consecutive entries of `bins`.
for label, lower, upper in zip(binned_labels, bins[:-1], bins[1:]):
    print(f"{label} ({lower}, {upper}]")

xs (0.055, 0.325]
s (0.325, 0.395]
md (0.395, 0.45]
lg (0.45, 0.495]
xl (0.495, 0.65]


Filter data for each bin

In [12]:
# For each bin, select the rows carrying that label and show their first few diameters.
for label in binned_labels:
    in_bin = abalone_dataset['Diameter Binned'] == label
    df = abalone_dataset.loc[in_bin]
    print(f"Bin {label} diameter")
    print(df['Diameter'].head())


Bin xs diameter
1     0.265
4     0.255
5     0.300
16    0.280
18    0.295
Name: Diameter, dtype: float64
Bin s diameter
0     0.365
3     0.365
8     0.370
10    0.380
11    0.350
Name: Diameter, dtype: float64
Bin md diameter
2     0.420
6     0.415
7     0.425
9     0.440
13    0.405
Name: Diameter, dtype: float64
Bin lg diameter
24    0.480
28    0.475
30    0.470
36    0.475
67    0.495
Name: Diameter, dtype: float64
Bin xl diameter
31    0.560
32    0.525
33    0.550
34    0.550
81    0.510
Name: Diameter, dtype: float64


In [13]:
# Display the binned column (an ordered categorical with one label per row).
abalone_dataset['Diameter Binned']

0        s
1       xs
2       md
3        s
4       xs
        ..
4172    md
4173    md
4174    lg
4175    lg
4176    xl
Name: Diameter Binned, Length: 4177, dtype: category
Categories (5, object): ['xs' < 's' < 'md' < 'lg' < 'xl']

### (3) One-hot-encoding the `Sex` attribute

In [14]:
# One-hot encode `Sex`: one indicator column per category, prefixed with "Sex_".
# The dtype is pinned so the indicators are 0/1 integers on every pandas version;
# pandas 2.0 changed the default dtype of `get_dummies` from uint8 to bool.
encoded_sex = pd.get_dummies(abalone_dataset['Sex'], prefix="Sex", dtype="uint8")

In [15]:
# Attach the indicator columns to the dataset (joined on the row index).
# Copies left over from an earlier run of this cell are dropped first, so
# re-running the cell does not fail on overlapping column names.
abalone_dataset = (
    abalone_dataset
    .drop(columns=encoded_sex.columns, errors="ignore")
    .join(encoded_sex)
)

In [16]:
# Preview the first five rows, now including the one-hot encoded `Sex` columns.
abalone_dataset.head(n=5)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Normalized Length,Diameter Binned,Sex_F,Sex_I,Sex_M
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,-0.574489,s,0,0,1
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,-1.448812,xs,0,0,1
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,0.050027,md,1,0,0
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,-0.699393,s,0,0,1
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,-1.61535,xs,0,1,0


Show the unique one-hot encodings of the `Sex` attribute by dropping duplicate rows from the encoded columns

In [17]:
# Keep only the first occurrence of each distinct encoding.
is_repeat = encoded_sex.duplicated()
unique_encoded_sex = encoded_sex.loc[~is_repeat]

In [18]:
# Display the distinct one-hot encodings.
unique_encoded_sex

Unnamed: 0,Sex_F,Sex_I,Sex_M
0,0,0,1
2,1,0,0
4,0,1,0


### (4) Find and rank correlations between `Rings` and the other continuous attributes

In [19]:
# Continuous attributes whose correlation with `Rings` is measured.
continous_cols = [
    "Length", "Diameter", "Height",
    "Whole weight", "Shucked weight", "Viscera weight", "Shell weight",
]

# Description -> correlation of that attribute with `Rings` (via Series.corr).
# NOTE(review): the name `map` shadows the Python builtin; it is kept here
# because the sorting cell below reads it.
rings_series = abalone_dataset['Rings']
map = {
    'Correlation between Rings and {}'.format(col): rings_series.corr(abalone_dataset[col])
    for col in continous_cols
}


In [20]:
# Order the (description, correlation) pairs from the highest to the lowest value.
# NOTE(review): despite the `asc_` prefix the order is descending (reverse=True);
# the name is kept because the printing cell below reads it.
asc_corr = list(map.items())
asc_corr.sort(key=lambda pair: pair[1], reverse=True)

In [21]:
# Print the ranking, numbering the entries from 1.
for rank, (description, value) in enumerate(asc_corr, start=1):
    print(f"Rank {rank}.", description, value)

Rank 1. Correlation between Rings and Shell weight 0.6275740445103217
Rank 2. Correlation between Rings and Diameter 0.5746598513059187
Rank 3. Correlation between Rings and Height 0.5574673244580373
Rank 4. Correlation between Rings and Length 0.5567195769296177
Rank 5. Correlation between Rings and Whole weight 0.5403896769239008
Rank 6. Correlation between Rings and Viscera weight 0.5038192487597712
Rank 7. Correlation between Rings and Shucked weight 0.42088365794521454


### (5) Define one new attribute in the dataframe

I define a new attribute `Age` here, since the age can be calculated as the number of rings + 1.5

In [22]:
# Derive `Age` from the ring count by adding a constant 1.5.
abalone_dataset['Age'] = abalone_dataset['Rings'].add(1.5)

In [23]:
# Preview the first five rows, now including the derived `Age` column.
abalone_dataset.head(n=5)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Normalized Length,Diameter Binned,Sex_F,Sex_I,Sex_M,Age
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,-0.574489,s,0,0,1,16.5
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,-1.448812,xs,0,0,1,8.5
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,0.050027,md,1,0,0,10.5
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,-0.699393,s,0,0,1,11.5
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,-1.61535,xs,0,1,0,8.5


Perfect correlation: `Age` is just `Rings` shifted by a constant, so `Age` goes up whenever `Rings` goes up and vice versa

In [24]:
# Correlation between the ring count and the derived `Age` column.
rings_col = abalone_dataset['Rings']
age_col = abalone_dataset['Age']
rings_col.corr(age_col)

1.0