# CS316 Lab 2: Preprocessing and cleaning the abalone dataset

### Author:

- Name: Huu Khang Nguyen
- Student Number: 7402909

In [None]:
### Import relevant libraries

In [3]:
import pandas as pd


### Load & Initial exploration for the abalone Dataset


In [12]:
# Column labels for the abalone data file, in the order the fields appear.
columns_name = [
    "Sex",
    "Length",
    "Diameter",
    "Height",
    "Whole weight",
    "Shucked weight",
    "Viscera weight",
    "Shell weight",
    "Rings",
]

# Passing `names=` without `header=` makes pandas treat the first line of the
# file as data rather than as a header row.
abalone_dataset = pd.read_csv("./data/abalone.data", names=columns_name)


In [13]:
# Preview the first five rows to confirm the column labels line up with the data.
abalone_dataset.head(n=5)


Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [15]:
# Dimensions of the loaded frame as (number of rows, number of columns).
abalone_dataset.shape


(4177, 9)

### (1) Z-score normalization for `Length`

In [52]:
# Summary statistics of the raw `Length` column. `mean` and `std` are reused by
# the z-score normalisation cell that follows.
length = abalone_dataset['Length']

mean = length.mean()
std = length.std()
var = length.var()

print("Mean: {}".format(mean))
print("Standard deviation: {}".format(std))
print("Variance: {}".format(var))

Mean: 0.5239920995930094
Standard deviation: 0.12009291256479956
Variance: 0.014422307648296592


In [49]:
# Z-score normalisation: centre `Length` on its mean, then scale by its
# standard deviation (`mean` and `std` were computed in the previous cell).
centered_length = abalone_dataset['Length'] - mean
abalone_dataset['Normalized Length'] = centered_length / std


In [50]:
# Preview the first five normalised values.
abalone_dataset['Normalized Length'].head(n=5)

0   -0.574489
1   -1.448812
2    0.050027
3   -0.699393
4   -1.615350
Name: Normalized Length, dtype: float64

In [51]:
# Sanity check: after z-scoring, the mean should be ~0 (up to floating-point
# error) and the standard deviation / variance should be 1.
normalized_length = abalone_dataset['Normalized Length']

print("Normalized Length Mean: {}".format(normalized_length.mean()))
print("Normalized Length Standard deviation: {}".format(normalized_length.std()))
print("Normalized Length Variance: {}".format(normalized_length.var()))

Normalized Length Mean: -5.919771894769329e-16
Normalized Length Standard deviation: 1.0
Normalized Length Variance: 1.0


### (2) Create five bins for the attribute `Diameter`

Use `qcut()` so that each bin contains approximately the same number of samples; the number of bins (`q`) is set to 5.

In [66]:
# Quantile-based binning: `qcut` places the bin edges at sample quantiles, so
# the five bins hold roughly (not exactly) the same number of rows.
diameter = abalone_dataset['Diameter']
binned_diameter = pd.qcut(diameter, q=5)

# Number of rows that fell into each bin.
binned_diameter.value_counts()


(0.395, 0.45]     902
(0.054, 0.325]    863
(0.325, 0.395]    820
(0.45, 0.495]     803
(0.495, 0.65]     789
Name: Diameter, dtype: int64

In [67]:
# Store the interval-valued bins as a new column alongside the raw `Diameter` values.
abalone_dataset['Diameter Binned'] = binned_diameter

In [68]:
# Display the new column; the footer of the output lists the five ordered interval categories.
abalone_dataset['Diameter Binned']

0       (0.325, 0.395]
1       (0.054, 0.325]
2        (0.395, 0.45]
3       (0.325, 0.395]
4       (0.054, 0.325]
             ...      
4172     (0.395, 0.45]
4173     (0.395, 0.45]
4174     (0.45, 0.495]
4175     (0.45, 0.495]
4176     (0.495, 0.65]
Name: Diameter Binned, Length: 4177, dtype: category
Categories (5, interval[float64, right]): [(0.054, 0.325] < (0.325, 0.395] < (0.395, 0.45] < (0.45, 0.495] < (0.495, 0.65]]

### (3) One-hot-encoding the `Sex` attribute

In [75]:
# One indicator column per category of `Sex`, named `Sex_<category>`.
# The dtype is pinned explicitly: pandas 2.0 changed the `get_dummies` default
# from uint8 to bool, which would turn the 0/1 encoding shown below into
# True/False depending on the installed pandas version.
encoded_sex = pd.get_dummies(abalone_dataset['Sex'], prefix="Sex", dtype="uint8")

In [84]:
# Attach the one-hot columns to the dataset.
# Any `Sex_*` columns left over from an earlier run of this cell are dropped
# first (`errors="ignore"` makes that a no-op on the first run), so re-running
# the cell no longer raises "columns overlap but no suffix specified".
abalone_dataset = (
    abalone_dataset
    .drop(columns=encoded_sex.columns, errors="ignore")
    .join(encoded_sex)
)

In [85]:
# Preview the first five rows, now including the normalised, binned and one-hot columns.
abalone_dataset.head(n=5)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Normalized Length,Diameter Binned,Sex_F,Sex_I,Sex_M
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,-0.574489,"(0.325, 0.395]",0,0,1
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,-1.448812,"(0.054, 0.325]",0,0,1
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,0.050027,"(0.395, 0.45]",1,0,0
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,-0.699393,"(0.325, 0.395]",0,0,1
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,-1.61535,"(0.054, 0.325]",0,1,0


Show the unique one-hot encodings of the `Sex` attribute by dropping duplicate rows from the encoded columns.

In [116]:
# Keep only the first occurrence of each distinct one-hot row; the original
# row labels are preserved, so the index shows where each encoding first appears.
is_repeated_row = encoded_sex.duplicated()
unique_encoded_sex = encoded_sex[~is_repeated_row]

In [117]:
# Display the distinct one-hot rows computed in the previous cell.
unique_encoded_sex

Unnamed: 0,Sex_F,Sex_I,Sex_M
0,0,0,1
2,1,0,0
4,0,1,0


### (4) Find and rank the correlations between `Rings` and the other continuous attributes

In [100]:
# Continuous attributes whose correlation with `Rings` is ranked below.
continous_cols = ["Length",
                  "Diameter",
                  "Height",
                  "Whole weight",
                  "Shucked weight",
                  "Viscera weight",
                  "Shell weight"]

# Correlation of `Rings` with each continuous attribute, keyed by a printable
# label. `Series.corr` uses the Pearson coefficient by default.
# The dict is deliberately NOT called `map`, so the builtin `map` stays usable.
rings_correlations = {
    'Correlation between Rings and {}'.format(col): abalone_dataset['Rings'].corr(abalone_dataset[col])
    for col in continous_cols
}

# Rank the (label, coefficient) pairs from strongest to weakest correlation.
# This reads the dict built above; the previous `dict.items()` called the
# method on the builtin type itself and raises TypeError on a fresh kernel.
# NOTE: despite its name, `asc_corr` is in DESCENDING order (`reverse=True`);
# the name is kept because the printing cell below refers to it.
asc_corr = sorted(rings_correlations.items(), key=lambda item: item[1], reverse=True)

In [113]:
# Print every label with its coefficient, in ranked order.
for label, coefficient in asc_corr:
    print(label, coefficient)

Correlation between Rings and Shell weight 0.6275740445103217
Correlation between Rings and Diameter 0.5746598513059187
Correlation between Rings and Height 0.5574673244580373
Correlation between Rings and Length 0.5567195769296177
Correlation between Rings and Whole weight 0.5403896769239008
Correlation between Rings and Viscera weight 0.5038192487597712
Correlation between Rings and Shucked weight 0.42088365794521454


### (5) Define one new attribute in the DataFrame

abalone_dataset['']