In [34]:
from ucimlrepo import fetch_ucirepo

# Fetch the dataset registered under id=1 in the UCI ML repository.
# This is a network call and needs the `ucimlrepo` package installed.
abalone = fetch_ucirepo(id=1)

# Features and targets (as pandas DataFrames).
# X is an independent copy of the features: later cells add encoded columns
# to X, and without the copy those assignments would write into the object
# held by `abalone.data` and can trigger pandas' SettingWithCopyWarning.
X = abalone.data.features.copy()
y = abalone.data.targets

# Preview the first rows of the feature table.
X.head()


Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055


In [35]:
#LabelEncoder
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
X['Sex_label'] = label_encoder.fit_transform(X['Sex'])
X.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Sex_label
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,2
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,2
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,0
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,2
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,1


In [36]:
# Ordinal encoding of 'Sex' using an explicitly chosen category order.
from sklearn.preprocessing import OrdinalEncoder

# One inner list per encoded column; the position of a category in the list
# becomes its code, so here I -> 0.0, M -> 1.0, F -> 2.0 (codes are floats).
sex_category_order = [['I', 'M', 'F']]
ordinal_encoder = OrdinalEncoder(categories=sex_category_order)

# A one-column DataFrame (double brackets) is passed so the input is 2-D.
sex_ordinal_codes = ordinal_encoder.fit_transform(X[['Sex']])
X['Sex_ordinal'] = sex_ordinal_codes

# Show the frame, which now carries the 'Sex_ordinal' column.
X


Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Sex_label,Sex_ordinal
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,2,1.0
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,2,1.0
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,0,2.0
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,2,1.0
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,1,0.0
...,...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,0,2.0
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,2,1.0
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,2,1.0
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,0,2.0


In [41]:
# One-hot encoding and frequency encoding of the 'Sex' column.
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Select the column by name rather than by position, so this cell does not
# depend on 'Sex' being the first column of X.
sex_column = X['Sex']

# One-hot encoding: one indicator column per category. The encoder is fed a
# plain (n, 1) array, and .toarray() turns its sparse result into a dense
# ndarray that can be wrapped in a DataFrame.
onehot_encoder = OneHotEncoder()
sex_onehot = onehot_encoder.fit_transform(sex_column.values.reshape(-1, 1)).toarray()

# Integer label codes for the same column. They are not used in X_encoded
# below; the names are kept in case other cells rely on them.
# LabelEncoder itself is imported in the earlier label-encoding cell.
label_encoder = LabelEncoder()
sex_labels = label_encoder.fit_transform(sex_column)

# Frequency encoding: replace each category with its relative frequency
# (share of rows) in X.
sex_freq = sex_column.map(sex_column.value_counts(normalize=True))
sex_freq_df = pd.DataFrame({'sex_freq': sex_freq})

# Build the one-hot frame on X's own index. pd.concat(axis=1) aligns rows by
# index label, so a default 0..n-1 index would misalign rows (and introduce
# NaNs) whenever X is not indexed 0..n-1.
sex_onehot_df = pd.DataFrame(
    sex_onehot,
    columns=onehot_encoder.get_feature_names_out(['sex']),
    index=X.index,
)

# Original columns + one-hot indicators + frequency column, side by side.
X_encoded = pd.concat([X, sex_onehot_df, sex_freq_df], axis=1)

X_encoded.head(100)


0    0.365813
1    0.365813
2    0.312904
3    0.365813
4    0.321283
Name: Sex, dtype: float64
   sex_freq
0  0.365813
1  0.365813
2  0.312904
3  0.365813
4  0.321283


Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Sex_label,Sex_ordinal,sex_F,sex_I,sex_M,sex_freq
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.150,2,1.0,0.0,0.0,1.0,0.365813
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.070,2,1.0,0.0,0.0,1.0,0.365813
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.210,0,2.0,1.0,0.0,0.0,0.312904
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.155,2,1.0,0.0,0.0,1.0,0.365813
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.055,1,0.0,0.0,1.0,0.0,0.321283
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,M,0.665,0.535,0.195,1.6060,0.5755,0.3880,0.480,2,1.0,0.0,0.0,1.0,0.365813
96,M,0.535,0.435,0.150,0.7250,0.2690,0.1385,0.250,2,1.0,0.0,0.0,1.0,0.365813
97,M,0.470,0.375,0.130,0.5230,0.2140,0.1320,0.145,2,1.0,0.0,0.0,1.0,0.365813
98,M,0.470,0.370,0.130,0.5225,0.2010,0.1330,0.165,2,1.0,0.0,0.0,1.0,0.365813
