In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Figure text styling: use a serif font family so plots sit well next to
# LaTeX-typeset text. External LaTeX rendering is kept switched off.
plt.rcParams["text.usetex"] = False
plt.rcParams["font.family"] = "serif"

In [34]:
# Column labels applied to the seeds file on load (the read below passes
# header=None, so no row of the file is used as a header).
col_names = [
    'area',
    'perimeter',
    'compactness',
    'kernel_length',
    'kernel_width',
    'asymmetry_coef',
    'kernel_groove_length',
    'numerical_class',
]

# Tab-separated input; every row is treated as data and labelled with col_names.
df = pd.read_csv(
    'seeds.tsv',
    names=col_names,
    header=None,
    sep='\t',
)
df

Unnamed: 0,area,perimeter,compactness,kernel_length,kernel_width,asymmetry_coef,kernel_groove_length,numerical_class
0,15.26,14.84,0.8710,5.763,3.312,2.221,5.220,1
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956,1
2,14.29,14.09,0.9050,5.291,3.337,2.699,4.825,1
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805,1
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175,1
...,...,...,...,...,...,...,...,...
205,12.19,13.20,0.8783,5.137,2.981,3.631,4.870,3
206,11.23,12.88,0.8511,5.140,2.795,4.325,5.003,3
207,13.20,13.66,0.8883,5.236,3.232,8.315,5.056,3
208,11.84,13.21,0.8521,5.175,2.836,3.598,5.044,3


In [36]:
# Min-max normalization: fit a MinMaxScaler on the full frame and wrap the
# scaled array back into a DataFrame carrying the original column labels.
# NOTE(review): every column of df is passed to the scaler, so the
# 'numerical_class' label is rescaled together with the features -- confirm
# this is intended before using norm_df for modelling.
scaler_minmax = MinMaxScaler()
norm_df = pd.DataFrame(scaler_minmax.fit_transform(df), columns=col_names)
norm_df

Unnamed: 0,area,perimeter,compactness,kernel_length,kernel_width,asymmetry_coef,kernel_groove_length,numerical_class
0,0.440982,0.502066,0.570780,0.486486,0.486101,0.189302,0.345150,0.0
1,0.405099,0.446281,0.662432,0.368806,0.501069,0.032883,0.215165,0.0
2,0.349386,0.347107,0.879310,0.220721,0.503920,0.251453,0.150665,0.0
3,0.306893,0.316116,0.793103,0.239302,0.533856,0.194243,0.140817,0.0
4,0.524079,0.533058,0.864791,0.427365,0.664291,0.076701,0.322994,0.0
...,...,...,...,...,...,...,...,...
205,0.151086,0.163223,0.637024,0.134009,0.250178,0.372635,0.172821,1.0
206,0.060434,0.097107,0.390200,0.135698,0.117605,0.462872,0.238306,1.0
207,0.246459,0.258264,0.727768,0.189752,0.429081,0.981667,0.264402,1.0
208,0.118036,0.165289,0.399274,0.155405,0.146828,0.368344,0.258493,1.0


In [37]:
# Standardization: fit a StandardScaler on the full frame and wrap the
# scaled array back into a DataFrame carrying the original column labels.
# NOTE(review): every column of df is passed to the scaler, so the
# 'numerical_class' label is standardized together with the features --
# confirm this is intended before using standard_df for modelling.
scaler_standard = StandardScaler()
standard_df = pd.DataFrame(scaler_standard.fit_transform(df), columns=col_names)
standard_df

Unnamed: 0,area,perimeter,compactness,kernel_length,kernel_width,asymmetry_coef,kernel_groove_length,numerical_class
0,0.142098,0.215462,0.000061,0.304218,0.141702,-0.986152,-0.383577,-1.224745
1,0.011188,0.008224,0.428515,-0.168625,0.197432,-1.788166,-0.922013,-1.224745
2,-0.192067,-0.360201,1.442383,-0.763637,0.208048,-0.667479,-1.189192,-1.224745
3,-0.347091,-0.475333,1.039381,-0.688978,0.319508,-0.960818,-1.229983,-1.224745
4,0.445257,0.330595,1.374509,0.066666,0.805159,-1.563495,-0.475356,-1.224745
...,...,...,...,...,...,...,...,...
205,-0.915515,-1.043321,0.309736,-1.112048,-0.736716,-0.046135,-1.097413,1.224745
206,-1.246235,-1.288937,-0.844122,-1.105261,-1.230328,0.416540,-0.826156,1.224745
207,-0.567571,-0.690247,0.733948,-0.888070,-0.070604,3.076588,-0.718060,1.224745
208,-1.036090,-1.035645,-0.801701,-1.026077,-1.121521,-0.068135,-0.742535,1.224745
