# Data Preprocessing

In [21]:
%matplotlib inline

from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import seaborn as sns

In [22]:
# Column labels for the HTRU2 data set: four statistics of the integrated
# profile, four of the DM-SNR curve, and the target label in the last position.
columnNames = [
    'Profile_mean', 'Profile_stdev', 'Profile_skewness', 'Profile_kurtosis',
    'DM_mean', 'DM_stdev', 'DM_skewness', 'DM_kurtosis',
    'class',
]

# Labels are supplied through `names`, so they come from the list above
# rather than from the file itself.
df = pd.read_csv("./data/HTRU2/HTRU_2.csv", names=columnNames)

df

Unnamed: 0,Profile_mean,Profile_stdev,Profile_skewness,Profile_kurtosis,DM_mean,DM_stdev,DM_skewness,DM_kurtosis,class
0,140.562500,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.882430,0.465318,-0.515088,1.677258,14.860146,10.576487,127.393580,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.750000,57.178449,-0.068415,-0.636238,3.642977,20.959280,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.178930,11.468720,14.269573,252.567306,0
...,...,...,...,...,...,...,...,...,...
17893,136.429688,59.847421,-0.187846,-0.738123,1.296823,12.166062,15.450260,285.931022,0
17894,122.554688,49.485605,0.127978,0.323061,16.409699,44.626893,2.945244,8.297092,0
17895,119.335938,59.935939,0.159363,-0.743025,21.430602,58.872000,2.499517,4.595173,0
17896,114.507812,53.902400,0.201161,-0.024789,1.946488,13.381731,10.007967,134.238910,0


### Missing Values

In [23]:
# True if at least one cell in the frame is NaN/missing; False means the data is complete.
np.any(df.isnull().values)


False

In [24]:
# Same missing-value check through `isna` (pandas documents it as an alias of `isnull`).
np.any(df.isna().values)

False

### Outliers

## Standardization

In [25]:
# Standardize every feature column; the target column 'class' is left out of
# the scaling and re-attached unchanged afterwards.
featureDf = df.drop(columns=['class'])

scaler = StandardScaler()
scaledData = scaler.fit_transform(featureDf)

# fit_transform returns a bare array, so wrap it back into a labelled frame.
# 'class' is the last column of df, hence df.columns[:-1] are the feature names.
stdDf = pd.DataFrame(data=scaledData, columns=df.columns[:-1])

# Standardized features side by side with the original class labels.
stdDfWithClass = pd.concat([stdDf, df[['class']]], axis='columns')

In [26]:
# Persist the standardized data set (features + class) without the row index.
stdDfWithClass.to_csv(path_or_buf="./data/stdHTRU_2.csv", index=False)

## Feature Selection

Correlation Matrix of Data:

In [31]:
# Pairwise Pearson correlation between the standardized features
# (the class label is not part of stdDf).
corrStd = stdDf.corr(method='pearson')

# Render the matrix as a colour-coded table.
corrStd.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Profile_mean,Profile_stdev,Profile_skewness,Profile_kurtosis,DM_mean,DM_stdev,DM_skewness,DM_kurtosis
Profile_mean,1.0,0.547137,-0.873898,-0.738775,-0.298841,-0.307016,0.234331,0.144033
Profile_stdev,0.547137,1.0,-0.521435,-0.539793,0.00686873,-0.0476316,0.0294294,0.0276915
Profile_skewness,-0.873898,-0.521435,1.0,0.945729,0.414368,0.43288,-0.341209,-0.214491
Profile_kurtosis,-0.738775,-0.539793,0.945729,1.0,0.412056,0.41514,-0.328843,-0.204782
DM_mean,-0.298841,0.00686873,0.414368,0.412056,1.0,0.796555,-0.615971,-0.354269
DM_stdev,-0.307016,-0.0476316,0.43288,0.41514,0.796555,1.0,-0.809786,-0.5758
DM_skewness,0.234331,0.0294294,-0.341209,-0.328843,-0.615971,-0.809786,1.0,0.923743
DM_kurtosis,0.144033,0.0276915,-0.214491,-0.204782,-0.354269,-0.5758,0.923743,1.0


To improve the performance of ML models that are sensitive to correlated or redundant features, we remove highly correlated features: for each pair of features whose correlation coefficient is at least 0.9, the feature that appears later in the column order is dropped.

In [32]:
# Greedy correlation filter.
#
# For every pair of features (i, j) with i < j, if the strength of their
# correlation reaches the threshold, the later feature j is dropped and the
# earlier feature i is kept.
#
# The comparison uses the absolute value of the correlation: a strongly
# negative correlation (e.g. -0.95) is just as redundant as a strongly positive
# one, and a signed `>=` comparison would silently keep such a pair.
CORR_THRESHOLD = 0.9

absCorr = corrStd.abs().values

# Keep only the strict upper triangle (j > i) so each pair is considered once
# and the diagonal (always 1.0) never triggers a removal.
tooCorrelated = np.triu(absCorr, k=1) >= CORR_THRESHOLD

# Column j is dropped as soon as any earlier feature i is too correlated with it.
# `features` is a boolean mask over the columns of stdDf: True = keep.
features = ~tooCorrelated.any(axis=0)

selectedFeatures = stdDf.columns[features]

noCorrStdData = stdDf[selectedFeatures]

In [33]:
# Re-attach the original class labels to the reduced, standardized feature set.
noCorrStdDfWithClassData = pd.concat(
    [noCorrStdData, df[['class']]],
    axis='columns',
)

In [34]:
# Correlation matrix of the remaining features, this time including the class
# label, rendered as a colour-coded table.
corrNoCorrStd = noCorrStdDfWithClassData.corr(method='pearson')
corrNoCorrStd.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Profile_mean,Profile_stdev,Profile_skewness,DM_mean,DM_stdev,DM_skewness,class
Profile_mean,1.0,0.547137,-0.873898,-0.298841,-0.307016,0.234331,-0.673181
Profile_stdev,0.547137,1.0,-0.521435,0.00686873,-0.0476316,0.0294294,-0.363708
Profile_skewness,-0.873898,-0.521435,1.0,0.414368,0.43288,-0.341209,0.791591
DM_mean,-0.298841,0.00686873,0.414368,1.0,0.796555,-0.615971,0.400876
DM_stdev,-0.307016,-0.0476316,0.43288,0.796555,1.0,-0.809786,0.491535
DM_skewness,0.234331,0.0294294,-0.341209,-0.615971,-0.809786,1.0,-0.390816
class,-0.673181,-0.363708,0.791591,0.400876,0.491535,-0.390816,1.0


In [35]:
# Persist the decorrelated, standardized data set (features + class) without the row index.
noCorrStdDfWithClassData.to_csv(path_or_buf="./data/noCorrStdHTRU_2.csv", index=False)