In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pickle

In [7]:
# Read the semicolon-separated feets feature table into a DataFrame.
file = pd.read_csv('../feets_full_table.csv', delimiter=';')
# Preview the first five rows as a quick sanity check of the columns.
file.head(5)

Unnamed: 0,id,sector,Amplitude,Eta_e,LinearTrend,Mean,Meanvariance,MedianAbsDev,PercentAmplitude,PeriodLS,Period_fit,Psi_CS,Psi_eta,Std,Class
0,1152358,22,0.038367,2.781633,8e-06,0.999896,0.01949,0.013162,0.067496,9.1e-05,1.0,0.013116,2.01383,0.019488,Noisy
1,4289780,6,0.003984,0.027998,-3e-06,1.000369,0.002672,0.002411,0.006699,0.853793,0.0,0.230237,0.050262,0.002673,Modul
2,4289780,32,0.004005,0.026885,3e-06,1.000378,0.002702,0.002441,0.006885,0.854841,0.0,0.233032,0.037314,0.002703,Modul
3,4373827,20,0.065865,2.627467,-1.7e-05,0.999851,0.033532,0.02258,0.117115,4.3e-05,1.0,0.019476,1.983205,0.033527,Noisy
4,4373827,44,0.070649,2.481889,3.7e-05,1.000068,0.03581,0.024006,0.123233,0.000106,1.0,0.015542,2.024841,0.035813,Noisy


In [None]:
# Count the rows whose Class label is missing.
missing_class_count = file["Class"].isnull().sum()
print(missing_class_count)

0


In [None]:
# List the rows whose Class label is missing (empty when there are none).
class_is_missing = file['Class'].isna()
nan_rows = file.loc[class_is_missing]
nan_rows

Unnamed: 0,id,sector,Amplitude,Eta_e,LinearTrend,Mean,Meanvariance,MedianAbsDev,PercentAmplitude,PeriodLS,Period_fit,Psi_CS,Psi_eta,Std,Class


In [10]:
# Non-null value count of every column, broken down by Class label.
# A count lower than its neighbours means that column has missing values.
group = file.groupby(by='Class')
group.count()

Unnamed: 0_level_0,id,sector,Amplitude,Eta_e,LinearTrend,Mean,Meanvariance,MedianAbsDev,PercentAmplitude,PeriodLS,Period_fit,Psi_CS,Psi_eta,Std
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Modul,1483,1483,1483,1483,1483,1483,1483,1483,1483,1483,1479,1483,1483,1483
Noisy,1719,1719,1719,1719,1719,1719,1719,1719,1719,1719,1715,1719,1719,1719


## Predictors and Class

In [11]:
# Select predictors and target by column name rather than by position, so a
# reordered or extra column in the CSV cannot silently shift the wrong data
# into the model inputs (a missing column raises KeyError instead).
FEATURE_COLUMNS = [
    'Amplitude', 'Eta_e', 'LinearTrend', 'Mean', 'Meanvariance', 'MedianAbsDev',
    'PercentAmplitude', 'PeriodLS', 'Period_fit', 'Psi_CS', 'Psi_eta', 'Std',
]
TARGET_COLUMN = 'Class'

# 2-D array holding the 12 feature columns ('id' and 'sector' are left out).
x_feets = file[FEATURE_COLUMNS].values
# 1-D array holding the class label of every row.
y_feets = file[TARGET_COLUMN].values

In [None]:
# Distinct class labels, compared as strings.
labels_as_text = y_feets.astype(str)
unique_values = np.unique(labels_as_text)
print(unique_values)

['Modul' 'Noisy']


In [13]:
# Shape and container type of the predictor matrix.
print(f'{x_feets.shape} {type(x_feets)}')

(3202, 12) <class 'numpy.ndarray'>


In [14]:
# Shape and container type of the target array.
print(f'{y_feets.shape} {type(y_feets)}')

(3202,) <class 'numpy.ndarray'>


In [15]:
# Display the class label array (still the raw string labels at this point).
y_feets

array(['Noisy', 'Modul', 'Modul', ..., 'Modul', 'Modul', 'Modul'],
      shape=(3202,), dtype=object)

## Missing-value handling and attribute scaling

In [16]:
# Count the infinite entries in the feature matrix.
inf_count = np.isinf(x_feets).sum()
print(f'There is {inf_count} infinity values')

There is 5 infinity values


In [17]:
# Turn every infinite entry into NaN, then confirm that none are left.
inf_mask = np.isinf(x_feets)
x_feets = np.where(inf_mask, np.nan, x_feets)

remaining_inf = np.isinf(x_feets).sum()
print(f'There is {remaining_inf} infinity values')

There is 0 infinity values


In [18]:
# Fill every NaN with the median of its column. This covers the infinities
# that were converted to NaN as well as values already missing in the data.
# NOTE(review): the imputer is fitted on the full matrix before the train/test
# split, so the medians also see the test rows — consider fitting it on the
# training split only.
imputer = SimpleImputer(strategy='median')
imputer.fit(x_feets)
x_feets = imputer.transform(x_feets)

In [19]:
# Standardise every feature column with StandardScaler.
# NOTE(review): the scaler is fitted on the full matrix before the train/test
# split, so its statistics also see the test rows — consider fitting it on the
# training split only.
scaler_feets = StandardScaler()
scaler_feets.fit(x_feets)
x_feets = scaler_feets.transform(x_feets)

In [20]:
# Largest and smallest value found anywhere in the scaled matrix.
overall_max = x_feets.max()
overall_min = x_feets.min()
print(f'Valores máximos: {overall_max}')
print(f'Valores mínimos: {overall_min}')

Valores máximos: 43.94946935951369
Valores mínimos: -27.95905210699225


## LabelEncoder

In [21]:
# Encode the string class labels as integers (y_feets is overwritten with the
# encoded array) and display the result.
# NOTE(review): re-running this cell refits the encoder on the already-encoded
# integers, so encoder.classes_ would no longer hold the original label names.
encoder = LabelEncoder()
y_feets = encoder.fit_transform(y_feets)
y_feets

array([1, 0, 0, ..., 0, 0, 0], shape=(3202,))

In [22]:
# Original labels learned by the encoder; position i is the label encoded as i.
print(encoder.classes_)
# Distinct integer codes present in the encoded target.
print(np.unique(y_feets))

['Modul' 'Noisy']
[0 1]


## Training and test sets

In [23]:
# Hold out 25% of the rows as the test set; the fixed random_state keeps the
# split reproducible across runs.
(
    x_feets_training,
    x_feets_test,
    y_feets_training,
    y_feets_test,
) = train_test_split(
    x_feets,
    y_feets,
    test_size=0.25,
    random_state=0,
)

In [24]:
# Shapes of the training predictors and training labels (row counts must match).
x_feets_training.shape, y_feets_training.shape

((2401, 12), (2401,))

In [25]:
# Shapes of the test predictors and test labels (row counts must match).
x_feets_test.shape, y_feets_test.shape

((801, 12), (801,))

## Saving the data

In [27]:
# Persist the four split arrays, in this order, as a single pickled list.
# Only the arrays are written; the fitted imputer, scaler_feets and encoder
# are not saved by this cell.
feets_splits = [x_feets_training, x_feets_test, y_feets_training, y_feets_test]
with open('../feets_data.pkl', 'wb') as f:
    pickle.dump(feets_splits, f)