In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from common import get_data
from scipy.stats import zscore
from collections import defaultdict

In [20]:
# Load the pre-selected training and test splits through the shared project loader.
training_data, test_data = (
    get_data(csv_path)
    for csv_path in ('data/train_selected.csv', 'data/test_selected_ttf.csv')
)

In [5]:
# Preview the first rows of the test split as loaded, before any column renaming.
test_data.head()

Unnamed: 0,id,cycle,s1,s2,s3,s4,ttf,label_bnc
0,1,31,1398.91,554.42,47.23,521.79,112,0
1,2,49,1410.83,553.52,47.67,521.74,98,0
2,3,126,1418.89,552.59,47.88,520.83,69,0
3,4,106,1406.88,552.64,47.65,521.88,82,0
4,5,98,1419.36,553.29,47.46,521.0,91,0


In [7]:
# Size of the training split as a (rows, columns) tuple.
training_data.shape

(20631, 8)

In [8]:
# Size of the test split as a (rows, columns) tuple.
test_data.shape

(100, 8)

# Prepare the base training data for modelling

In [21]:
# Canonical upper-case column names shared by both splits.
# The rename is positional: the list order must match the frame's existing column order.
# Later cells select columns by these exact labels (e.g. 'CYCLE'), so keep the casing in sync.
columns = ['ID', 'CYCLE', 'S1', 'S2', 'S3', 'S4', 'TTF', 'TTF_LABEL']
training_data.columns = columns

In [10]:
# Persist the renamed training frame as the base training set (index column omitted).
training_data.to_csv('data/train_base.csv', index=False)

# Prepare the base testing data for modelling

In [7]:
# Check the test split's current column names before they are overwritten below.
test_data.head()

Unnamed: 0,id,cycle,s1,s2,s3,s4,ttf,label_bnc
0,1,31,1398.91,554.42,47.23,521.79,112,0
1,2,49,1410.83,553.52,47.67,521.74,98,0
2,3,126,1418.89,552.59,47.88,520.83,69,0
3,4,106,1406.88,552.64,47.65,521.88,82,0
4,5,98,1419.36,553.29,47.46,521.0,91,0


In [12]:
# Count how many test rows fall into each value of the label_bnc column.
test_data['label_bnc'].value_counts()

0    75
1    25
Name: label_bnc, dtype: int64

In [22]:
# Give the test split the same column names as the training split.
# The assignment is positional, so the test frame must have the same column order as `columns`.
test_data.columns = columns

In [9]:
# Confirm the rename took effect on the test split.
test_data.head()

Unnamed: 0,ID,Cycle,S1,S2,S3,S4,TTF,TTF_LABEL
0,1,31,1398.91,554.42,47.23,521.79,112,0
1,2,49,1410.83,553.52,47.67,521.74,98,0
2,3,126,1418.89,552.59,47.88,520.83,69,0
3,4,106,1406.88,552.64,47.65,521.88,82,0
4,5,98,1419.36,553.29,47.46,521.0,91,0


In [15]:
# Persist the renamed test frame as the base test set (index column omitted).
test_data.to_csv('data/test_base.csv', index=False)

In [12]:
# Peek at both ends of the training frame: the first three and the last three rows.
pd.concat([training_data.iloc[:3], training_data.iloc[-3:]])

Unnamed: 0,ID,CYCLE,S1,S2,S3,S4,TTF,TTF_LABEL
0,1,1,1400.6,554.36,47.47,521.66,191,0
1,1,2,1403.14,553.75,47.49,522.28,190,0
2,1,3,1404.2,554.26,47.27,522.42,189,0
20628,100,198,1428.18,550.94,48.09,520.01,2,1
20629,100,199,1426.53,550.68,48.39,519.67,1,1
20630,100,200,1432.14,550.79,48.2,519.3,0,1


## Data scaling

In [25]:
# Sensor columns S1..S4: the only features that get rescaled.
scaled_columns = [f'S{sensor_number}' for sensor_number in range(1, 5)]

In [10]:
standard_scaler = StandardScaler()

standard_scaler.fit(training_data[columns])

StandardScaler()

In [11]:
X_train = standard_scaler.transform(training_data[columns])
X_test = standard_scaler.transform(test_data[columns])

In [14]:
X_train = pd.DataFrame(X_train, columns=columns)
X_train.head()

Unnamed: 0,S1,S2,S3,S4
0,-0.925936,1.121141,-0.266467,0.334262
1,-0.643726,0.43193,-0.191583,1.174899
2,-0.525953,1.008155,-1.015303,1.364721
3,-0.784831,1.222827,-1.539489,1.961302
4,-0.301518,0.714393,-0.977861,1.052871


In [21]:
X_test = pd.DataFrame(X_test, columns=columns)
X_test.head()

Unnamed: 0,S1,S2,S3,S4
0,-1.113706,1.188932,-1.165071,0.510525
1,0.210682,0.172064,0.48237,0.442731
2,1.106199,-0.878699,1.268649,-0.791108
3,-0.228188,-0.822207,0.407486,0.632553
4,1.158419,-0.087802,-0.303908,-0.56061


In [19]:
# Re-attach the unscaled identifier/target columns to the standardised sensor columns.
# The frame was renamed with the upper-case `columns` list, so the cycle column is
# "CYCLE"; selecting "Cycle" raises KeyError on a fresh run.
scaled_training_data = pd.concat([training_data[["ID", "CYCLE", "TTF", "TTF_LABEL"]], X_train], axis=1)
scaled_training_data.to_csv('data/training_standard_scaled.csv', index=False)

In [22]:
# Re-attach the unscaled identifier/target columns to the standardised test sensors.
# The frame was renamed with the upper-case `columns` list, so the cycle column is
# "CYCLE"; selecting "Cycle" raises KeyError on a fresh run.
scaled_test_data = pd.concat([test_data[["ID", "CYCLE", "TTF", "TTF_LABEL"]], X_test], axis=1)
scaled_test_data.to_csv('data/test_standard_scaled.csv', index=False)

### MinMaxScaler

In [10]:
# Fit the min-max scaler on the training sensors only; the same fitted scaler is
# applied to the test split afterwards, so the test data does not influence the fit.
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(training_data[scaled_columns])

MinMaxScaler()

In [11]:
# Apply the fitted min-max scaler to the sensor columns of both splits.
X_train, X_test = (
    min_max_scaler.transform(split[scaled_columns])
    for split in (training_data, test_data)
)

In [13]:
# Wrap the scaled training array in a frame so the sensor column names are kept.
X_train = pd.DataFrame(data=X_train, columns=scaled_columns)
X_train.head(n=5)

Unnamed: 0,S1,S2,S3,S4
0,0.309757,0.726248,0.369048,0.633262
1,0.352633,0.628019,0.380952,0.765458
2,0.370527,0.710145,0.25,0.795309
3,0.331195,0.740741,0.166667,0.889126
4,0.404625,0.668277,0.255952,0.746269


In [14]:
# Re-attach the unscaled identifier/target columns to the min-max scaled sensors.
# The frame was renamed with the upper-case `columns` list, so the cycle column is
# "CYCLE"; selecting "Cycle" raises KeyError on a fresh run.
min_max_scaled_training_data = pd.concat([training_data[["ID", "CYCLE", "TTF", "TTF_LABEL"]], X_train], axis=1)
min_max_scaled_training_data.to_csv('data/training_min_max_scaled.csv', index=False)

In [16]:
# Wrap the scaled test array in a frame so the sensor column names are kept.
X_test = pd.DataFrame(data=X_test, columns=scaled_columns)

In [17]:
# Re-attach the unscaled identifier/target columns to the min-max scaled test sensors.
# The frame was renamed with the upper-case `columns` list, so the cycle column is
# "CYCLE"; selecting "Cycle" raises KeyError on a fresh run.
min_max_scaled_test_data = pd.concat([test_data[["ID", "CYCLE", "TTF", "TTF_LABEL"]], X_test], axis=1)
min_max_scaled_test_data.to_csv('data/test_min_max_scaled.csv', index=False)

In [17]:
# First and third quartile of every column; rows are the quantile levels.
quantiles = training_data.quantile(q=[0.25, 0.75])
# Top-left cell: the 0.25 quantile of the first column.
quantiles.iat[0, 0]

26.0

In [48]:
# Z-score each sensor column of the training split.
z_scores = zscore(training_data[scaled_columns])
# Number of individual readings lying more than 3 standard deviations from their column mean.
(z_scores.abs() > 3).sum().sum()

127

In [55]:
# For each sensor, collect the row labels whose z-score magnitude exceeds k.
k = 3
to_remove = {}
for col in scaled_columns:
    beyond_threshold = z_scores[col].abs() > k
    to_remove[col] = z_scores.loc[beyond_threshold].index

In [59]:
# Union of the per-sensor outlier rows: a row is counted once even if several sensors flag it.
combined = set().union(*to_remove.values())
outlier_row_count = len(combined)
print("Removed:", outlier_row_count)
print(f"This is {outlier_row_count / training_data.shape[0] * 100}% of our dataset")

Removed: 115
This is 0.5574136008918618% of our dataset
