### Importing required libraries

In [12]:
import pandas as pd
from scipy import stats
import numpy as np

### Reading data from one of the datasets and previewing it

In [13]:
# Load one subject's recording from its pickle file and preview it.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
subject_pickle_path = "./erml_data/170C278A4218594D54F130A88FF85203.pkl"
df = pd.read_pickle(subject_pickle_path)
df

Unnamed: 0,Temperature,GSR,EOG1,EOG2,EEG1,EEG2,RED_RAW,IR_RAW,Arousal,Dominance,Valence
0,15404.0,14456140.0,15336120.0,16776161.0,4031711.0,14321874.0,63113.0,83255.0,4,5,6
1,15404.0,14341397.0,15336324.0,16776164.0,4483347.0,14300379.0,63113.0,83255.0,4,5,6
2,15404.0,14192337.0,15337673.0,16776194.0,4887349.0,14255666.0,63113.0,83255.0,4,5,6
3,15404.0,14338432.0,15337555.0,16776200.0,4306698.0,14285075.0,63113.0,83255.0,4,5,6
4,15404.0,14421469.0,15337494.0,16776171.0,3842609.0,14288817.0,63113.0,83255.0,4,5,6
...,...,...,...,...,...,...,...,...,...,...,...
277995,15522.0,16384102.0,16529564.0,16776179.0,16513047.0,128986.0,59537.0,78680.0,4,5,6
277996,15522.0,16397812.0,16529640.0,16776184.0,16519450.0,131112.0,59537.0,78680.0,4,5,6
277997,15522.0,16406761.0,16529335.0,16776185.0,16516341.0,130823.0,59537.0,78680.0,4,5,6
277998,15522.0,16392256.0,16529440.0,16776182.0,16518429.0,130300.0,59537.0,78680.0,4,5,6


### Keep only the raw sensor data (first 8 columns) of the dataset
* For each subject there are 11 columns in each DataFrame.
* Columns 1-8 are sensor data, sampled at 250 Hz. Columns 9-11 are the corresponding labels.

In [21]:
# Keep only the eight sensor columns; any other column in the frame is dropped.
sensor_columns = ['Temperature', 'GSR', 'EOG1', 'EOG2', 'EEG1', 'EEG2', 'RED_RAW', 'IR_RAW']
df = df.filter(items=sensor_columns)

In [15]:
# Pull the GSR channel out as its own one-column frame and show its summary statistics.
gsr_copy = df.filter(items=['GSR'])
gsr_summary = gsr_copy.describe()
gsr_summary

Unnamed: 0,GSR
count,278000.0
mean,14266790.0
std,4814708.0
min,0.0
25%,15369550.0
50%,15886330.0
75%,16282920.0
max,16777210.0


In [16]:
# Remove outliers from the GSR column: keep only rows whose absolute z-score is below 3.
# The mask is reduced to one boolean per row with .all(axis=1) so that outlier rows are
# actually dropped. Indexing a DataFrame with a 2-D boolean mask would instead keep every
# row and only replace the outlier values with NaN.
gsr_abs_z = np.abs(stats.zscore(gsr_copy))
gsr_cln_cpy = gsr_copy[(gsr_abs_z < 3).all(axis=1)]
gsr_cln_cpy.count()

GSR    278000
dtype: int64

In [17]:
# Min-max normalization of the GSR column: subtract the column minimum, then divide by
# the column range (max - min).
# NOTE(review): this scales gsr_copy, not the outlier-filtered gsr_cln_cpy -- confirm intent.
gsr_min = gsr_copy.min()
gsr_range = gsr_copy.max() - gsr_min
normalized_gsr = (gsr_copy - gsr_min) / gsr_range
normalized_gsr

Unnamed: 0,GSR
0,0.861653
1,0.854814
2,0.845929
3,0.854637
4,0.859587
...,...
277995,0.976569
277996,0.977386
277997,0.977919
277998,0.977055


In [18]:
# Remove outliers across all columns: a row is kept only when every one of its
# values has an absolute z-score below the threshold of 3.
abs_z_scores = np.abs(stats.zscore(df))
row_is_inlier = (abs_z_scores < 3).all(axis=1)
df_cln_cpy = df[row_is_inlier]
df_cln_cpy.count()

Temperature    277338
GSR            277338
EOG1           277338
EOG2           277338
EEG1           277338
EEG2           277338
RED_RAW        277338
IR_RAW         277338
dtype: int64

In [19]:
# Min-max normalization for all columns in one vectorized step: each column has its
# own minimum subtracted and is then divided by its own range (max - min).
column_min = df.min()
column_range = df.max() - column_min
norm_result = (df - column_min) / column_range
norm_result

Unnamed: 0,Temperature,GSR,EOG1,EOG2,EEG1,EEG2,RED_RAW,IR_RAW
0,0.149171,0.861653,0.914107,0.999968,0.240309,0.853650,0.892609,0.871398
1,0.149171,0.854814,0.914119,0.999968,0.267228,0.852369,0.892609,0.871398
2,0.149171,0.845929,0.914200,0.999970,0.291309,0.849704,0.892609,0.871398
3,0.149171,0.854637,0.914193,0.999970,0.256699,0.851457,0.892609,0.871398
4,0.149171,0.859587,0.914189,0.999968,0.229037,0.851680,0.892609,0.871398
...,...,...,...,...,...,...,...,...
277995,0.801105,0.976569,0.985242,0.999969,0.984254,0.007688,0.115217,0.255319
277996,0.801105,0.977386,0.985247,0.999969,0.984636,0.007815,0.115217,0.255319
277997,0.801105,0.977919,0.985228,0.999969,0.984451,0.007798,0.115217,0.255319
277998,0.801105,0.977055,0.985235,0.999969,0.984575,0.007766,0.115217,0.255319


In [23]:
# Reload the same subject file into a separate frame (rdf) and preview it.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
raw_pickle_path = "./erml_data/170C278A4218594D54F130A88FF85203.pkl"
rdf = pd.read_pickle(raw_pickle_path)
rdf

Unnamed: 0,Temperature,GSR,EOG1,EOG2,EEG1,EEG2,RED_RAW,IR_RAW,Arousal,Dominance,Valence
0,15404.0,14456140.0,15336120.0,16776161.0,4031711.0,14321874.0,63113.0,83255.0,4,5,6
1,15404.0,14341397.0,15336324.0,16776164.0,4483347.0,14300379.0,63113.0,83255.0,4,5,6
2,15404.0,14192337.0,15337673.0,16776194.0,4887349.0,14255666.0,63113.0,83255.0,4,5,6
3,15404.0,14338432.0,15337555.0,16776200.0,4306698.0,14285075.0,63113.0,83255.0,4,5,6
4,15404.0,14421469.0,15337494.0,16776171.0,3842609.0,14288817.0,63113.0,83255.0,4,5,6
...,...,...,...,...,...,...,...,...,...,...,...
277995,15522.0,16384102.0,16529564.0,16776179.0,16513047.0,128986.0,59537.0,78680.0,4,5,6
277996,15522.0,16397812.0,16529640.0,16776184.0,16519450.0,131112.0,59537.0,78680.0,4,5,6
277997,15522.0,16406761.0,16529335.0,16776185.0,16516341.0,130823.0,59537.0,78680.0,4,5,6
277998,15522.0,16392256.0,16529440.0,16776182.0,16518429.0,130300.0,59537.0,78680.0,4,5,6


# Finding outliers in dataset 
### What is an outlier?
An outlier is a data point that is distant from the other observations in a data set, i.e. a data point that lies outside the overall distribution of the dataset.

### What are the criteria to identify an outlier?
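- A data point that lies more than 1.5 times the interquartile range (IQR) above the 3rd quartile or below the 1st quartile, i.e. $x > Q_3 + 1.5\,\mathrm{IQR}$ or $x < Q_1 - 1.5\,\mathrm{IQR}$
- A data point that falls outside of 3 standard deviations from the mean. We can use the z-score for this: a point is flagged when its absolute z-score exceeds the chosen threshold (3 in this notebook; 2 is a stricter alternative)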

### What is the reason for an outlier to exist in a dataset?
An outlier could exist in a dataset due to:
- Variability in the data
- An experimental measurement error

### What is the impact of an outlier?
Outliers can cause serious issues for statistical analysis, such as:
- Skewing the data
- A significant impact on the mean
- A significant impact on the standard deviation

### How can we identify an outlier?
- Using scatter plots
- Using the Z score
- Using the interquartile range (IQR)

In [24]:
# TODO: add the outlier-identification examples described above (scatter plot, z-score, IQR).
# The stray expression that was here referenced an undefined name and raised NameError,
# which stopped the notebook from running top to bottom on a fresh kernel.