In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
import sys
import os
# print(f'{os.path.dirname(os.getcwd())}\Imputers')
sys.path.append(f'{os.path.dirname(os.getcwd())}')
from Imputers.utils import simulate_nan

In [2]:
## PARAMS
n_feat = 128
missing_rate = 0.25
## ===========================

# Load the ten batch files and stack them with a single concat.
# Concatenating inside the loop re-copies the accumulated frame on every pass.
batch_frames = [
    pd.read_csv(f'./data/Gas Sensor Drift Dataset/bat/batch{i}.dat', delimiter=' ', header=None)
    for i in range(1, 11)
]
df = pd.concat(batch_frames, axis=0, ignore_index=True)

# split label: column 0 holds two ';'-separated values, stored as floats
df[['Gas Class', 'Gas Num']] = df[0].str.split(';', expand=True).astype('float')

# clean each feature: keep the part after ':' in every remaining raw column.
# Columns 0 (raw label) and 129 (empty) are skipped here and dropped below.
feature_ids = [col for col in df.columns if col not in ('Gas Class', 'Gas Num', 129, 0)]
for col in feature_ids:
    df[col] = df[col].str.split(':').str.get(1).astype('float')

# drop empty column and rename all features in one pass
# (a per-column rename copies the whole frame each time)
df = df.drop(columns=[0, 129]).rename(columns={col: f'feat_{col}' for col in feature_ids})

# transform label into name
replace_mapping = {
    1.0 : '1',
    2.0 : '2',
    3.0 : '3',
    4.0 : '4',
    5.0 : '5',
    6.0 : '6',
}
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: that chained in-place form does not update `df` when
# pandas Copy-on-Write is active.
df['Gas Class'] = df['Gas Class'].replace(replace_mapping)
display(df)

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,feat_10,...,feat_121,feat_122,feat_123,feat_124,feat_125,feat_126,feat_127,feat_128,Gas Class,Gas Num
0,15596.1621,1.868245,2.371604,2.803678,7.512213,-2.739388,-3.344671,-4.847512,15326.6914,1.768526,...,3037.0390,3.972203,0.527291,0.728443,1.445783,-0.545079,-0.902241,-2.654529,1,10.0
1,26402.0704,2.532401,5.411209,6.509906,7.658469,-4.722217,-5.817651,-7.518333,23855.7812,2.164706,...,4176.4453,4.281373,0.980205,1.628050,1.951172,-0.889333,-1.323505,-1.749225,1,20.0
2,42103.5820,3.454189,8.198175,10.508439,11.611003,-7.668313,-9.478675,-12.230939,37562.3008,2.840403,...,5914.6685,5.396827,1.403973,2.476956,3.039841,-1.334558,-1.993659,-2.348370,1,30.0
3,42825.9883,3.451192,12.113940,16.266853,39.910056,-7.849409,-9.689894,-11.921704,38379.0664,2.851173,...,6147.4744,5.501071,1.981933,3.569823,4.049197,-1.432205,-2.146158,-2.488957,1,40.0
4,58151.1757,4.194839,11.455096,15.715298,17.654915,-11.083364,-13.580692,-16.407848,51975.5899,3.480866,...,8158.6449,7.174334,1.993808,3.829303,4.402448,-1.930107,-2.931265,-4.088756,1,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13905,28929.5108,3.309200,4.393300,5.057000,9.423300,-3.879400,-5.415100,-15.933500,42203.8965,3.377800,...,5480.6582,2.851000,1.046500,2.039900,5.341900,-0.842830,-1.407700,-4.447900,5,50.0
13906,40392.1230,3.280000,10.268900,13.585900,20.979000,-8.173200,-10.859100,-31.802700,40322.3164,2.594000,...,15700.0969,5.216200,5.167500,8.995000,11.591700,-4.495400,-7.655300,-12.767100,4,250.0
13907,183271.5898,15.594500,45.480300,63.877500,81.139600,-29.539100,-47.626600,-329.266900,229854.3418,14.519400,...,50936.2070,17.808000,17.288700,26.789500,33.366100,-15.823700,-28.516200,-56.332800,5,600.0
13908,29059.8516,2.568000,6.714200,8.786500,13.987800,-6.041100,-8.070700,-22.322300,28441.1289,2.069400,...,10079.5098,3.539900,3.266000,6.249900,9.026200,-2.849500,-4.734300,-9.573100,4,150.0


In [3]:
# Sanity check: number of rows available for each gas class label.
gas_class_counts = df['Gas Class'].value_counts()
print(gas_class_counts)


Gas Class
5    3009
2    2926
1    2565
4    1936
6    1833
3    1641
Name: count, dtype: int64


In [4]:
# Sanity check: total number of NaN cells over the whole frame
# (the raw data is expected to contain none).
nan_per_column = df.isna().sum()
print(nan_per_column.sum())

0


In [5]:
## feature selection
# Candidate features are the sensor columns only. Both 'Gas Class' and
# 'Gas Num' come from the label field, so neither may compete for a slot;
# otherwise 'Gas Num' can push a real sensor feature out of the top n_feat.
X = df.drop(columns=['Gas Class', 'Gas Num'])
Y = df['Gas Class']

# k='all' keeps every column; only the per-feature ANOVA F-scores are used.
fit = SelectKBest(score_func=f_classif, k='all').fit(X, Y)

## generate board
featureScores = pd.DataFrame({'Feature': X.columns.to_numpy(), 'Score': fit.scores_})

# Rank once and reuse the result for both the display and the selection.
top_features = featureScores.nlargest(n_feat, 'Score')
display(top_features)  # the n_feat highest-scoring features, best first

selected_cols = top_features['Feature'].values

Unnamed: 0,Feature,Score
8,feat_9,3415.946684
64,feat_65,2702.009691
10,feat_11,2598.253410
11,feat_12,2320.797363
66,feat_67,2199.617149
...,...,...
31,feat_32,24.852618
39,feat_40,14.807708
28,feat_29,14.328107
49,feat_50,2.841583


In [6]:
# Restrict the data to the selected feature columns and show the result.
X = df[selected_cols]
display(X)

# Fixed-seed 85/15 split of the complete (no missing values) data.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.15, random_state=1105)

# Write every split to its own CSV file, without the index column.
complete_outputs = (
    (X_train, './data/Gas Sensor Drift Dataset/complete/X_train.csv'),
    (X_test, './data/Gas Sensor Drift Dataset/complete/X_test.csv'),
    (y_train, './data/Gas Sensor Drift Dataset/complete/y_train.csv'),
    (y_test, './data/Gas Sensor Drift Dataset/complete/y_test.csv'),
)
for split_frame, csv_path in complete_outputs:
    split_frame.to_csv(csv_path, index=False)


Unnamed: 0,feat_9,feat_65,feat_11,feat_12,feat_67,feat_14,feat_68,feat_73,feat_69,feat_1,...,feat_21,feat_104,feat_112,feat_34,feat_48,feat_32,feat_40,feat_29,feat_50,feat_58
0,15326.6914,13540.6738,2.269085,2.713374,2.006883,-2.488324,2.519022,13831.7539,6.261430,15596.1621,...,1.795029,-1.058061,-0.969336,1.682904,-0.791447,-1.728611,-0.781959,1.786704,3.605537,3.555169
1,23855.7812,20553.5645,4.901063,5.971392,4.266941,-4.199424,5.185937,22540.1933,6.398693,26402.0704,...,1.570895,-1.191404,-0.838039,1.933877,-0.718214,-1.334217,-0.748061,1.428631,3.990162,4.011519
2,37562.3008,32366.9336,7.386357,9.511703,6.419288,-6.689464,8.290210,35597.0020,9.709606,42103.5820,...,2.050157,-1.129027,-1.126488,2.316449,-1.008518,-1.844021,-1.011991,1.964727,4.998625,4.997810
3,38379.0664,33300.6055,10.840889,14.566782,9.397436,-6.878915,13.022406,36487.1211,29.111833,42825.9883,...,2.889819,-1.249903,-1.134804,2.415714,-1.172276,-2.027846,-1.005683,2.753598,5.118540,5.138118
4,51975.5899,45212.6523,10.409176,14.379885,9.083462,-9.543570,12.575800,49670.9785,13.600584,58151.1757,...,3.389052,-1.770461,-1.602759,2.893531,-1.270385,-2.424730,-1.628245,2.981684,6.673105,6.689090
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13905,42203.8965,34124.2725,7.053800,9.632700,5.430900,-5.477300,6.965900,26287.4082,16.664800,28929.5108,...,3.597800,-3.520200,-3.138500,2.014500,-2.195100,-3.319400,-2.324400,3.377100,2.806200,2.771000
13906,40322.3164,43566.1407,11.091500,15.794300,11.933400,-8.960700,16.645200,34525.7149,32.584500,40392.1230,...,8.281100,-5.525000,-4.034500,2.879700,-3.258100,-8.134100,-3.295800,7.515900,5.056700,5.041800
13907,229854.3418,190859.5137,58.210700,83.843400,51.193800,-37.116000,75.222800,157875.0557,105.849600,183271.5898,...,18.090300,-8.716800,-8.157300,6.470100,-6.238800,-28.025400,-7.187800,18.515400,16.062900,15.720900
13908,28441.1289,31528.4550,7.502900,10.926800,8.090300,-6.692200,11.084200,24548.6875,23.492700,29059.8516,...,5.354100,-4.595000,-4.397600,2.297100,-2.459200,-5.674600,-3.389300,5.524400,3.514800,3.506400


In [7]:
# Inject missing values into the selected features at the configured rate.
# NOTE(review): simulate_nan is a project helper not shown here; whether its
# masking is seeded (and therefore reproducible) should be confirmed.
missing_X = simulate_nan(X, missing_rate)['X']
display(missing_X)

# Name the output folder after the configured rate (0.25 -> 'miss_25') so that
# changing missing_rate does not silently overwrite the 25% dataset.
miss_dir = f'./data/Gas Sensor Drift Dataset/miss_{round(missing_rate * 100)}'
os.makedirs(miss_dir, exist_ok=True)

# Same seed and test size as the complete-data split.
X_train, X_test, y_train, y_test = train_test_split(missing_X, Y, test_size=0.15, random_state=1105)
X_train.to_csv(f'{miss_dir}/X_train.csv', index=False)
X_test.to_csv(f'{miss_dir}/X_test.csv', index=False)
y_train.to_csv(f'{miss_dir}/y_train.csv', index=False)
y_test.to_csv(f'{miss_dir}/y_test.csv', index=False)

Unnamed: 0,feat_9,feat_65,feat_11,feat_12,feat_67,feat_14,feat_68,feat_73,feat_69,feat_1,...,feat_21,feat_104,feat_112,feat_34,feat_48,feat_32,feat_40,feat_29,feat_50,feat_58
0,15326.6914,,2.269085,2.713374,2.006883,-2.488324,,13831.7539,6.261430,15596.1621,...,1.795029,-1.058061,,1.682904,-0.791447,-1.728611,-0.781959,1.786704,3.605537,
1,23855.7812,,4.901063,5.971392,4.266941,-4.199424,5.185937,22540.1933,6.398693,26402.0704,...,1.570895,-1.191404,-0.838039,1.933877,-0.718214,-1.334217,-0.748061,,,
2,,32366.9336,,,6.419288,-6.689464,,35597.0020,9.709606,42103.5820,...,2.050157,-1.129027,-1.126488,2.316449,-1.008518,-1.844021,-1.011991,1.964727,,4.997810
3,,33300.6055,,14.566782,,,13.022406,36487.1211,29.111833,,...,2.889819,,-1.134804,2.415714,-1.172276,-2.027846,,2.753598,5.118540,5.138118
4,,45212.6523,10.409176,14.379885,9.083462,,12.575800,49670.9785,13.600584,58151.1757,...,3.389052,-1.770461,-1.602759,2.893531,,,-1.628245,2.981684,6.673105,6.689090
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13905,42203.8965,34124.2725,7.053800,9.632700,5.430900,,6.965900,26287.4082,16.664800,28929.5108,...,3.597800,,-3.138500,2.014500,,-3.319400,,3.377100,,2.771000
13906,40322.3164,43566.1407,11.091500,15.794300,11.933400,-8.960700,16.645200,34525.7149,,40392.1230,...,,-5.525000,-4.034500,2.879700,-3.258100,,-3.295800,7.515900,5.056700,5.041800
13907,229854.3418,,58.210700,83.843400,51.193800,-37.116000,75.222800,,105.849600,183271.5898,...,,-8.716800,-8.157300,6.470100,-6.238800,-28.025400,-7.187800,,16.062900,15.720900
13908,28441.1289,,7.502900,,,,11.084200,,23.492700,29059.8516,...,5.354100,-4.595000,,2.297100,-2.459200,-5.674600,-3.389300,5.524400,3.514800,3.506400


In [9]:
# Collect rows that exactly repeat an earlier row; an empty frame means
# the dataset has no duplicates.
duplicate_mask = df.duplicated()
df2 = df.loc[duplicate_mask]
print(df2)

Empty DataFrame
Columns: [feat_1, feat_2, feat_3, feat_4, feat_5, feat_6, feat_7, feat_8, feat_9, feat_10, feat_11, feat_12, feat_13, feat_14, feat_15, feat_16, feat_17, feat_18, feat_19, feat_20, feat_21, feat_22, feat_23, feat_24, feat_25, feat_26, feat_27, feat_28, feat_29, feat_30, feat_31, feat_32, feat_33, feat_34, feat_35, feat_36, feat_37, feat_38, feat_39, feat_40, feat_41, feat_42, feat_43, feat_44, feat_45, feat_46, feat_47, feat_48, feat_49, feat_50, feat_51, feat_52, feat_53, feat_54, feat_55, feat_56, feat_57, feat_58, feat_59, feat_60, feat_61, feat_62, feat_63, feat_64, feat_65, feat_66, feat_67, feat_68, feat_69, feat_70, feat_71, feat_72, feat_73, feat_74, feat_75, feat_76, feat_77, feat_78, feat_79, feat_80, feat_81, feat_82, feat_83, feat_84, feat_85, feat_86, feat_87, feat_88, feat_89, feat_90, feat_91, feat_92, feat_93, feat_94, feat_95, feat_96, feat_97, feat_98, feat_99, feat_100, ...]
Index: []

[0 rows x 130 columns]
