In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
import sys
import os
# print(f'{os.path.dirname(os.getcwd())}\Imputers')
sys.path.append(f'{os.path.dirname(os.getcwd())}')
from Imputers.utils import simulate_nan

In [10]:
## PARAMS
# n_feat: how many top-scoring features the (currently commented-out)
#         SelectKBest ranking further down would keep.
# missing_rate: value passed to simulate_nan() further down; it also names
#               the output folder (miss_<rate*100>).
n_feat = 128
missing_rate = 0.75
## ===========================

row_count = 0  # not read by any visible cell; kept so the notebook namespace is unchanged

# Load batch1..batch10 and concatenate them in a single call. Concatenating
# inside the loop re-copies the growing frame on every iteration.
batch_frames = [
    pd.read_csv(f'./data/Gas Sensor Drift Dataset/bat/batch{batch_id}.dat', delimiter=' ', header=None)
    for batch_id in range(1, 11)
]
df = pd.concat(batch_frames, axis=0, ignore_index=True)

# split label
# Column 0 holds two ';'-separated values; both are parsed as floats.
df[['Gas Class', 'Gas Num']] = df[0].str.split(';', expand=True).astype('float')

# clean each feature
# Every remaining raw column holds "<index>:<value>" strings; keep the value part.
# Columns 0 (raw label) and 129 (dropped below as empty) are skipped.
feature_ids = [col for col in df.columns if col not in ['Gas Class', 'Gas Num', 129, 0]]
for col in feature_ids:
    df[col] = df[col].str.split(':').str.get(1).astype('float')
# Rename all feature columns at once instead of copying the frame per column.
df = df.rename(columns={col: f'feat_{col}' for col in feature_ids})

# drop empty column
df = df.drop(columns=[0, 129])

# transform label into name
replace_mapping = {
    1.0 : '1',
    2.0 : '2',
    3.0 : '3',
    4.0 : '4',
    5.0 : '5',
    6.0 : '6',
}
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: that chained in-place form triggers a warning on recent
# pandas and does not update `df` when Copy-on-Write is enabled.
df['Gas Class'] = df['Gas Class'].replace(replace_mapping)
display(df)

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,feat_10,...,feat_121,feat_122,feat_123,feat_124,feat_125,feat_126,feat_127,feat_128,Gas Class,Gas Num
0,15596.1621,1.868245,2.371604,2.803678,7.512213,-2.739388,-3.344671,-4.847512,15326.6914,1.768526,...,3037.0390,3.972203,0.527291,0.728443,1.445783,-0.545079,-0.902241,-2.654529,1,10.0
1,26402.0704,2.532401,5.411209,6.509906,7.658469,-4.722217,-5.817651,-7.518333,23855.7812,2.164706,...,4176.4453,4.281373,0.980205,1.628050,1.951172,-0.889333,-1.323505,-1.749225,1,20.0
2,42103.5820,3.454189,8.198175,10.508439,11.611003,-7.668313,-9.478675,-12.230939,37562.3008,2.840403,...,5914.6685,5.396827,1.403973,2.476956,3.039841,-1.334558,-1.993659,-2.348370,1,30.0
3,42825.9883,3.451192,12.113940,16.266853,39.910056,-7.849409,-9.689894,-11.921704,38379.0664,2.851173,...,6147.4744,5.501071,1.981933,3.569823,4.049197,-1.432205,-2.146158,-2.488957,1,40.0
4,58151.1757,4.194839,11.455096,15.715298,17.654915,-11.083364,-13.580692,-16.407848,51975.5899,3.480866,...,8158.6449,7.174334,1.993808,3.829303,4.402448,-1.930107,-2.931265,-4.088756,1,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13905,28929.5108,3.309200,4.393300,5.057000,9.423300,-3.879400,-5.415100,-15.933500,42203.8965,3.377800,...,5480.6582,2.851000,1.046500,2.039900,5.341900,-0.842830,-1.407700,-4.447900,5,50.0
13906,40392.1230,3.280000,10.268900,13.585900,20.979000,-8.173200,-10.859100,-31.802700,40322.3164,2.594000,...,15700.0969,5.216200,5.167500,8.995000,11.591700,-4.495400,-7.655300,-12.767100,4,250.0
13907,183271.5898,15.594500,45.480300,63.877500,81.139600,-29.539100,-47.626600,-329.266900,229854.3418,14.519400,...,50936.2070,17.808000,17.288700,26.789500,33.366100,-15.823700,-28.516200,-56.332800,5,600.0
13908,29059.8516,2.568000,6.714200,8.786500,13.987800,-6.041100,-8.070700,-22.322300,28441.1289,2.069400,...,10079.5098,3.539900,3.266000,6.249900,9.026200,-2.849500,-4.734300,-9.573100,4,150.0


In [11]:
# Sanity check: number of samples per gas class, to compare against the
# counts expected for the dataset.
class_counts = df['Gas Class'].value_counts()
print(class_counts)


Gas Class
5    3009
2    2926
1    2565
4    1936
6    1833
3    1641
Name: count, dtype: int64


In [12]:
# Disabled exploratory checks, kept for reference:
# grouped_df = df[['Gas Class','Gas Num']].groupby('Gas Class')
# display(grouped_df.max())
# display(grouped_df.min())
# display(df[df['Gas Num']==600][df['Gas Class']==1])
# display(df['Gas Num'][650:700])

# Total count of NaN cells over the whole frame; the cleaned data should
# contain none, so this is expected to print 0.
nan_per_column = df.isna().sum()
print(nan_per_column.sum())

0


In [13]:
## feature selection
# Target: the gas class label. Features: every other column of df.
# NOTE(review): 'Gas Num' stays in X, so it is treated as a feature and is
# subject to the missing-value masking applied later - confirm this is intended.
Y = df['Gas Class']
X = df.drop(columns='Gas Class')

# Disabled: univariate ANOVA F-score ranking used to pick the top n_feat features.
# fit = SelectKBest(score_func=f_classif, k='all').fit(X,Y)
# dfscores = pd.DataFrame(fit.scores_)
# dfcolumns = pd.DataFrame(X.columns)

# ## generate board 
# featureScores = pd.concat([dfcolumns,dfscores],axis=1)
# featureScores.columns = ['Feature','Score']  #title
# display(featureScores.nlargest(n_feat,'Score'))  #see the highest 30 and select the best combination

# selected_cols = featureScores.nlargest(n_feat,'Score').reset_index(drop=True)['Feature'].values

In [14]:
# X = df[selected_cols]
# display(X)
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.15, random_state=1105)
# X_train.to_csv('./data/Gas Sensor Drift Dataset/complete/X_train.csv', index=False)
# X_test.to_csv('./data/Gas Sensor Drift Dataset/complete/X_test.csv', index=False)
# y_train.to_csv('./data/Gas Sensor Drift Dataset/complete/y_train.csv', index=False)
# y_test.to_csv('./data/Gas Sensor Drift Dataset/complete/y_test.csv', index=False)


In [15]:
# Mask entries of X via the project helper simulate_nan and keep the 'X'
# entry of what it returns.
# NOTE(review): no seed is set before this call; if simulate_nan draws from
# numpy's global RNG the masked cells will differ on every run - confirm and
# seed if reproducible datasets are required.
missing_X = simulate_nan(X, missing_rate)['X']
display(missing_X)

# 85/15 train/test split with a fixed seed so the row partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(missing_X, Y, test_size=0.15, random_state=1105)

# Build the output folder name once. round() is used instead of int() because
# int() truncates floating-point products (e.g. int(0.29*100) == 28), which
# would label the folder with the wrong percentage.
out_dir = f'./data/Gas Sensor Drift Dataset/miss_{round(missing_rate*100)}'
# to_csv does not create missing directories; make sure the target exists.
os.makedirs(out_dir, exist_ok=True)

splits = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}
for split_name, split_data in splits.items():
    split_data.to_csv(f'{out_dir}/{split_name}.csv', index=False)

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,feat_10,...,feat_120,feat_121,feat_122,feat_123,feat_124,feat_125,feat_126,feat_127,feat_128,Gas Num
0,,,,2.803678,7.512213,,-3.344671,,,,...,,,3.972203,,,,-0.545079,,,
1,,,5.411209,,,,,,,2.164706,...,-1.994993,4176.4453,,,1.62805,,-0.889333,,,20.0
2,,,8.198175,,,,-9.478675,-12.230939,37562.3008,,...,-2.867291,,,,,,-1.334558,,,
3,42825.9883,3.451192,,,,-7.849409,,,,,...,-3.058086,6147.4744,,,,,-1.432205,,-2.488957,
4,58151.1757,,,,17.654915,-11.083364,,,,,...,-4.181920,,,,,4.402448,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13905,,,,,9.423300,,,,42203.8965,,...,,,2.851000,,,,,-1.4077,-4.447900,50.0
13906,,,,13.585900,,,,,40322.3164,,...,,,5.216200,,8.99500,,-4.495400,-7.6553,,
13907,183271.5898,,,,,,,,,,...,,,,,,,,,-56.332800,600.0
13908,,,6.714200,8.786500,13.987800,-6.041100,,,,,...,-10.409100,,,3.266,,,-2.849500,,,150.0


In [16]:
# List rows that are exact duplicates of an earlier row; an empty frame
# means there are no duplicated samples.
is_duplicate = df.duplicated()
df2 = df[is_duplicate]
print(df2)

Empty DataFrame
Columns: [feat_1, feat_2, feat_3, feat_4, feat_5, feat_6, feat_7, feat_8, feat_9, feat_10, feat_11, feat_12, feat_13, feat_14, feat_15, feat_16, feat_17, feat_18, feat_19, feat_20, feat_21, feat_22, feat_23, feat_24, feat_25, feat_26, feat_27, feat_28, feat_29, feat_30, feat_31, feat_32, feat_33, feat_34, feat_35, feat_36, feat_37, feat_38, feat_39, feat_40, feat_41, feat_42, feat_43, feat_44, feat_45, feat_46, feat_47, feat_48, feat_49, feat_50, feat_51, feat_52, feat_53, feat_54, feat_55, feat_56, feat_57, feat_58, feat_59, feat_60, feat_61, feat_62, feat_63, feat_64, feat_65, feat_66, feat_67, feat_68, feat_69, feat_70, feat_71, feat_72, feat_73, feat_74, feat_75, feat_76, feat_77, feat_78, feat_79, feat_80, feat_81, feat_82, feat_83, feat_84, feat_85, feat_86, feat_87, feat_88, feat_89, feat_90, feat_91, feat_92, feat_93, feat_94, feat_95, feat_96, feat_97, feat_98, feat_99, feat_100, ...]
Index: []

[0 rows x 130 columns]
