In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [14]:
# Electricity: integer-encode the target, min-max scale the features, save.
elec = pd.read_csv('./data/raw/electricity-normalized.csv')

# Replace the 'class' labels with the integer codes returned by pd.factorize.
elec_codes = pd.factorize(elec['class'])[0]
elec['class'] = elec_codes

# Scale every column except the last one with a default MinMaxScaler.
# NOTE(review): this writes floats in place via .iloc; if any feature column
# is integer-typed, recent pandas flags that as an incompatible-dtype upcast
# -- confirm the column dtypes.
elec_features = elec.iloc[:, :-1]
elec.iloc[:, :-1] = MinMaxScaler().fit_transform(elec_features)

# index=None: the row index is not written to the file.
elec.to_csv('./data/electricity.csv', index=None)

In [34]:
# Airlines: encode the categorical columns, min-max scale the features, save.
air = pd.read_csv('./data/raw/airlines.csv')

# Airline names -> integer codes (scaled together with the numeric columns below).
air['Airline'] = pd.factorize(air['Airline'])[0]

# Origin and destination airports are stacked into one array before encoding,
# so both columns share a single code mapping and a single scaling.
n_rows = air.shape[0]
airports = np.concatenate([air['AirportFrom'].to_numpy(), air['AirportTo'].to_numpy()])
airports = pd.factorize(airports)[0]
airports = MinMaxScaler().fit_transform(airports.reshape(-1, 1))  # shape (2 * n_rows, 1)

# First half of the stacked result belongs to AirportFrom, second half to AirportTo.
air['AirportFrom'] = airports[:n_rows]
air['AirportTo'] = airports[n_rows:]

# Scale the remaining feature columns, each to its own min/max.
scaled_cols = ['Airline', 'Flight', 'DayOfWeek', 'Time', 'Length']
air[scaled_cols] = MinMaxScaler().fit_transform(air[scaled_cols])

# index=None: the row index is not written to the file.
air.to_csv('./data/airlines.csv', index=None)

In [57]:
# Bank Marketing: integer-encode the categorical columns, min-max scale the
# features, save.
bank = pd.read_csv('./data/raw/bank_marketing.csv')

# Columns listed here are treated as numeric; every other column (including
# the last one, which is excluded from scaling below) is factorized.
num_attr = ['V1','V6','V10','V12','V13','V14','V15']
cat = bank.drop(num_attr, axis=1)
for i in cat.columns:  # factorize categorical variables
    cat[i] = pd.factorize(cat[i])[0]
bank[cat.columns] = cat

# Scale every column except the last. The factorized columns hold integer
# codes, so the scaled floats are assigned by column label (replacing the
# columns outright) rather than written in place through .iloc, which pandas
# treats as an incompatible-dtype upcast on integer columns.
feature_cols = list(bank.columns[:-1])
bank[feature_cols] = MinMaxScaler().fit_transform(bank[feature_cols])

# index=False: do not write the row index to the file.
bank.to_csv('./data/bank_marketing.csv', index=False)


In [85]:
# IQ: order rows by task, cast the target to int, min-max scale, save.
iq = (
    pd.read_csv('./data/raw/iq_data.csv')
    .sort_values(by='task_id')      # sort by task
    .astype({'cft_task': int})      # make target int
)

# Scale every column except the last one.
# NOTE(review): assumes 'cft_task' is the last column, so the target is left
# unscaled -- confirm against the raw file. Writing floats in place via .iloc
# also upcasts any integer-typed feature column; confirm dtypes.
iq_features = iq.iloc[:, :-1]
iq.iloc[:, :-1] = MinMaxScaler().fit_transform(iq_features)

# index=None: the (now permuted) row index is not written to the file.
iq.to_csv('./data/iq.csv', index=None)

(15762, 77)

In [98]:
# Poker Hand: shift class ids to start at 0, min-max scale the features, save.
poker = pd.read_csv('./data/raw/poker_hand.csv')

# Subtract 1 from every class id so that classes begin with index 0.
poker['Class'] = poker['Class'].sub(1)

# Scale every column except the last one.
# NOTE(review): in-place .iloc write; integer-typed feature columns would be
# upcast to float here -- confirm dtypes.
poker_features = poker.iloc[:, :-1]
poker.iloc[:, :-1] = MinMaxScaler().fit_transform(poker_features)

# index=None: the row index is not written to the file.
poker.to_csv('./data/poker.csv', index=None)

In [15]:
# KDD: integer-encode the string columns, min-max scale the features,
# shuffle the rows, save.
kdd = pd.read_csv('./data/raw/KDDCup99.csv')

# Every object-dtype column is replaced by its integer factorize codes.
cat_attr = kdd.select_dtypes(include=['object']).columns
for i in cat_attr:
    kdd[i] = pd.factorize(kdd[i])[0]  # factorize categorical variables

# Scale every column except the last. The factorized columns hold integer
# codes, so the scaled floats are assigned by column label (replacing the
# columns outright) rather than written in place through .iloc, which pandas
# treats as an incompatible-dtype upcast on integer columns.
feature_cols = list(kdd.columns[:-1])
kdd[feature_cols] = MinMaxScaler().fit_transform(kdd[feature_cols])

# Shuffle all rows. The seed is fixed so that re-running the notebook writes
# the same row order every time; without it each run produced a different file.
kdd = kdd.sample(frac=1, random_state=42)

# index=False: do not write the (shuffled) row index to the file.
kdd.to_csv('./data/kdd.csv', index=False)

In [25]:
# Covertype: shift class ids to start at 0, min-max scale the features, save.
cover = pd.read_csv('./data/raw/covertype.csv')

# Subtract 1 from every class id so that classes begin with index 0.
cover = cover.assign(**{'class': cover['class'] - 1})

# Scale every column except the last one.
# NOTE(review): in-place .iloc write; integer-typed feature columns would be
# upcast to float here -- confirm dtypes.
cover_scaled = MinMaxScaler().fit_transform(cover.iloc[:, :-1])
cover.iloc[:, :-1] = cover_scaled

# index=None: the row index is not written to the file.
cover.to_csv('./data/covertype.csv', index=None)

In [15]:
# Gas Sensor Drift: shift class ids to start at 0, min-max scale, save.
gas = pd.read_csv('./data/raw/gas_sensor_drift.csv')

# Subtract 1 from every class id so that classes begin with index 0.
gas_class_zero_based = gas['Class'] - 1
gas['Class'] = gas_class_zero_based

# Scale every column except the last one.
# NOTE(review): in-place .iloc write; integer-typed feature columns would be
# upcast to float here -- confirm dtypes.
gas_scaler = MinMaxScaler()
gas.iloc[:, :-1] = gas_scaler.fit_transform(gas.iloc[:, :-1])

# index=None: the row index is not written to the file.
gas.to_csv('./data/gas.csv', index=None)

In [33]:
# Insects Abrupt (Unbalanced): integer-encode the label column and save.
# The file is ';'-separated and read with header=None, so the columns get
# integer names (which to_csv then writes out as the header row).
insects_abrupt = pd.read_csv('./data/raw/insects_abrupt_imbalanced.csv', sep=';', header=None)

# Last column: replace the labels with their integer factorize codes.
# NOTE(review): unlike the other dataset cells, no MinMaxScaler step is
# applied here -- confirm that is intentional.
abrupt_labels = insects_abrupt.iloc[:, -1]
insects_abrupt.iloc[:, -1] = pd.factorize(abrupt_labels)[0]

# index=None: the row index is not written to the file.
insects_abrupt.to_csv('./data/insects_abrupt.csv', index=None)

In [8]:
# Inspect the class balance of the incremental insects data: load the raw
# ';'-separated, header-less file and count each label in the last column.
insects_inc = pd.read_csv('./data/raw/insects_incremental_imbalanced.csv', sep=';', header=None)
insects_inc[insects_inc.columns[-1]].value_counts()

cx-quinq-male           134717
ae-aegypti-female       125354
ae-aegypti-male          83794
cx-quinq-female          64895
ae-albopictus-female     29953
ae-albopictus-male       13331
Name: 33, dtype: int64

In [34]:
# Insects Incremental (Unbalanced): integer-encode the label column and save.
# The file is ';'-separated and read with header=None, so the columns get
# integer names (which to_csv then writes out as the header row).
insects_inc = pd.read_csv('./data/raw/insects_incremental_imbalanced.csv', sep=';', header=None)

# Last column: replace the labels with their integer factorize codes.
# NOTE(review): unlike the other dataset cells, no MinMaxScaler step is
# applied here -- confirm that is intentional.
inc_labels = insects_inc.iloc[:, -1]
insects_inc.iloc[:, -1] = pd.factorize(inc_labels)[0]

# index=None: the row index is not written to the file.
insects_inc.to_csv('./data/insects_inc.csv', index=None)