## Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import openml
from scipy.io import arff
from io import StringIO

In [2]:
def correlated_columns(X, index=0.8):
    """Return the columns of ``X`` that are strongly correlated with an earlier column.

    The absolute pairwise correlation matrix of ``X`` is computed and only the
    entries strictly above the diagonal are kept, so each pair of columns is
    examined once and a column is only ever compared with the columns that
    precede it.

    Parameters
    ----------
    X : pandas.DataFrame
        Data whose columns are compared via ``X.corr()``.
    index : float, default 0.8
        Threshold on the absolute correlation; a column is reported when its
        absolute correlation with any preceding column is strictly greater
        than this value.

    Returns
    -------
    list
        Column labels to drop, in the column order of the correlation matrix.
    """
    abs_corr = X.corr().abs()

    # Boolean mask selecting the entries strictly above the main diagonal.
    above_diagonal = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
    upper = abs_corr.where(above_diagonal)

    to_drop = []
    for column in upper.columns:
        # Masked-out entries are NaN and compare False, so they never trigger a drop.
        if any(upper[column] > index):
            to_drop.append(column)
    return to_drop

### League of Legends Diamond Games (First 15 Minutes) - large
https://www.kaggle.com/datasets/benfattori/league-of-legends-diamond-games-first-15-minutes

In [3]:
# Load the raw match timelines, discard the listed columns and remove
# duplicated rows in a single chained expression.
lol_data = (
    pd.read_csv('data_raw/MatchTimelinesFirst15.csv', index_col=0)
    .drop(columns=['matchId', 'blueDragonKills', 'redDragonKills'])
    .drop_duplicates()
)

# Split features from the 'blue_win' label so the label is not part of the
# correlation filter.
X = lol_data.drop(columns=['blue_win'])
y = lol_data['blue_win']

# Remove features that are highly correlated with an earlier feature, then
# put the label back as the last column.
X = X.drop(columns=correlated_columns(X))
lol_data = pd.concat([X, y], axis=1)

In [4]:
# Persist the cleaned frame without writing the row index.
lol_data.to_csv(path_or_buf='data_clean/lol_data.csv', index=False)

### Heart Attack Analysis & Prediction Dataset - large
https://www.kaggle.com/datasets/rashikrahmanpritom/heart-attack-analysis-prediction-dataset

In [5]:
# Read the raw heart data and separate the features from the 'output' label.
heart_data = pd.read_csv('data_raw/heart.csv')
X = heart_data.drop(columns=['output'])
y = heart_data['output']

# Filter highly correlated features, then re-attach the label as the last column.
X = X.drop(columns=correlated_columns(X))
heart_data = pd.concat([X, y], axis=1)

In [6]:
# Persist the cleaned frame without writing the row index.
heart_data.to_csv(path_or_buf='data_clean/heart_data.csv', index=False)

### Pollen - small
https://www.openml.org/search?type=data&sort=version&status=any&order=asc&exact_name=pollen&id=871

In [7]:
# Fetch OpenML dataset 871 and split it into features and its default target.
pollen = openml.datasets.get_dataset(871)
X, y, _, _ = pollen.get_data(target=pollen.default_target_attribute)

# Binarise the label: 'P' becomes 1, every other value becomes 0.
y = np.where(y == 'P', 1, 0)

# Filter highly correlated features, then append the label as 'target'.
pollen_data = pd.DataFrame(X)
pollen_data = pollen_data.drop(columns=correlated_columns(pollen_data))
pollen_data['target'] = y

  pollen = openml.datasets.get_dataset(871)


In [8]:
# Persist the cleaned frame without writing the row index.
pollen_data.to_csv(path_or_buf='data_clean/pollen_data.csv', index=False)

### Phishing websites - large
[link to data](https://www.openml.org/search?type=data&sort=runs&status=active&qualities.NumberOfClasses=%3D_2&qualities.NumberOfFeatures=between_10_100&id=4534)

In [9]:
# Load the phishing-websites ARFF file; loadarff returns (records, metadata).
phishing_arff = arff.loadarff('data_raw/phishing_websites.arff')
phishing = pd.DataFrame(phishing_arff[0])

# Every column is read as byte strings; decode each value and cast it to int.
for col in phishing:
    phishing[col] = phishing[col].apply(lambda x: int(x.decode()))

# Recode the label: -1 becomes 0, any other value becomes 1.
phishing['Result'] = phishing['Result'].apply(lambda x: 0 if x==-1 else 1)

# Run the correlation filter on the features only. If the label took part,
# a feature strongly correlated with it could cause the label itself (or a
# useful feature) to be dropped, which is inconsistent with the other datasets.
phishing_features = phishing.drop(columns=['Result'])
phishing = phishing.drop(correlated_columns(phishing_features), axis=1)
phishing.to_csv('data_clean/phishing_websites.csv', index=False)

### Climate model simulation - large
[link to data](https://www.openml.org/search?type=data&sort=runs&status=active&qualities.NumberOfClasses=%3D_2&qualities.NumberOfFeatures=between_10_100&id=40994)

In [10]:
# Load the climate-model ARFF file; loadarff returns (records, metadata).
climate_model_arff = arff.loadarff('data_raw/climate_model_simulation.arff')
climate_model = pd.DataFrame(climate_model_arff[0])

# The label is read as byte strings; decode each value and cast it to int.
climate_model['outcome'] = climate_model['outcome'].apply(lambda x: int(x.decode()))

# Run the correlation filter on the features only, so the 'outcome' label can
# never be removed (or cause a feature to be removed) by the filter.
climate_model_features = climate_model.drop(columns=['outcome'])
climate_model = climate_model.drop(correlated_columns(climate_model_features), axis=1)

# index=False matches the other cleaned datasets; without it the row index is
# written as an extra unnamed column.
climate_model.to_csv('data_clean/climate_model_simulation.csv', index=False)

### Banknote authentication - small
[link to data](https://www.openml.org/search?type=data&sort=runs&status=active&qualities.NumberOfClasses=%3D_2&qualities.NumberOfFeatures=lte_10&qualities.NumberOfInstances=between_1000_10000&id=1462)

In [11]:
# Read the ARFF file as UTF-8 text and hand it to loadarff through a StringIO
# buffer; loadarff returns (records, metadata).
banknote_file_path = 'data_raw/banknote_authentication.arff'
with open(banknote_file_path, 'r', encoding='utf-8') as f:
    arff_data = f.read()
banknote_arff = arff.loadarff(StringIO(arff_data))
banknote = pd.DataFrame(banknote_arff[0])

# The label is read as byte strings; decode, cast to int and shift down by one
# (presumably mapping classes 1/2 to 0/1 -- confirm against the raw file).
banknote['Class'] = banknote['Class'].apply(lambda x: int(x.decode())-1)

# Report the column count before and after the correlation filter.
print(banknote.shape[1])
# Run the filter on the features only, so the 'Class' label can never be
# removed (or cause a feature to be removed) by the filter.
banknote_features = banknote.drop(columns=['Class'])
banknote = banknote.drop(correlated_columns(banknote_features), axis=1)
print(banknote.shape[1])

# index=False matches the other cleaned datasets; without it the row index is
# written as an extra unnamed column.
banknote.to_csv('data_clean/banknote_authentication.csv', index=False)

5
5


### Spambase - large
https://archive.ics.uci.edu/dataset/94/spambase

In [12]:
# The raw file has no header row; the last column is taken as the label.
spambase_data = pd.read_csv('data_raw/spambase.data', header=None)
X = spambase_data.iloc[:, :-1]
y = spambase_data.iloc[:, -1]

# Filter highly correlated features and re-attach the label as the last column.
X = X.drop(columns=correlated_columns(X))
spambase_data = pd.concat([X, y], axis=1)

# Renumber the remaining feature columns 0..n-1 and name the label 'target'.
n_features = spambase_data.shape[1] - 1
spambase_data.columns = [*range(n_features), 'target']
spambase_data.to_csv(path_or_buf='data_clean/spambase.csv', index=False)

### Ionosphere - large
https://archive.ics.uci.edu/dataset/52/ionosphere

In [13]:
# The raw file has no header row; the last column holds the label.
ion_data = pd.read_csv('data_raw/ionosphere.data', header=None)

# Encode the label as 1 where it equals 'g' and 0 otherwise.
y = (ion_data.iloc[:, -1] == 'g').astype('int')

# Features are every column except the first two and the trailing label column.
X = ion_data.iloc[:, 2:-1]
X = X.drop(columns=correlated_columns(X))

# Re-attach the label, renumber the features 0..n-1 and name the label 'target'.
ion_data = pd.concat([X, y], axis=1)
n_features = ion_data.shape[1] - 1
ion_data.columns = [*range(n_features), 'target']
ion_data.to_csv(path_or_buf='data_clean/ionosphere.csv', index=False)

### Phoneme - small
https://www.openml.org/search?type=data&sort=nr_of_downloads&status=active&qualities.NumberOfClasses=%3D_2&format=any&id=1489

In [14]:
# Load the phoneme ARFF file; loadarff returns (records, metadata) and only
# the records are used to build the frame.
raw_data = arff.loadarff('data_raw/phoneme.arff')
df = pd.DataFrame(data=raw_data[0])

# Cast the label to int and shift it down by one.
class_as_int = df['Class'].astype('int')
df['Class'] = class_as_int - 1

# Persist the cleaned frame without writing the row index.
df.to_csv(path_or_buf='data_clean/phoneme.csv', index=False)