In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
data_directories = sorted([
    "../data/" + direct + "/" 
    for direct in os.listdir("../data/") 
    if direct != ".DS_Store"
])
data_files_paths = sorted([x+y for x in data_directories for y in os.listdir(x) if ".data" in y])
print("Data files paths:\n", data_files_paths)
print("")
name_files_paths = sorted([x+y for x in data_directories for y in os.listdir(x) if ".name" in y])
print("Names files paths: \n", name_files_paths)

Data files paths:
 ['../data/1. Abalone/abalone.data', '../data/3. Housing/housing.data', '../data/5. Glass/glass.data', '../data/6. Auto_MPG/auto-mpg.data', '../data/6. Auto_MPG/auto-mpg.data-original', '../data/7. WPBC/breast-cancer-wisconsin.data', '../data/7. WPBC/wdbc.data', '../data/7. WPBC/wpbc.data', '../data/8. Anneal/anneal.data', '../data/9. Adult/adult.data']

Names files paths: 
 ['../data/1. Abalone/abalone.names', '../data/3. Housing/housing.names', '../data/4. Forest_Fires/forestfires.names', '../data/5. Glass/glass.names', '../data/6. Auto_MPG/auto-mpg.names', '../data/7. WPBC/breast-cancer-wisconsin.names', '../data/7. WPBC/wdbc.names', '../data/7. WPBC/wpbc.names', '../data/8. Anneal/anneal.names', '../data/8. Anneal/old.adult.names', '../data/9. Adult/adult.names']


In [3]:
data_directories

['../data/1. Abalone/',
 '../data/2. Concrete/',
 '../data/3. Housing/',
 '../data/4. Forest_Fires/',
 '../data/5. Glass/',
 '../data/6. Auto_MPG/',
 '../data/7. WPBC/',
 '../data/8. Anneal/',
 '../data/9. Adult/']

# Abalone (Multiclass Classification, but not great, K=28)

In [27]:
abalone = pd.read_csv(data_files_paths[0], header=None)
abalone.columns = ["Sex", "Length", "Diameter", "Height", "Whole_weight", 
         "Shucked_weight", "Viscera_weight", "Shell_weight", "Rings"]
X_abalone = abalone.iloc[:, :-1]
y_abalone = abalone.iloc[:,-1]
X_train_abalone, X_test_abalone, y_train_abalone, y_test_abalone = train_test_split(
    X_abalone, y_abalone, test_size=0.1
)
abalone_train = pd.concat([X_train_abalone, y_train_abalone], axis=1).reset_index(drop=True)
abalone_train.to_csv(data_directories[1]+"abalone_train.csv", index=False)
abalone_test = pd.concat([X_test_abalone, y_test_abalone], axis=1).reset_index(drop=True)
abalone_test.to_csv(data_directories[1]+"abalone_test.csv", index=False)

# Concrete (Regression - Normalized)

In [7]:
concrete = pd.read_excel("../data/2. Concrete/Concrete_Data.xls")
X_concrete = concrete.iloc[:, :-1]
y_concrete = concrete.iloc[:,-1]
X_train_concrete, X_test_concrete, y_train_concrete, y_test_concrete = train_test_split(
    X_concrete, y_concrete, test_size=0.1
)
concrete_train = pd.concat([X_train_concrete, y_train_concrete], axis=1).reset_index(drop=True)
concrete_test = pd.concat([X_test_concrete, y_test_concrete], axis=1).reset_index(drop=True)

# Normalization
ss_train = StandardScaler()
ss_train.fit(concrete_train)
concrete_train = pd.DataFrame(ss_train.transform(concrete_train))
concrete_test = pd.DataFrame(ss_train.transform(concrete_test))

concrete_train.to_csv(data_directories[1]+"concrete_train_3.csv", index=False)
concrete_test.to_csv(data_directories[1]+"concrete_test_3.csv", index=False)

# Housing (Regression - Normalized)

In [3]:
housing = pd.read_csv(data_files_paths[1], header=None)
housing["values"] = housing[0].apply(lambda x: [float(y) for y in x.split(" ") if y!=""])
for i in range(1, 15):
    housing[f"column_{i}"] = housing["values"].apply(lambda x: x[i-1])

housing.drop([0, "values"], axis=1, inplace=True)
housing.columns = ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", 
                   "RAD", "TAX", "PTRATIO", "B", "LSTAT", "MEDV"]
X_housing = housing.iloc[:, :-1]
y_housing = housing.iloc[:,-1]
X_train_housing, X_test_housing, y_train_housing, y_test_housing = train_test_split(
    X_housing, y_housing, test_size=0.3
)
housing_train = pd.concat([X_train_housing, y_train_housing], axis=1).reset_index(drop=True)
housing_test = pd.concat([X_test_housing, y_test_housing], axis=1).reset_index(drop=True)
# Normalization
ss_train = StandardScaler()
ss_train.fit(housing_train)
housing_train = pd.DataFrame(ss_train.transform(housing_train))
housing_test = pd.DataFrame(ss_train.transform(housing_test))

housing_train.to_csv(data_directories[2]+"housing_train.csv", index=False)
housing_test.to_csv(data_directories[2]+"housing_test.csv", index=False)

# Forest Fires (Regression)

In [5]:
forest_fires_train

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,sep,sat,92.5,88.0,698.6,7.1,22.8,40,4.0,0.0,0.00
1,4,5,mar,thu,91.4,30.7,74.3,7.5,18.2,29,3.1,0.0,0.00
2,2,4,aug,thu,91.6,248.4,753.8,6.3,16.6,59,2.7,0.0,0.00
3,4,5,sep,sat,92.5,88.0,698.6,7.1,20.3,45,3.1,0.0,0.00
4,4,4,aug,thu,95.8,152.0,624.1,13.8,32.4,21,4.5,0.0,0.00
5,6,3,apr,wed,88.0,17.2,43.5,3.8,15.2,51,2.7,0.0,0.00
6,3,4,sep,fri,94.3,85.1,692.3,15.9,19.8,50,5.4,0.0,0.00
7,8,6,jun,wed,91.2,147.8,377.2,12.7,19.6,43,4.9,0.0,0.00
8,7,4,sep,mon,91.6,108.4,764.0,6.2,19.3,44,2.2,0.0,0.00
9,1,4,sep,sat,92.2,102.3,751.5,8.4,24.2,27,3.1,0.0,0.00


In [4]:
# Some kind of stratified sampling in order to get similarly balanced datasets (between 0s and other values)
forest_fires = pd.read_csv(data_directories[3]+"forestfires.csv")
X_forest_fires_0 = forest_fires[forest_fires.area == 0].iloc[:, :-1]
y_forest_fires_0 = forest_fires[forest_fires.area == 0].iloc[:, -1]
X_forest_fires_other = forest_fires[forest_fires.area != 0].iloc[:, :-1]
y_forest_fires_other = forest_fires[forest_fires.area != 0].iloc[:, -1]

X_train_forest_fires_0, X_test_forest_fires_0, y_train_forest_fires_0, y_test_forest_fires_0 = train_test_split(
    X_forest_fires_0, y_forest_fires_0, test_size=0.1,
)
X_train_forest_fires_other, X_test_forest_fires_other, y_train_forest_fires_other, y_test_forest_fires_other = train_test_split(
    X_forest_fires_other, y_forest_fires_other, test_size=0.1,
)
X_train_forest_fires = pd.concat([X_train_forest_fires_0, X_train_forest_fires_other], axis=0).reset_index(drop=True)
X_test_forest_fires = pd.concat([X_test_forest_fires_0, X_test_forest_fires_other], axis=0).reset_index(drop=True)
y_train_forest_fires = pd.concat([y_train_forest_fires_0, y_train_forest_fires_other], axis=0).reset_index(drop=True)
y_test_forest_fires = pd.concat([y_test_forest_fires_0, y_test_forest_fires_other], axis=0).reset_index(drop=True)

forest_fires_train = pd.concat([X_train_forest_fires, y_train_forest_fires], axis=1).reset_index(drop=True)
forest_fires_test = pd.concat([X_test_forest_fires, y_test_forest_fires], axis=1).reset_index(drop=True)

# Normalization
ss_train = StandardScaler()
ss_train.fit(forest_fires_train)
forest_fires_train = pd.DataFrame(ss_train.transform(forest_fires_train))
forest_fires_test = pd.DataFrame(ss_train.transform(forest_fires_test))

forest_fires_train.to_csv(data_directories[3]+"forest_fires_train.csv", index=False)
forest_fires_test.to_csv(data_directories[3]+"forest_fires_test.csv", index=False)

ValueError: could not convert string to float: 'sep'

# Glass (Multiclass Classification, K=6)

In [162]:
glass = pd.read_csv(data_files_paths[2], header=None)
glass.columns = ["ID", "RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe", "type"]
X_glass = glass.iloc[:, 1:-1] # We don't take the ID column (useless)
y_glass = glass.iloc[:,-1]
y_glass = y_glass.map({1:1, 2:2, 3:3, 5:4, 6:5, 7:6})
X_train_glass, X_test_glass, y_train_glass, y_test_glass = train_test_split(
    X_glass, y_glass, test_size=0.1, stratify=y_glass
)
glass_train = pd.concat([X_train_glass, y_train_glass], axis=1).reset_index(drop=True)
glass_train.to_csv("../data/5. Glass/"+"glass_train.csv", index=False)
glass_test = pd.concat([X_test_glass, y_test_glass], axis=1).reset_index(drop=True)
glass_test.to_csv("../data/5. Glass/"+"glass_test.csv", index=False)

# Auto MPG (Regression-Normalized)

In [14]:
auto_mpg = pd.read_csv(data_files_paths[4], header=None, sep=",")
auto_mpg.columns = ["mpg", "cylinders", "displacement", "horsepower", "weight", 
                    "acceleration", "model_year", "origin", "car_name"]
print(len(auto_mpg))
auto_mpg = auto_mpg.dropna() # Loosing 14 rows out of 406 -> 392 samples !
print(len(auto_mpg))
indices_many = auto_mpg.groupby("mpg")["car_name"].count().sort_values().iloc[116:126].index.values
X_auto_mpg_0_many = auto_mpg[auto_mpg.mpg.isin(indices_many)].iloc[:, 1:]
y_auto_mpg_0_many = auto_mpg[auto_mpg.mpg.isin(indices_many)].iloc[:, 0]
X_auto_mpg_few = auto_mpg[~auto_mpg.mpg.isin(indices_many)].iloc[:, 1:]
y_auto_mpg_few = auto_mpg[~auto_mpg.mpg.isin(indices_many)].iloc[:, 0]

X_train_auto_mpg_0_many, X_test_auto_mpg_0_many, y_train_auto_mpg_0_many, y_test_auto_mpg_0_many = train_test_split(
    X_auto_mpg_0_many, y_auto_mpg_0_many, test_size=0.1,
)
X_train_auto_mpg_few, X_test_auto_mpg_few, y_train_auto_mpg_few, y_test_auto_mpg_few = train_test_split(
    X_auto_mpg_few, y_auto_mpg_few, test_size=0.1,
)
X_train_auto_mpg = pd.concat([X_train_auto_mpg_0_many, X_train_auto_mpg_few], axis=0).reset_index(drop=True)
X_test_auto_mpg = pd.concat([X_test_auto_mpg_0_many, X_test_auto_mpg_few], axis=0).reset_index(drop=True)
y_train_auto_mpg = pd.concat([y_train_auto_mpg_0_many, y_train_auto_mpg_few], axis=0).reset_index(drop=True)
y_test_auto_mpg = pd.concat([y_test_auto_mpg_0_many, y_test_auto_mpg_few], axis=0).reset_index(drop=True)

auto_mpg_train = pd.concat([X_train_auto_mpg, y_train_auto_mpg], axis=1).drop("car_name", axis=1).reset_index(drop=True)
auto_mpg_test = pd.concat([X_test_auto_mpg, y_test_auto_mpg], axis=1).drop("car_name", axis=1).reset_index(drop=True)
ss_train = StandardScaler()
ss_train.fit(auto_mpg_train)
auto_mpg_train = pd.DataFrame(ss_train.transform(auto_mpg_train))
auto_mpg_test = pd.DataFrame(ss_train.transform(auto_mpg_test))


auto_mpg_train.to_csv(data_directories[5]+"auto_mpg_train_3.csv", index=False)
auto_mpg_test.to_csv(data_directories[5]+"auto_mpg_test_3.csv", index=False)

406
392


In [None]:
len(auto_mpg_train)+len(auto_mpg_test)

# WPBC (Binary Classification)
Dropping 4 lines with NAN

In [160]:
wpbc = pd.read_csv(data_files_paths[7], header=None)
wpbc["output"] = wpbc[1].map({"N":-1, "R":1})
wpbc.drop([0, 1], axis=1, inplace=True) # Dropping the ID and the duplicate of the new column "label"
wpbc.columns = [f"attribute_{i}" for i in range(1, 34)]+["output"]
wpbc.replace("?", np.nan, inplace=True)
wpbc.dropna(inplace=True)
for col in wpbc.columns:
    if col != "output":
        wpbc[col] = wpbc[col].astype(float)
X_wpbc = wpbc.iloc[:, :-1]
y_wpbc = wpbc.iloc[:,-1]
X_train_wpbc, X_test_wpbc, y_train_wpbc, y_test_wpbc = train_test_split(
    X_wpbc, y_wpbc, test_size=0.1, stratify=y_wpbc
)
wpbc_train = pd.concat([X_train_wpbc, y_train_wpbc], axis=1).reset_index(drop=True)
wpbc_train.to_csv(data_directories[6]+"wpbc_train.csv", index=False)
wpbc_test = pd.concat([X_test_wpbc, y_test_wpbc], axis=1).reset_index(drop=True)
wpbc_test.to_csv(data_directories[6]+"wpbc_test.csv", index=False)

# Adult (Binary Classification)

In [7]:
def delete_first_space(x):
    if x[0] == " ":
        return x[1:]
    else:
        return x

In [12]:
adult_train = pd.read_csv(data_files_paths[-1], header=None)
adult_test = pd.read_csv("../data/9. Adult/adult.test", header=None)
adult_train.columns = ["age", "workclass", "fnwlgt", "education", 
                 "education_num", "marital_status", "occupation", 
                "relationship", "race", "sex", "capital_gain", 
                 "capital_loss", "hours_per_week", "native_country", "label"]
adult_test.columns = ["age", "workclass", "fnwlgt", "education", 
                 "education_num", "marital_status", "occupation", 
                "relationship", "race", "sex", "capital_gain", 
                 "capital_loss", "hours_per_week", "native_country", "label"]

#Dropping education bc education_num is the same but with numeric values
adult_train.drop("education", inplace=True, axis=1)
adult_test.drop("education", inplace=True, axis=1)

# Mapping " ?" to "Unknown in relevant columns"
adult_train["workclass"] = adult_train["workclass"].replace({" ?": "Unknown"})
adult_test["workclass"] = adult_test["workclass"].replace({" ?": "Unknown"})

adult_train["occupation"] = adult_train["occupation"].replace({" ?": "Unknown"})
adult_test["occupation"] = adult_test["occupation"].replace({" ?": "Unknown"})

adult_train["native_country"] = adult_train["native_country"].replace({" ?": "Unknown"})
adult_test["native_country"] = adult_test["native_country"].replace({" ?": "Unknown"})

# Mapping label values to -1, 1
adult_train["label"] = adult_train["label"].map({" <=50K": -1, " >50K": 1}).astype(int)
adult_test["label"] = adult_test["label"].map({" <=50K": -1, " >50K": 1}).astype(int)

# Mapping sex values to 0, 1
adult_train["sex"] = adult_train["sex"].map({" Male": 0, " Female": 1}).astype(int)
adult_test["sex"] = adult_test["sex"].map({" Male": 0, " Female": 1}).astype(int)

# Preprocessing strings to delete the space at the beginning of every string
for col in ["workclass", "marital_status", "occupation", "relationship", "race", "native_country"]:
    adult_train[col] = adult_train[col].apply(lambda x: delete_first_space(x))
    adult_test[col] = adult_test[col].apply(lambda x: delete_first_space(x))

# OneHotEncoding categorical features
adult_train = pd.get_dummies(adult_train)
adult_test = pd.get_dummies(adult_test)
# Adding a missing column (value not in test set) after One Hot Encoding test set
adult_test['native_country_Holand-Netherlands'] = 0

# Separating X and Y
X_train_adult = adult_train.loc[:, adult_train.columns != "label"]
ss_train = StandardScaler()
ss_train.fit(X_train_adult)
X_train_adult = ss_train.transform(X_train_adult)
X_test_adult = adult_test.loc[:, adult_test.columns != "label"]
X_test_adult = ss_train.transform(X_test_adult)
y_train_adult = adult_train.loc[:, "label"]
y_test_adult = adult_test.loc[:, "label"]

# Saving files
adult_train_final = pd.concat(
    [pd.DataFrame(X_train_adult), y_train_adult], 
    axis=1
).reset_index(drop=True)
adult_train_final.to_csv("../data/9. Adult/adult_train.csv", index=False)
adult_test_final = pd.concat(
    [pd.DataFrame(X_test_adult), y_test_adult],
    axis=1
).reset_index(drop=True)
adult_test_final.to_csv("../data/9. Adult/adult_test.csv", index=False)