In [1]:
import pandas as pd
import json
import os

In [2]:
with open("data/columns_feature_names.json", "r") as file:
    column_features_dict = json.load(file)

In [3]:
column_features_dict

{'color': 'white wine, yellow, very pale, pale, pale gold, gold, old gold, full gold, bronze, pale amber, amber, full amber, red, fino sherry',
 'nose': 'aromatic, peaty, sweet, light, fresh, dry, fruity, grassy, salty, sherry, spicy, rich',
 'body': 'soft, medium, full, round, smooth, light, firm, oily',
 'palate': 'full, dry, sherry, big, light, smooth, clean, fruity, grassy, smoky, sweet, spicy, oily, salty, aromatic',
 'finish': 'full, dry, warm, big, light, smooth, clean, fruity, grassy, smoky, sweet, spicy, oily, salty, aromatic, quick, long, very long, lingering'}

In [4]:
def load_data(file, data_name):
    features_txt = column_features_dict[data_name]
    features_list = features_txt.split(", ")
    features_list_clean = [feature.replace(" ", "-") for feature in features_list]
    columns = [data_name + "_" +  feature for feature in features_list_clean]
    
    df = pd.read_csv(file, skiprows=1, sep="\s+", header=None)
    df.columns = columns
    
    return df

In [5]:
df_list = []
file_format = ".TXT"
file_folder = "data"

for data_name in  column_features_dict.keys():
    print("Carregando os dados de " + data_name)
    file = data_name.upper() + file_format
    file_path = os.path.join(file_folder, file)
    df = load_data(file_path, data_name)
    row, col = df.shape
    print("Foram carregados:")
    print("\t observações: " + str(row))
    print("\t atributos: " + str(col))
    print()
    df_list.append(df)

Carregando os dados de color
Foram carregados:
	 observações: 109
	 atributos: 14

Carregando os dados de nose
Foram carregados:
	 observações: 109
	 atributos: 12

Carregando os dados de body
Foram carregados:
	 observações: 109
	 atributos: 8

Carregando os dados de palate
Foram carregados:
	 observações: 109
	 atributos: 15

Carregando os dados de finish
Foram carregados:
	 observações: 109
	 atributos: 19



In [6]:
coord_df = (pd.read_csv("data/DISTCOOR.TXT", skiprows=7, header=None)[0].
            str.strip().str.split("\s{3,}", expand=True))
coord_df.columns = ["name", "coord_x", "coord_y"]
coord_df["coord_x"] = coord_df["coord_x"].astype("float")
coord_df["coord_y"] = coord_df["coord_x"].astype("float")

print(coord_df.info())
coord_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109 entries, 0 to 108
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   name     109 non-null    object 
 1   coord_x  109 non-null    float64
 2   coord_y  109 non-null    float64
dtypes: float64(2), object(1)
memory usage: 2.7+ KB
None


Unnamed: 0,name,coord_x,coord_y
0,Aberfeldy,3.875,3.875
1,Aberlour,3.22,3.22
2,Ardberg,6.08,6.08
3,Ardmore,2.67,2.67
4,Auchentoshan,4.39,4.39


In [7]:
df_list.append(coord_df)

In [8]:
region_labels = {1: "Highlands", 2: "Islay", 3: "Lowlands"}

regions_df = pd.read_csv("data/REGIONS.TXT", header=None, skipfooter=6, engine="python")
regions_df.columns = ["region"]
regions_df["region"] = regions_df["region"].map(region_labels).astype("category")

print(regions_df.info())
regions_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109 entries, 0 to 108
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   region  109 non-null    category
dtypes: category(1)
memory usage: 369.0 bytes
None


Unnamed: 0,region
0,Highlands
1,Highlands
2,Islay
3,Highlands
4,Lowlands


In [9]:
df_list.append(regions_df)

In [10]:
df = pd.concat(df_list, axis=1)
df.set_index("name", inplace=True)
df.head()

Unnamed: 0_level_0,color_white-wine,color_yellow,color_very-pale,color_pale,color_pale-gold,color_gold,color_old-gold,color_full-gold,color_bronze,color_pale-amber,...,finish_oily,finish_salty,finish_aromatic,finish_quick,finish_long,finish_very-long,finish_lingering,coord_x,coord_y,region
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aberfeldy,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3.875,3.875,Highlands
Aberlour,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,3.22,3.22,Highlands
Ardberg,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,6.08,6.08,Islay
Ardmore,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,2.67,2.67,Highlands
Auchentoshan,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,4.39,4.39,Lowlands


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 109 entries, Aberfeldy to Tullibardine
Data columns (total 71 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   color_white-wine   109 non-null    int64   
 1   color_yellow       109 non-null    int64   
 2   color_very-pale    109 non-null    int64   
 3   color_pale         109 non-null    int64   
 4   color_pale-gold    109 non-null    int64   
 5   color_gold         109 non-null    int64   
 6   color_old-gold     109 non-null    int64   
 7   color_full-gold    109 non-null    int64   
 8   color_bronze       109 non-null    int64   
 9   color_pale-amber   109 non-null    int64   
 10  color_amber        109 non-null    int64   
 11  color_full-amber   109 non-null    int64   
 12  color_red          109 non-null    int64   
 13  color_fino-sherry  109 non-null    int64   
 14  nose_aromatic      109 non-null    int64   
 15  nose_peaty         109 non-null    int64   
 

In [12]:
df.to_pickle("data/data.pickle")