In [1]:
# Re-import changed modules before each cell executes, so edits to the local
# `boozie` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [7]:
import pandas as pd
from omegaconf import OmegaConf

from typing import Tuple

from imblearn.over_sampling import RandomOverSampler

from boozie.get_data import load_wine, add_fake_features

In [3]:
# Project configuration. `cfg.samples` is consumed further down as the
# name -> star-rating mapping for the sample wines.
# The path is relative, so it resolves against the notebook's working directory.
cfg = OmegaConf.load("../boozie/conf/config.yaml")

In [4]:
# Load the wine data frame through the project helper `load_wine`.
df = load_wine()

In [5]:
# Quick sanity check of the loaded columns and a few representative values.
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,stars
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,2.0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,2.0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,2.0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,3.0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,2.0


In [8]:
# Add the synthetic ("fake") feature columns to the wine data.
# NOTE(review): this rebinds `df` from itself, so re-running the cell feeds the
# already-augmented frame back into add_fake_features. Restart & Run All for a
# clean state, or bind the result to a new name.
df = add_fake_features(df)

In [9]:
# List every column currently on `df`, one bullet per line.
# Iterating a DataFrame directly yields its column labels.
for column_label in df:
    print("-", column_label)

- fixed acidity
- volatile acidity
- citric acid
- residual sugar
- chlorides
- free sulfur dioxide
- total sulfur dioxide
- density
- pH
- sulphates
- alcohol
- stars
- price
- kcal
- tannin level
- aging potential
- production volume
- sediment volume
- opacity
- magnesium oxide
- total potassium
- ph


In [15]:
# Balance the star classes: with its default strategy RandomOverSampler randomly
# repeats rows of the under-represented ratings until all ratings have equal counts.
ros = RandomOverSampler(random_state=42)
X = df.drop(columns="stars")  # feature columns only
y = df["stars"].copy()        # target: the star rating
# fit_resample returns (X_resampled, y_resampled); joining them column-wise
# puts "stars" back as the last column of df_os.
df_os = pd.concat([*ros.fit_resample(X, y)], axis=1)

# Confirm that every rating now has the same number of rows.
df_os["stars"].value_counts()

stars
2.0    2836
3.0    2836
4.0    2836
1.0    2836
5.0    2836
Name: count, dtype: int64

In [16]:
def extract_samples(
    df: pd.DataFrame, *, score_name: str, samples: dict, random_state: int = 42
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Pull one randomly chosen row per requested score out of ``df``.

    For every ``name -> score`` entry in ``samples``, one row whose
    ``score_name`` column equals ``score`` is drawn, tagged with a ``name``
    column, and removed from the pool, so the same row is never drawn twice
    (even when two entries ask for the same score).

    Args:
        df: Source frame. It is copied, never modified. Rows are removed by
            index label, so index labels should be unique; otherwise every
            row sharing a drawn label is removed as well.
        score_name: Column holding the score to match against.
        samples: Mapping of sample name to the score that sample must have.
        random_state: Seed passed to each ``DataFrame.sample`` call.

    Returns:
        ``(samples_df, remaining)``: the drawn rows (one per entry, in mapping
        order, with the extra ``name`` column) and the input rows that were
        not drawn. With an empty ``samples`` mapping, ``samples_df`` is an
        empty ``DataFrame``.

    Raises:
        ValueError: If no row with the requested score is left to draw.
    """
    remaining = df.copy()
    records = []

    for name, score in samples.items():
        candidates = remaining[remaining[score_name] == score]
        if candidates.empty:
            # Fail with context instead of the opaque error sample() raises
            # on an empty frame.
            raise ValueError(
                f"No rows left with {score_name} == {score!r} "
                f"to sample for {name!r}"
            )

        record = candidates.sample(random_state=random_state).assign(name=name)
        # Remove the drawn row right away so a later entry cannot pick it again.
        remaining = remaining.drop(record.index)
        records.append(record)

    # Concatenate once at the end: avoids quadratic re-copying and the
    # deprecated concat-with-empty-frame dtype behaviour.
    samples_df = pd.concat(records) if records else pd.DataFrame()
    return samples_df, remaining

In [17]:
# Split off one named sample wine per configured star rating; `tdf` is the
# remaining data with those rows removed.
# NOTE(review): the saved outputs below show none of the fake-feature columns
# that `df` has after add_fake_features, and the execution counts are not
# sequential. The outputs look stale; re-run from a fresh kernel to confirm
# which frame was actually passed here.
samples_df, tdf = extract_samples(df, score_name="stars", samples=cfg.samples)

In [18]:
# Expect one row per entry in cfg.samples, tagged through the added `name` column.
samples_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,stars,name
11759,5.5,0.49,0.03,1.8,0.044,28.0,87.0,0.9908,3.5,0.82,14.0,5.0,Schmidt
2775,6.3,0.32,0.35,11.1,0.039,29.0,198.0,0.9984,3.36,0.5,9.4,4.0,Penfolds
1114,5.0,0.4,0.5,4.3,0.046,29.0,80.0,0.9902,3.49,0.66,13.6,3.0,Antinori
843,7.1,0.685,0.35,2.0,0.088,9.0,92.0,0.9963,3.28,0.62,9.4,2.0,Vega Sicilia
6666,7.6,0.21,0.35,1.2,0.041,7.0,106.0,0.9914,3.06,0.45,11.3,1.0,S-Budget


In [19]:
# Spot-check a single row of the remaining data by index label.
# NOTE(review): 2420 is a label, not a position, so this raises KeyError if that
# row was drawn into samples_df. Why this row was chosen is not recorded here.
tdf.loc[2420, :]

fixed acidity             7.3000
volatile acidity          0.6550
citric acid               0.2000
residual sugar           10.2000
chlorides                 0.0710
free sulfur dioxide      28.0000
total sulfur dioxide    212.0000
density                   0.9971
pH                        2.9600
sulphates                 0.5800
alcohol                   9.2000
stars                     3.0000
Name: 2420, dtype: float64

In [23]:
# List the columns of the remaining (post-sampling) frame, one bullet per line.
# Iterating a DataFrame directly yields its column labels.
for column_label in tdf:
    print("-", column_label)

- fixed acidity
- volatile acidity
- citric acid
- residual sugar
- chlorides
- free sulfur dioxide
- total sulfur dioxide
- density
- pH
- sulphates
- alcohol
- stars
