In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from omegaconf import OmegaConf

from typing import Tuple

from imblearn.over_sampling import RandomOverSampler

from boozie.get_data import load_wine

In [3]:
# Load the project configuration; `cfg.samples` is consumed further down when
# the named sample wines are extracted.
# NOTE(review): the relative path resolves against the kernel's working
# directory, which therefore has to be a sibling of the `boozie/` directory.
cfg = OmegaConf.load("../boozie/conf/config.yaml")

In [4]:
# Load the wine data through the project's own loader.
# NOTE(review): the saved outputs below show a `stars` column and no `quality`
# column, while later cells read `df.quality` -- presumably the loader changed
# after those cells were last run; confirm with Restart Kernel -> Run All.
df = load_wine()

In [5]:
# Preview the first rows to check which columns the loader returned.
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,stars
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,2.0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,2.0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,2.0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,3.0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,2.0


In [6]:
# Class balance of the `stars` column. The saved output shows every class at
# 2836 rows, i.e. the data is already balanced at this point.
df.stars.value_counts()

stars
2.0    2836
3.0    2836
4.0    2836
1.0    2836
5.0    2836
Name: count, dtype: int64

In [7]:
# Random oversampling with a fixed seed (imblearn's RandomOverSampler).
# NOTE(review): the saved value_counts of `df` above already show equal class
# sizes, so this step presumably changes nothing here -- confirm it is needed.
ros = RandomOverSampler(random_state=42)
# Work on a copy so that popping the target leaves `df` itself untouched.
X = df.copy()
y = X.pop("stars")
# Re-join what fit_resample returns (presumably the resampled features and
# target -- verify against the imblearn docs) column-wise into a single frame.
df_os = pd.concat(ros.fit_resample(X, y), axis=1)

# Class counts after resampling.
df_os["stars"].value_counts()

stars
2.0    2836
3.0    2836
4.0    2836
1.0    2836
5.0    2836
Name: count, dtype: int64

In [None]:
def transform_to_star_score(scores: pd.Series) -> pd.Series:
    """Map raw quality scores onto a 1-5 star scale.

    Mapping: below 5 -> 1, exactly 5 -> 2, exactly 6 -> 3, exactly 7 -> 4,
    above 7 -> 5.

    Args:
        scores: Numeric quality scores. The result is aligned to its index.

    Returns:
        A float64 Series sharing ``scores.index``. Entries that match no
        bucket (missing values, or non-integer scores between 5 and 7) are
        left as NaN.
    """
    # Explicit dtype: the default for a data-less Series differs between
    # pandas versions (and older ones warn about it), so pin float64 to keep
    # the NaN-initialised result numeric everywhere.
    result = pd.Series(index=scores.index, dtype="float64")

    # Open-ended like the top bucket, so every score under 5 (not only the
    # integers 0-4) gets one star instead of silently staying NaN.
    result[scores < 5] = 1
    result[scores == 5] = 2
    result[scores == 6] = 3
    result[scores == 7] = 4
    result[scores > 7] = 5

    return result

# Distribution of the derived star ratings over the raw `quality` scores.
# NOTE(review): this needs a `quality` column on `df`, which the `df.head()`
# output saved earlier in this notebook does not show -- confirm the data source.
transform_to_star_score(df.quality).value_counts()

3.0    2836
2.0    2138
4.0    1079
1.0     246
5.0     198
Name: count, dtype: int64

In [None]:
# Attach (or overwrite) the star rating on `df` itself.
# NOTE(review): this mutates the frame in place, so any earlier cell that
# displays `df` shows different data when re-run after this one.
df["stars"] = transform_to_star_score(df.quality)

In [None]:
def extract_samples(
    df: pd.DataFrame, *, score_name: str, samples: dict, random_state: int = 42
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Pull one randomly chosen row per named sample out of ``df``.

    Args:
        df: Source frame; it is not modified.
        score_name: Column whose value is compared with each requested score.
        samples: Mapping of sample name -> score value. One row carrying that
            score is drawn for every entry, in iteration order.
        random_state: Seed handed to ``DataFrame.sample`` for every draw.

    Returns:
        ``(samples_df, remaining)``: the drawn rows with an added ``name``
        column, and a frame holding ``df`` without the drawn index labels.

    Raises:
        ValueError: If no remaining row has the score requested for a sample.
    """
    remaining = df
    picked = []

    for name, score in samples.items():
        candidates = remaining[remaining[score_name] == score]
        if candidates.empty:
            # Fail with the offending sample instead of the opaque error that
            # sampling from an empty frame would raise.
            raise ValueError(
                f"No rows with {score_name} == {score!r} left to sample for {name!r}"
            )

        record = candidates.sample(random_state=random_state).assign(name=name)
        # Remove the drawn row right away so a later sample asking for the
        # same score cannot receive it again.
        remaining = remaining.drop(record.index)
        picked.append(record)

    if not picked:
        # Nothing requested: empty sample frame plus an independent copy.
        return pd.DataFrame(), df.copy()

    # Concatenate once at the end rather than growing a frame inside the loop.
    return pd.concat(picked), remaining

In [None]:
# Draw one named sample row per entry in `cfg.samples`, matching on `stars`;
# `tdf` is the remaining data with the drawn rows removed.
samples_df, tdf = extract_samples(df, score_name="stars", samples=cfg.samples)

In [None]:
# Show the drawn rows together with the sample name assigned to each.
samples_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,stars,name
2735,6.0,0.26,0.42,5.2,0.027,70.0,178.0,0.9914,3.4,0.4,12.3,8,5.0,Schmidt
2820,5.9,0.4,0.32,6.0,0.034,50.0,127.0,0.992,3.51,0.58,12.5,7,4.0,Penfolds
1114,5.0,0.4,0.5,4.3,0.046,29.0,80.0,0.9902,3.49,0.66,13.6,6,3.0,Antinori
107,6.2,0.63,0.31,1.7,0.088,15.0,64.0,0.9969,3.46,0.79,9.3,5,2.0,Vega Sicilia
876,7.1,0.47,0.0,2.2,0.067,7.0,14.0,0.99517,3.4,0.58,10.9,4,1.0,S-Budget


In [None]:
# Spot-check a single row of the remaining frame by index label.
# NOTE(review): 2420 is a hard-coded label with no stated reason; the lookup
# fails if that label is absent from `tdf` -- confirm why this row matters.
tdf.loc[2420, :]

fixed acidity             7.3000
volatile acidity          0.6550
citric acid               0.2000
residual sugar           10.2000
chlorides                 0.0710
free sulfur dioxide      28.0000
total sulfur dioxide    212.0000
density                   0.9971
pH                        2.9600
sulphates                 0.5800
alcohol                   9.2000
quality                   6.0000
stars                     3.0000
Name: 2420, dtype: float64