Classical cls-s on the winemag data were explored. 

(0) Setup

In [None]:
!git clone https://github.com/kkonstantin182/wine-classifier.git
%cd wine-classifier
!pip install -r requirements.txt

Cloning into 'wine-classifier'...
remote: Enumerating objects: 142, done.[K
remote: Counting objects: 100% (142/142), done.[K
remote: Compressing objects: 100% (110/110), done.[K
remote: Total 142 (delta 79), reused 75 (delta 24), pack-reused 0[K
Receiving objects: 100% (142/142), 2.68 MiB | 5.53 MiB/s, done.
Resolving deltas: 100% (79/79), done.
/content/wine-classifier
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Unidecode==1.3.6 (from -r requirements.txt (line 1))
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.9/235.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bornrule==0.1.3 (from -r requirements.txt (line 2))
  Downloading bornrule-0.1.3-py3-none-any.whl (27 kB)
Collecting scikit_learn==1.2.1 (from -r requirements.txt (line 5))
  Downloading scikit_learn-1.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl 

In [None]:
# Custom packages

import sys
from notebooks.path_conf import get_project_root
path_src = get_project_root() / "src"
sys.path.append(str(path_src.resolve()))

from src.text_processing import TextProcessing, Vectorization
from src.dataset import Dataset
from src.constants import SEED

In [None]:
from google.colab import files

In [None]:
# Libraries

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
# from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from tqdm import tqdm
from bornrule import BornClassifier
import multiprocessing as mp
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
# Other 

nltk.download('punkt') # Tokenization
!python -m spacy download it_core_news_sm # Lemmatization, stop words



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting it-core-news-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/it_core_news_sm-3.5.0/it_core_news_sm-3.5.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: it-core-news-sm
Successfully installed it-core-news-sm-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_sm')


In [None]:
from sklearn.compose import make_column_selector, make_column_transformer

(I) Data

In [None]:
# Columns to use

TARGET_MAP = {
    # Put here the map for the target variable.
    # Removed due to the privacy reasons.
}

COLUMNS = {
    'target': ['character'],
    'text': ['text'],
    'numerical': ['alcohol', 'wine_name_length', 'n_grapes'],
    'categorical': ['category', 'region', 'is_complex_grape', 'alcohol_level']
}

In [None]:
# Object for data cleaning
# As the data is cleaned already, only tokenizer will be used

tp_obj_clean = TextProcessing(is_lemmatized=True)

In [None]:
# Loading data

train_set = pd.read_csv(r"/content/vh_train.csv", index_col=False)
test_set = pd.read_csv(r"/content/vh_test.csv", index_col=False)

print(train_set.shape)
print(test_set.shape)

(4016, 9)
(1005, 9)


In [None]:
train_set.columns

Index(['target', 'text', 'alcohol', 'wine_name_length', 'n_grapes', 'category',
       'region', 'is_complex_grape', 'alcohol_level'],
      dtype='object')

In [None]:
X_train, X_test = train_set.drop("target", axis=1), test_set.drop("target", axis=1)
y_train, y_test = train_set['target'], test_set['target']

# (II) Experiments


In [None]:
class AlcoholImputer(BaseEstimator, TransformerMixin):
    def __init__(self, group_column='category', impute_column1='alcohol', impute_column2='alcohol_level'):

        self.group_column = group_column
        self.impute_column1 = impute_column1
        self.impute_column2 = impute_column2
        self.transformed_columns_ = []

   
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_transformed = X.copy()
        X_transformed[self.impute_column1] = X.groupby(self.group_column)[self.impute_column1].apply(lambda x: x.fillna(x.median()))
        X_transformed[self.impute_column2] = X_transformed[self.impute_column1].apply(self.impute_alco_level)
        self.transformed_columns_ = X_transformed.columns.difference(X.columns).tolist()
        return pd.DataFrame(X_transformed, columns=X.columns, index=X.index)

    def impute_alco_level(self, x):
        if x < 11.5: level = 'low'
        elif 11.5 <= x < 13.5: level = 'medium'
        elif x >= 13.5: level = 'high'
        else: level = np.nan
        return level

    
    def get_params(self, deep=True):
        return {
            'group_column': self.group_column, 
            'impute_column1': self.impute_column1,
            'impute_column2': self.impute_column2}

    def set_params(self, **params):
        self.group_column = params['group_column']
        self.impute_column1 = params['impute_column1']
        self.impute_column2 = params['impute_column2']
        return self
    
    def get_feature_names_out(self, input_features=None):
        return self.transformed_columns_


In [None]:
# Fill NaN in the alcohol column
alcohol_enc = AlcoholImputer().fit(X_train)
X_train = alcohol_enc.transform(X_train)
X_test = alcohol_enc.transform(X_test)

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  X_transformed[self.impute_column1] = X.groupby(self.group_column)[self.impute_column1].apply(lambda x: x.fillna(x.median()))
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  X_transformed[self.impute_column1] = X.groupby(self.group_column)[self.impute_column1].apply(lambda x: x.fillna(x.median()))


In [None]:
# For all cls-s but not Born Rule
preprocessor1 = ColumnTransformer(
    transformers=[
        ("text", TfidfVectorizer(tokenizer=Vectorization.tokenize_it), "text"),
        ("num", MinMaxScaler(),  COLUMNS['numerical']),
        ("cat", OneHotEncoder(handle_unknown="ignore"), COLUMNS['categorical']),
        
    ]
)

# For Born Rule
preprocessor2 = ColumnTransformer(
    transformers=[
        ("text", TfidfVectorizer(tokenizer=Vectorization.tokenize_it), "text"),
        ("cat", OneHotEncoder(handle_unknown="ignore"), COLUMNS['categorical']),
        
    ],
    remainder='passthrough'
)

In [None]:
# Define pipelines for classifiers
pipelines = [
    ('Logistic Regression', Pipeline([
        ('preprocessor1', preprocessor2),
        ('clf', LogisticRegression(random_state=SEED))
    ])),

    ('SVM', Pipeline([
        ('preprocessor1', preprocessor2),
        ('clf', SVC(random_state=SEED))
    ])),

    ('Random Forest', Pipeline([
        ('preprocessor1', preprocessor2),
        ('clf', RandomForestClassifier(random_state=SEED))
    ])),

    ('Born Rule', Pipeline([
        ('preprocessor2', preprocessor1),
        ('clf', BornClassifier())
    ]))

]

# Define hyperparameters for grid search
hyperparameters = {
    'Logistic Regression': {
        'clf__solver': ['saga'],
        'clf__penalty': ['l1', 'l2', None],
        'clf__C': [0.01, 0.1, 1, 10],
    },

    'SVM': {
        'clf__C': [0.01, 0.1, 1, 10, 100],
        'clf__kernel': ['linear', 'rbf']
    },
    
    'Random Forest': {
        'clf__n_estimators': [10, 100, 1000],
        'clf__max_depth': [10, 100, None],
        'clf__bootstrap': [True, False],
        'clf__min_samples_split': [2, 10, 100],
    },

    'Born Rule': {
        'clf__a': [0.25, 0.5, 1.0, 4.0], # Cannot be 0
        'clf__b': [.0, 0.25, 0.5, 1.0, 4.0],
        'clf__h': [.0, 0.25, 0.5, 1.0, 4.0],
        
    }
}

In [None]:
# Define table to store results
results_table = pd.DataFrame(columns=[
    'Classifier', 
    'Hyperparameters', 
    'Train Accuracy', 
    'Train Precision', 
    'Train Recall', 
    'Train F1-score',
    'Test Accuracy', 
    'Test Precision', 
    'Test Recall', 
    'Test F1-score'])

# Train and evaluate models
for clf_name, pipeline in tqdm(pipelines, desc="Classifiers", total=len(pipelines)):
    print("Training", clf_name)
    clf = pipeline.named_steps['clf']
    hyperparams = hyperparameters[clf_name]
    rs = RandomizedSearchCV(pipeline, hyperparams, cv=5, scoring='accuracy', n_jobs=-1)
    # rs = GridSearchCV(pipeline, hyperparams, cv=5, scoring='accuracy', n_jobs=-1)
    rs.fit(X_train, y_train)
    
    # Make predictions on train data
    y_train_pred = rs.predict(X_train)
    
    # Compute metrics on train data
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_precision, train_recall, train_fscore, train_support = precision_recall_fscore_support(y_train, y_train_pred, average='macro')
    
    # Make predictions on test data
    y_test_pred = rs.predict(X_test)
    
    # Compute metrics on test data
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_precision, test_recall, test_fscore, test_support = precision_recall_fscore_support(y_test, y_test_pred, average='macro')

    # Store results in table
    results_table = results_table.append({
        'Classifier': clf_name,
        'Hyperparameters': rs.best_params_,
        'Train Accuracy': train_accuracy,
        'Train Precision': train_precision,
        'Train Recall': train_recall,
        'Train F1-score': train_fscore,
        'Test Accuracy': test_accuracy,
        'Test Precision': test_precision,
        'Test Recall': test_recall,
        'Test F1-score': test_fscore
    }, ignore_index=True)
    
    # Print progress bar
    remaining_iters = len(pipelines) - (pipelines.index((clf_name, pipeline)) + 1)
    print(f"{remaining_iters} iterations left")
    print("---------------------------------------------------------")

# Keep in mind that by default refit = True
# So, all metrics in the table are for the best found parameters 
  
# Print final results table
print("\nResults table:")
print(results_table)
results_table.to_csv('vh_class_cls_hp_results.csv')
files.download('vh_class_cls_hp_results.csv')


Classifiers:   0%|          | 0/4 [00:00<?, ?it/s]

Training Logistic Regression


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  results_table = results_table.append({
Classifiers:  25%|██▌       | 1/4 [03:30<10:31, 210.50s/it]

3 iterations left
---------------------------------------------------------
Training SVM


  results_table = results_table.append({
Classifiers:  50%|█████     | 2/4 [09:47<10:17, 308.59s/it]

2 iterations left
---------------------------------------------------------
Training Random Forest


  _warn_prf(average, modifier, msg_start, len(result))
  results_table = results_table.append({
Classifiers:  75%|███████▌  | 3/4 [13:18<04:23, 263.84s/it]

1 iterations left
---------------------------------------------------------
Training Born Rule


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  results_table = results_table.append({
Classifiers: 100%|██████████| 4/4 [13:50<00:00, 207.52s/it]

0 iterations left
---------------------------------------------------------

Results table:
            Classifier                                    Hyperparameters  \
0  Logistic Regression  {'clf__solver': 'saga', 'clf__penalty': 'l2', ...   
1                  SVM             {'clf__kernel': 'linear', 'clf__C': 1}   
2        Random Forest  {'clf__n_estimators': 100, 'clf__min_samples_s...   
3            Born Rule      {'clf__h': 1.0, 'clf__b': 0.5, 'clf__a': 1.0}   

   Train Accuracy  Train Precision  Train Recall  Train F1-score  \
0        0.576942         0.564503      0.477074        0.469817   
1        0.869273         0.907208      0.877787        0.890405   
2        0.995269         0.995644      0.988529        0.991907   
3        0.505727         0.515209      0.419817        0.365845   

   Test Accuracy  Test Precision  Test Recall  Test F1-score  
0       0.536318        0.531377     0.444078       0.430960  
1       0.615920        0.609857     0.576258       0.5




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
results_table

Unnamed: 0,Classifier,Hyperparameters,Train Accuracy,Train Precision,Train Recall,Train F1-score,Test Accuracy,Test Precision,Test Recall,Test F1-score
0,Logistic Regression,"{'clf__solver': 'saga', 'clf__penalty': 'l2', ...",0.576942,0.564503,0.477074,0.469817,0.536318,0.531377,0.444078,0.43096
1,SVM,"{'clf__kernel': 'linear', 'clf__C': 1}",0.869273,0.907208,0.877787,0.890405,0.61592,0.609857,0.576258,0.584171
2,Random Forest,"{'clf__n_estimators': 100, 'clf__min_samples_s...",0.995269,0.995644,0.988529,0.991907,0.602985,0.607708,0.535319,0.544353
3,Born Rule,"{'clf__h': 1.0, 'clf__b': 0.5, 'clf__a': 1.0}",0.505727,0.515209,0.419817,0.365845,0.472637,0.46401,0.392031,0.325513
