This notebook explores classical classifiers on the winemag data.

(0) Setup

In [1]:
# Fetch the project repository, switch the working directory into it,
# and install the packages listed in its requirements.txt.
!git clone https://github.com/kkonstantin182/wine-classifier.git
%cd wine-classifier
!pip install -r requirements.txt

Cloning into 'wine-classifier'...
remote: Enumerating objects: 164, done.[K
remote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 164 (delta 0), reused 0 (delta 0), pack-reused 159[K
Receiving objects: 100% (164/164), 2.75 MiB | 19.29 MiB/s, done.
Resolving deltas: 100% (86/86), done.
/content/wine-classifier
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Unidecode==1.3.6 (from -r requirements.txt (line 1))
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.9/235.9 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bornrule==0.1.3 (from -r requirements.txt (line 2))
  Downloading bornrule-0.1.3-py3-none-any.whl (27 kB)
Collecting scikit_learn==1.2.1 (from -r requirements.txt (line 5))
  Downloading scikit_learn-1.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)

In [2]:
# Custom packages

import sys
from notebooks.path_conf import get_project_root
# Put <project root>/src on the import path before importing the project modules
# (presumably so modules inside src can import each other by bare name -- TODO confirm).
path_src = get_project_root() / "src"
sys.path.append(str(path_src.resolve()))

from src.text_processing import TextProcessing, Vectorization
from src.dataset import Dataset
from src.constants import SEED

In [3]:
from google.colab import files

In [4]:
# Libraries

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
# from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from tqdm import tqdm
from bornrule import BornClassifier
import multiprocessing as mp
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import numpy as np

# Language resources downloaded at runtime (NLTK tokenizer data and the
# Italian spaCy model)

nltk.download('punkt') # Tokenization
!python -m spacy download it_core_news_sm # Lemmatization, stop words



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting it-core-news-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/it_core_news_sm-3.5.0/it_core_news_sm-3.5.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m69.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: it-core-news-sm
Successfully installed it-core-news-sm-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_sm')


(I) Data

In [5]:
# Columns to use, grouped by the role each one plays

COLUMNS = dict(
    target=['type'],
    text=['review', 'winery', 'variety'],
    numerical=['price'],
    categorical=['appellation2'],
)

In [6]:
# Data-cleaning helper (TextProcessing from src.text_processing), created
# with is_lemmatized=True.
# Per the original note, it also performs tokenization internally.

tp_obj_clean = TextProcessing(is_lemmatized=True)


In [9]:
# Read the pre-cleaned train / test CSV files from <project root>/data

data_dir = get_project_root() / "data"
train_set = pd.read_csv(data_dir / "wm_data_train_text_cleaned.csv", index_col=False)
test_set = pd.read_csv(data_dir / "wm_data_test_text_cleaned.csv", index_col=False)

# Report (rows, columns) of each split, train first
for split in (train_set, test_set):
    print(split.shape)

(12000, 4)
(3000, 4)


In [10]:
# Features: every column except the label column "target"
X_train = train_set.drop(columns="target")
X_test = test_set.drop(columns="target")

# Labels
y_train = train_set['target']
y_test = test_set['target']

# (II) Experiments


In [11]:
# Define transformation pipeline

# Numerical features: rescaled with MinMaxScaler
num_tranfsformer = Pipeline([("scaler", MinMaxScaler())])

# Categorical features: one-hot encoded, with handle_unknown="ignore"
cat_transformer = Pipeline([("encoder", OneHotEncoder(handle_unknown="ignore"))])

# NOTE(review): the earlier comment here said that, because the Born Rule
# classifier cannot take negative input and the numerical transformation
# produces negative numbers on the test set, the numerical transformer is
# not used. In this cell the "num" transformer IS part of the preprocessor,
# so that note does not describe the code below -- confirm which is intended.

# Column-wise preprocessing: TF-IDF on the "text" column (tokenized with
# Vectorization.tokenize_it), plus the numerical and categorical transformers
# on the columns named in COLUMNS.
preprocessor = ColumnTransformer([
    ("text", TfidfVectorizer(tokenizer=Vectorization.tokenize_it), "text"),
    ("num", num_tranfsformer, COLUMNS['numerical']),
    ("cat", cat_transformer, COLUMNS['categorical']),
])

In [13]:
!pip install catboost
import catboost as cb

# Define pipelines for classifiers
pipelines = [
    ('CatBoost', Pipeline([
        ('preprocessor1', preprocessor),
        ('clf', cb.CatBoostClassifier(random_state=SEED, iterations=30))
    ])),

]

# Define hyperparameters for grid search
hyperparameters = {
    'CatBoost': {
        'clf__iterations': [15],
        'clf__learning_rate': [0.001, 0.01, 0.1],
        'clf__depth': [6, 8, 10],
    },

}

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


In [14]:
# Columns of the results table: one row per classifier with the best
# hyperparameters found and the train / test metrics.
results_columns = [
    'Classifier',
    'Hyperparameters',
    'Train Accuracy',
    'Train Precision',
    'Train Recall',
    'Train F1-score',
    'Test Accuracy',
    'Test Precision',
    'Test Recall',
    'Test F1-score']

# Rows are collected in a plain list and turned into a DataFrame once, after
# the loop. DataFrame.append (used here previously) is deprecated and was
# removed in pandas 2.0.
results_rows = []

# Train and evaluate models
for position, (clf_name, pipeline) in enumerate(
        tqdm(pipelines, desc="Classifiers", total=len(pipelines)), start=1):
    print("Training", clf_name)
    hyperparams = hyperparameters[clf_name]
    # random_state keeps the sampled candidates repeatable between runs.
    # (GridSearchCV with the same arguments is the exhaustive alternative.)
    rs = RandomizedSearchCV(
        pipeline, hyperparams, cv=5, scoring='accuracy', n_jobs=-1,
        random_state=SEED)
    rs.fit(X_train, y_train)

    # Make predictions on train data
    y_train_pred = rs.predict(X_train)

    # Compute metrics on train data (macro-averaged over classes)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_precision, train_recall, train_fscore, train_support = precision_recall_fscore_support(
        y_train, y_train_pred, average='macro')

    # Make predictions on test data
    y_test_pred = rs.predict(X_test)

    # Compute metrics on test data (macro-averaged over classes)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_precision, test_recall, test_fscore, test_support = precision_recall_fscore_support(
        y_test, y_test_pred, average='macro')

    # Store results for the table
    results_rows.append({
        'Classifier': clf_name,
        'Hyperparameters': rs.best_params_,
        'Train Accuracy': train_accuracy,
        'Train Precision': train_precision,
        'Train Recall': train_recall,
        'Train F1-score': train_fscore,
        'Test Accuracy': test_accuracy,
        'Test Precision': test_precision,
        'Test Recall': test_recall,
        'Test F1-score': test_fscore
    })

    # Report how many classifiers are still to be processed
    remaining_iters = len(pipelines) - position
    print(f"{remaining_iters} iterations left")
    print("---------------------------------------------------------")

# Keep in mind that by default refit = True
# So, all metrics in the table are for the best found parameters

results_table = pd.DataFrame(results_rows, columns=results_columns)

# Print final results table
print("\nResults table:")
print(results_table)
results_table.to_csv('vh_cat_boost_hp_results.csv')
files.download('vh_cat_boost_hp_results.csv')


Classifiers:   0%|          | 0/1 [00:00<?, ?it/s]

Training CatBoost




0:	learn: 1.1542700	total: 5.73s	remaining: 1m 20s
1:	learn: 0.9943362	total: 12.7s	remaining: 1m 22s
2:	learn: 0.8698371	total: 18.3s	remaining: 1m 13s
3:	learn: 0.7730355	total: 25.2s	remaining: 1m 9s
4:	learn: 0.6932806	total: 30.8s	remaining: 1m 1s
5:	learn: 0.6262100	total: 37.7s	remaining: 56.6s
6:	learn: 0.5702472	total: 43.3s	remaining: 49.5s
7:	learn: 0.5194465	total: 49.9s	remaining: 43.7s
8:	learn: 0.4745102	total: 55.8s	remaining: 37.2s
9:	learn: 0.4367419	total: 1m 1s	remaining: 31s
10:	learn: 0.4046736	total: 1m 8s	remaining: 24.9s
11:	learn: 0.3742679	total: 1m 14s	remaining: 18.5s
12:	learn: 0.3461665	total: 1m 20s	remaining: 12.4s
13:	learn: 0.3241761	total: 1m 26s	remaining: 6.17s
14:	learn: 0.3013403	total: 1m 33s	remaining: 0us


  results_table = results_table.append({
Classifiers: 100%|██████████| 1/1 [30:23<00:00, 1823.10s/it]

0 iterations left
---------------------------------------------------------

Results table:
  Classifier                                    Hyperparameters  \
0   CatBoost  {'clf__learning_rate': 0.1, 'clf__iterations':...   

   Train Accuracy  Train Precision  Train Recall  Train F1-score  \
0         0.95825         0.959958      0.882242         0.91394   

   Test Accuracy  Test Precision  Test Recall  Test F1-score  
0       0.954667         0.95513     0.866536       0.901858  





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [15]:
# Display the results table as the cell's output
results_table

Unnamed: 0,Classifier,Hyperparameters,Train Accuracy,Train Precision,Train Recall,Train F1-score,Test Accuracy,Test Precision,Test Recall,Test F1-score
0,CatBoost,"{'clf__learning_rate': 0.1, 'clf__iterations':...",0.95825,0.959958,0.882242,0.91394,0.954667,0.95513,0.866536,0.901858


In [18]:
# Print the complete best-parameter dict of the first row
# (the table view shortens it with "...")
print(results_table['Hyperparameters'][0])

{'clf__learning_rate': 0.1, 'clf__iterations': 15, 'clf__depth': 10}


## Without hyperparameter tuning

In [19]:
# Baseline without hyperparameter tuning: fit the first pipeline in
# `pipelines` directly on the training data, using the classifier exactly as
# it was constructed there (no search over hyperparameters).
pipelines[0][1].fit(X_train, y_train)



Learning rate set to 0.5
0:	learn: 0.5947023	total: 2.48s	remaining: 1m 11s
1:	learn: 0.3819575	total: 3.22s	remaining: 45.1s
2:	learn: 0.2760384	total: 3.72s	remaining: 33.5s
3:	learn: 0.2195868	total: 4.13s	remaining: 26.8s
4:	learn: 0.1865148	total: 4.52s	remaining: 22.6s
5:	learn: 0.1558317	total: 4.94s	remaining: 19.8s
6:	learn: 0.1440604	total: 5.33s	remaining: 17.5s
7:	learn: 0.1334653	total: 5.71s	remaining: 15.7s
8:	learn: 0.1223945	total: 6.12s	remaining: 14.3s
9:	learn: 0.1149819	total: 6.65s	remaining: 13.3s
10:	learn: 0.1129047	total: 7.41s	remaining: 12.8s
11:	learn: 0.1078879	total: 8.1s	remaining: 12.2s
12:	learn: 0.1073103	total: 8.95s	remaining: 11.7s
13:	learn: 0.1050447	total: 9.84s	remaining: 11.2s
14:	learn: 0.1036420	total: 10.6s	remaining: 10.6s
15:	learn: 0.1020995	total: 11.1s	remaining: 9.72s
16:	learn: 0.1014274	total: 11.7s	remaining: 8.98s
17:	learn: 0.0960697	total: 12.4s	remaining: 8.25s
18:	learn: 0.0929395	total: 12.9s	remaining: 7.44s
19:	learn: 0.091

In [20]:
# Evaluate the untuned pipeline on the training data
y_train_pred_cb = pipelines[0][1].predict(X_train)
train_accuracy_cb = accuracy_score(y_train, y_train_pred_cb)
(
    train_precision_cb,
    train_recall_cb,
    train_fscore_cb,
    train_support_cb,
) = precision_recall_fscore_support(y_train, y_train_pred_cb, average='macro')

# Report accuracy, then the macro-averaged scores on a single line
print('CatBoost with the dafault val-s of hyp-s:')
print('Train Accuracy is: ', train_accuracy_cb)
print(
    'Train precision, recall, f1, support: ',
    "   ".join(
        str(score)
        for score in (train_precision_cb, train_recall_cb, train_fscore_cb, train_support_cb)
    ))

CatBoost with the dafault val-s of hyp-s:
Train Accuracy is:  0.9795833333333334
Train precision, recall, f1, support:  0.9769598160272716   0.9302791207594259   0.9511802177667377   None


In [21]:
# Evaluate the untuned pipeline on the test data
y_test_pred_cb = pipelines[0][1].predict(X_test)
test_accuracy_cb = accuracy_score(y_test, y_test_pred_cb)
(
    test_precision_cb,
    test_recall_cb,
    test_fscore_cb,
    test_support_cb,
) = precision_recall_fscore_support(y_test, y_test_pred_cb, average='macro')

# Report accuracy, then the macro-averaged scores on a single line
print('CatBoost with the dafault val-s of hyp-s:')
print('Test Accuracy is: ', test_accuracy_cb)
print(
    'Test precision, recall, f1, support: ',
    "   ".join(
        str(score)
        for score in (test_precision_cb, test_recall_cb, test_fscore_cb, test_support_cb)
    ))

CatBoost with the dafault val-s of hyp-s:
Test Accuracy is:  0.9783333333333334
Test precision, recall, f1, support:  0.9785971355566424   0.9103298004806172   0.9387619889947516   None
