Classical cls-s on the winemag data were explored. 

(0) Setup

In [1]:
!git clone https://github.com/kkonstantin182/wine-classifier.git
%cd wine-classifier
!pip install -r requirements.txt

Cloning into 'wine-classifier'...
remote: Enumerating objects: 125, done.[K
remote: Counting objects: 100% (125/125), done.[K
remote: Compressing objects: 100% (99/99), done.[K
remote: Total 125 (delta 71), reused 60 (delta 21), pack-reused 0[K
Receiving objects: 100% (125/125), 2.49 MiB | 3.36 MiB/s, done.
Resolving deltas: 100% (71/71), done.
/content/wine-classifier
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Unidecode==1.3.6
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.9/235.9 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bornrule==0.1.3
  Downloading bornrule-0.1.3-py3-none-any.whl (27 kB)
Collecting scikit_learn==1.2.1
  Downloading scikit_learn-1.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m44.7 MB/s[0m eta

In [2]:
# Custom packages

import sys
from notebooks.path_conf import get_project_root
path_src = get_project_root() / "src"
sys.path.append(str(path_src.resolve()))

from src.text_processing import TextProcessing, Vectorization
from src.dataset import Dataset
from src.constants import SEED

In [38]:
from google.colab import files

In [45]:
# Libraries

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
# from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from tqdm import tqdm
from bornrule import BornClassifier
import multiprocessing as mp
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import numpy as np

# Other 

nltk.download('punkt') # Tokenization
!python -m spacy download it_core_news_sm # Lemmatization, stop words



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting it-core-news-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/it_core_news_sm-3.5.0/it_core_news_sm-3.5.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_sm')


(I) Data

In [9]:
# Columns to use

COLUMNS = {
    'target': ['type'],
    'text': ['review', 'winery', 'variety'],
    'numerical': ['price'],
    'categorical': ['appellation2']
}

In [10]:
# Object for data cleaning
# Also has tokenization inside

tp_obj_clean = TextProcessing(is_lemmatized=True)


In [11]:
# Loading data

train_set = pd.read_csv(get_project_root() / "data" / "dataset1_train_text_cleaned.csv", index_col=False)
test_set = pd.read_csv(get_project_root() / "data" / "dataset1_test_text_cleaned.csv", index_col=False)

print(train_set.shape)
print(test_set.shape)

(12000, 4)
(3000, 4)


In [12]:
X_train, X_test = train_set.drop("target", axis=1), test_set.drop("target", axis=1)
y_train, y_test = train_set['target'], test_set['target']

# (II) Experiments


In [41]:
# Define transformation pipeline

# Feature trans-n

num_tranfsformer = Pipeline(
    steps=[("scaler", MinMaxScaler()),
           
    ]
)

cat_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
        
    ]
)

# For Born Rule the input cannot be negative
# Hence, we don't use numerical tran-s since on the test set 
# it produces negative numbers

preprocessor1 = ColumnTransformer(
    transformers=[
        ("text", TfidfVectorizer(tokenizer=Vectorization.tokenize_it), "text"),
         ("num", 'passthrough', COLUMNS['numerical']),
        ("cat", cat_transformer, COLUMNS['categorical']),
        
    ]
)

preprocessor2 = ColumnTransformer(
    transformers=[
        ("text", TfidfVectorizer(tokenizer=Vectorization.tokenize_it), "text"),
         ("num", num_tranfsformer, COLUMNS['numerical']),
        ("cat", cat_transformer, COLUMNS['categorical']),
        
    ]
)

In [42]:
# Define pipelines for classifiers
pipelines = [
    ('Logistic Regression', Pipeline([
        ('preprocessor', preprocessor2),
        ('clf', LogisticRegression(random_state=SEED))
    ])),

    ('SVM', Pipeline([
        ('preprocessor', preprocessor2),
        ('clf', SVC(random_state=SEED))
    ])),

    ('Random Forest', Pipeline([
        ('preprocessor', preprocessor2),
        ('clf', RandomForestClassifier(random_state=SEED))
    ])),

    ('Born Rule', Pipeline([
        ('preprocessor', preprocessor1),
        ('clf', BornClassifier())
    ]))

]

# Define hyperparameters for grid search
hyperparameters = {
    'Logistic Regression': {
        'clf__solver': ['saga'],
        'clf__penalty': ['l1', 'l2', None],
        'clf__C': [0.01, 0.1, 1, 10],
    },

    'SVM': {
        'clf__C': [0.01, 0.1, 1, 10, 100],
        'clf__kernel': ['linear', 'rbf']
    },
    
    'Random Forest': {
        'clf__n_estimators': [10, 100, 1000],
        'clf__max_depth': [10, 100, None],
        'clf__bootstrap': [True, False],
        'clf__min_samples_split': [2, 10, 100],
    },

    'Born Rule': {
        'clf__a': [0.25, 0.5, 1.0, 4.0], # Cannot be 0
        'clf__b': [.0, 0.25, 0.5, 1.0, 4.0],
        'clf__h': [.0, 0.25, 0.5, 1.0, 4.0],
        
    }
}

In [43]:
# # Define table to store results
# results_table = pd.DataFrame(columns=[
#     'Classifier', 
#     'Hyperparameters', 
#     'Accuracy', 
#     'Precision', 
#     'Recall', 
#     'F1-score'])

# # Train and evaluate models
# for clf_name, pipeline in tqdm(pipelines, desc="Classifiers", total=len(pipelines)):
#     print("Training", clf_name)
#     clf = pipeline.named_steps['clf']
#     hyperparams = hyperparameters[clf_name]
#     # rs = RandomizedSearchCV(pipeline, hyperparams, cv=5, scoring='accuracy', n_jobs=-1)
#     rs = GridSearchCV(pipeline, hyperparams, cv=5, scoring='accuracy', n_jobs=-1)
#     rs.fit(X_train, y_train)
    
#     # Make predictions on train data
#     y_pred = rs.predict(X_train)
    
#     # Compute metrics
#     accuracy = accuracy_score(y_train, 
#                               y_pred)
#     precision,recall,fscore,support=precision_recall_fscore_support(y_train, 
#                                                                     y_pred,
#                                                                     average='macro')

#     # Store results in table
#     results_table = results_table.append({
#         'Classifier': clf_name,
#         'Hyperparameters': rs.best_params_,
#         'Accuracy': accuracy,
#         'Precision': precision,
#         'Recall': recall,
#         'F1-score': fscore
#     }, ignore_index=True)
    
#     # Print progress bar
#     remaining_iters = len(pipelines) - (pipelines.index((clf_name, pipeline)) + 1)
#     print(f"{remaining_iters} iterations left")
#     print("---------------------------------------------------------")

#     # print(report)

# # Keep in mind that by default refit = True
# # So, all metrics in the table are for the best found parameters 

# # Print final results table
# print("\nResults table:")
# print(results_table)
# results_table.to_csv('class_cls_hp_results.csv')
# files.download('class_cls_hp_results.csv')

Classifiers:   0%|          | 0/4 [00:00<?, ?it/s]

Training Logistic Regression


  results_table = results_table.append({
Classifiers:  25%|██▌       | 1/4 [04:05<12:16, 245.58s/it]

3 iterations left
---------------------------------------------------------
Training SVM


  results_table = results_table.append({
Classifiers:  50%|█████     | 2/4 [18:27<20:16, 608.12s/it]

2 iterations left
---------------------------------------------------------
Training Random Forest


  results_table = results_table.append({
Classifiers:  75%|███████▌  | 3/4 [27:54<09:49, 589.49s/it]

1 iterations left
---------------------------------------------------------
Training Born Rule


  results_table = results_table.append({
Classifiers: 100%|██████████| 4/4 [30:08<00:00, 452.15s/it]

0 iterations left
---------------------------------------------------------

Results table:
            Classifier                                    Hyperparameters  \
0  Logistic Regression  {'clf__solver': 'saga', 'clf__penalty': 'l1', ...   
1                  SVM             {'clf__kernel': 'linear', 'clf__C': 1}   
2        Random Forest  {'clf__n_estimators': 1000, 'clf__min_samples_...   
3            Born Rule    {'clf__h': 0.25, 'clf__b': 1.0, 'clf__a': 0.25}   

   Accuracy  Precision    Recall  F1-score  
0  0.999750   0.999694  0.999679  0.999686  
1  0.997333   0.995755  0.992626  0.994179  
2  1.000000   1.000000  1.000000  1.000000  
3  0.954250   0.849033  0.972958  0.880694  





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [47]:
# Define table to store results
results_table = pd.DataFrame(columns=[
    'Classifier', 
    'Hyperparameters', 
    'Train Accuracy', 
    'Train Precision', 
    'Train Recall', 
    'Train F1-score',
    'Test Accuracy', 
    'Test Precision', 
    'Test Recall', 
    'Test F1-score'])

# Train and evaluate models
for clf_name, pipeline in tqdm(pipelines, desc="Classifiers", total=len(pipelines)):
    print("Training", clf_name)
    clf = pipeline.named_steps['clf']
    hyperparams = hyperparameters[clf_name]
    rs = RandomizedSearchCV(pipeline, hyperparams, cv=5, scoring='accuracy', n_jobs=-1)
    # rs = GridSearchCV(pipeline, hyperparams, cv=5, scoring='accuracy', n_jobs=-1)
    rs.fit(X_train, y_train)
    
    # Make predictions on train data
    y_train_pred = rs.predict(X_train)
    
    # Compute metrics on train data
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_precision, train_recall, train_fscore, train_support = precision_recall_fscore_support(y_train, y_train_pred, average='macro')
    
    # Make predictions on test data
    y_test_pred = rs.predict(X_test)
    
    # Compute metrics on test data
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_precision, test_recall, test_fscore, test_support = precision_recall_fscore_support(y_test, y_test_pred, average='macro')

    # Store results in table
    results_table = results_table.append({
        'Classifier': clf_name,
        'Hyperparameters': rs.best_params_,
        'Train Accuracy': train_accuracy,
        'Train Precision': train_precision,
        'Train Recall': train_recall,
        'Train F1-score': train_fscore,
        'Test Accuracy': test_accuracy,
        'Test Precision': test_precision,
        'Test Recall': test_recall,
        'Test F1-score': test_fscore
    }, ignore_index=True)
    
    # Print progress bar
    remaining_iters = len(pipelines) - (pipelines.index((clf_name, pipeline)) + 1)
    print(f"{remaining_iters} iterations left")
    print("---------------------------------------------------------")

# Keep in mind that by default refit = True
# So, all metrics in the table are for the best found parameters 
  
# Print final results table
print("\nResults table:")
print(results_table)
results_table.to_csv('class_cls_hp_results.csv')
files.download('class_cls_hp_results.csv')


Classifiers:   0%|          | 0/4 [00:00<?, ?it/s]

Training Logistic Regression


  results_table = results_table.append({
Classifiers:  25%|██▌       | 1/4 [03:59<11:59, 239.97s/it]

3 iterations left
---------------------------------------------------------
Training SVM


  results_table = results_table.append({
Classifiers:  50%|█████     | 2/4 [18:37<20:30, 615.09s/it]

2 iterations left
---------------------------------------------------------
Training Random Forest


  results_table = results_table.append({
Classifiers:  75%|███████▌  | 3/4 [28:51<10:14, 614.56s/it]

1 iterations left
---------------------------------------------------------
Training Born Rule


  results_table = results_table.append({
Classifiers: 100%|██████████| 4/4 [31:10<00:00, 467.57s/it]

0 iterations left
---------------------------------------------------------

Results table:
            Classifier                                    Hyperparameters  \
0  Logistic Regression  {'clf__solver': 'saga', 'clf__penalty': 'l1', ...   
1                  SVM             {'clf__kernel': 'linear', 'clf__C': 1}   
2        Random Forest  {'clf__n_estimators': 1000, 'clf__min_samples_...   
3            Born Rule   {'clf__h': 0.25, 'clf__b': 0.25, 'clf__a': 0.25}   

   Train Accuracy  Train Precision  Train Recall  Train F1-score  \
0        0.999750         0.999694      0.999679        0.999686   
1        0.997333         0.995755      0.992626        0.994179   
2        1.000000         1.000000      1.000000        1.000000   
3        0.959500         0.975530      0.761481        0.787904   

   Test Accuracy  Test Precision  Test Recall  Test F1-score  
0       0.987333        0.980691     0.957183       0.968396  
1       0.988333        0.979342     0.955803       0.9




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [48]:
results_table

Unnamed: 0,Classifier,Hyperparameters,Train Accuracy,Train Precision,Train Recall,Train F1-score,Test Accuracy,Test Precision,Test Recall,Test F1-score
0,Logistic Regression,"{'clf__solver': 'saga', 'clf__penalty': 'l1', ...",0.99975,0.999694,0.999679,0.999686,0.987333,0.980691,0.957183,0.968396
1,SVM,"{'clf__kernel': 'linear', 'clf__C': 1}",0.997333,0.995755,0.992626,0.994179,0.988333,0.979342,0.955803,0.967044
2,Random Forest,"{'clf__n_estimators': 1000, 'clf__min_samples_...",1.0,1.0,1.0,1.0,0.983333,0.984706,0.91791,0.946056
3,Born Rule,"{'clf__h': 0.25, 'clf__b': 0.25, 'clf__a': 0.25}",0.9595,0.97553,0.761481,0.787904,0.942333,0.962511,0.709813,0.717049
