# (0) Setup

In [1]:
# Colab

# !pip install -U rapidsai
# import cudf


In [2]:
!git clone https://github.com/kkonstantin182/wine-classifier.git
%cd wine-classifier
!pip install -r requirements.txt

Cloning into 'wine-classifier'...
remote: Enumerating objects: 98, done.[K
remote: Counting objects: 100% (98/98), done.[K
remote: Compressing objects: 100% (78/78), done.[K
remote: Total 98 (delta 49), reused 49 (delta 15), pack-reused 0[K
Unpacking objects: 100% (98/98), 2.32 MiB | 3.18 MiB/s, done.
/content/wine-classifier
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Unidecode==1.3.6
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.9/235.9 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bornrule==0.1.3
  Downloading bornrule-0.1.3-py3-none-any.whl (27 kB)
Collecting scikit_learn==1.2.1
  Downloading scikit_learn-1.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting spacy==3.

In [3]:
# Fetch the spaCy model package `it_core_news_sm` into the current environment.
# NOTE(review): presumably required by TextProcessing — confirm in src/text_processing.py.
!python -m spacy download it_core_news_sm

2023-05-07 13:18:11.017746: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting it-core-news-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/it_core_news_sm-3.5.0/it_core_news_sm-3.5.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m77.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: it-core-news-sm
Successfully installed it-core-news-sm-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_sm')


In [4]:
import sys
# Resolvable because the working directory was switched to the repository root
# by the `%cd wine-classifier` step in the setup cell.
from notebooks.path_conf import get_project_root
# Append <project root>/src to sys.path before importing the project modules.
# NOTE(review): presumably needed so modules inside src/ can import each other
# by bare name — confirm against the repository layout.
path_src = get_project_root() / "src"
sys.path.append(str(path_src.resolve()))

# Project-local classes and the SEED constant used by the cells below.
from src.text_processing import TextProcessing
from src.dataset import Dataset
from src.constants import SEED


In [34]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
# from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from tqdm import tqdm
from bornrule import BornClassifier
import multiprocessing as mp
from tqdm import tqdm

In [6]:
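# NOTE(review): disabled preprocessing cell, kept for reference. It appears to be
# the step that built the train/test splits and cleaned their 'text' column —
# presumably the origin of the *_text_cleaned.csv files loaded further down;
# confirm before deleting.
# data_path = get_project_root() / "data" / "dataset1_proc.csv"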

# TARGET_MAP = {
#     'Rosato': 0, 
#     'Frizzante': 1, 
#     'Bianco': 2, 
#     'Rosso': 3
# }

# COLUMNS = {
#     'target': ['type'],
#     'text': ['review', 'winery', 'variety'],
#     'numerical': ['price'],
#     'categorical': ['appellation2']
# }

# ds_obj = Dataset(data_path, target_map=TARGET_MAP, columns_names=COLUMNS)

# train_set, test_set = ds_obj()

# tp_obj_clean = TextProcessing(is_lemmatized=True)

# with mp.Pool(mp.cpu_count()) as pool:
#     train_set['text'] = pool.map(tp_obj_clean, train_set['text'])

# with mp.Pool(mp.cpu_count()) as pool:
#     test_set['text'] = pool.map(tp_obj_clean, test_set['text'])

In [12]:
# Column names of the raw dataset grouped by role. The 'categorical' and
# 'numerical' groups are the selectors handed to the ColumnTransformer; the
# 'text' group is not read by the active cells in this notebook.
COLUMNS = dict(
    target=['type'],
    text=['review', 'winery', 'variety'],
    numerical=['price'],
    categorical=['appellation2'],
)

In [7]:
# Project text-processing helper, constructed with is_lemmatized=True.
# In the active cells shown here it is only referenced by the commented-out
# `tokenizer=tp_obj_clean.tokenize` option of the TF-IDF vectorizer.
tp_obj_clean = TextProcessing(is_lemmatized=True)


In [10]:
# Load the train and test splits from the project's data directory.
data_dir = get_project_root() / "data"
train_set = pd.read_csv(data_dir / "dataset1_train_text_cleaned.csv")
test_set = pd.read_csv(data_dir / "dataset1_test_text_cleaned.csv")

In [11]:
# Sanity check: (rows, columns) of the train split, then of the test split.
print(train_set.shape, test_set.shape, sep="\n")


(12000, 5)
(3000, 5)


# (1) Experiments

In [35]:
# Separate the model inputs from the 'target' label column, split by split.
X_train = train_set.drop(columns="target")
y_train = train_set["target"]

X_test = test_set.drop(columns="target")
y_test = test_set["target"]

In [43]:
# Feature extractors, one per column group, combined into a single
# ColumnTransformer that every classifier pipeline reuses.

# TF-IDF with the vectorizer's default tokenizer. Disabled alternative that uses
# the project tokenizer:
#   tfidf_transformer = TfidfVectorizer(tokenizer=tp_obj_clean.tokenize)
tfidf_transformer = TfidfVectorizer()

# NOTE(review): the encoder uses its default handling of categories unseen
# during fit — confirm that is acceptable for cross-validation folds / test data.
onehot_transformer = OneHotEncoder()

# Disabled alternative scaler:
#   num_transformer = StandardScaler()
num_transformer = MinMaxScaler()

# The text column is selected by a single name; the categorical and numerical
# groups are selected by the name lists stored in COLUMNS.
preprocessor = ColumnTransformer(
    transformers=[
        ('tfidf', tfidf_transformer, 'text'),
        ('onehot', onehot_transformer, COLUMNS['categorical']),
        ('num', num_transformer, COLUMNS['numerical']),
    ]
)

In [44]:
# Candidate models as (display name, Pipeline) pairs. Each pipeline runs the
# shared `preprocessor` first and then the estimator under the step name 'clf',
# which is the prefix used by the search-space keys (e.g. 'clf__a').
# The display name is also the lookup key into `hyperparameters`.
pipelines = [
    # Disabled alternatives — re-enable together with the matching entry in
    # `hyperparameters`:
    #
    # ('Logistic Regression', Pipeline(steps=[
    #     ('preprocessor', preprocessor),
    #     ('clf', LogisticRegression(random_state=SEED)),
    # ])),
    #
    # ('SVM', Pipeline(steps=[
    #     ('preprocessor', preprocessor),
    #     ('clf', SVC(random_state=SEED)),
    # ])),
    #
    # ('Random Forest', Pipeline(steps=[
    #     ('preprocessor', preprocessor),
    #     ('clf', RandomForestClassifier(random_state=SEED)),
    # ])),

    (
        'Born Rule',
        Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('clf', BornClassifier()),
        ]),
    ),
]

# Search space per classifier, keyed by the display name used in `pipelines`.
# Every parameter name carries the 'clf__' prefix so that it addresses the
# 'clf' step of the corresponding pipeline. The lists hold the candidate values
# handed to the hyperparameter search in the training cell.
hyperparameters = {
    # Disabled search spaces for the classifiers that are currently switched off:
    #
    # 'Logistic Regression': {
    #     'clf__solver': ['saga'],
    #     'clf__penalty': ['l1', 'l2', 'elasticnet', 'none'],
    #     'clf__C': [0.1, 1, 10],
    # },
    #
    # 'SVM': {
    #     'clf__C': [0.01, 0.1, 1, 10, 100],
    #     'clf__kernel': ['linear', 'rbf']
    # },
    #
    # 'Random Forest': {
    #     'clf__n_estimators': [10, 100, 1000],
    #     'clf__max_depth': [10, 100, None],
    #     'clf__bootstrap': [True, False],
    #     'clf__min_samples_split': [2, 10, 100],
    # },

    'Born Rule': dict(
        # Wider candidate lists, currently disabled:
        #   clf__a=[0.25, 0.5, 1.0, 4.0]   (cannot be 0)
        #   clf__b=[.0, 0.25, 0.5, 1.0, 4.0]
        #   clf__h=[.0, 0.25, 0.5, 1.0, 4.0]
        clf__a=[1.0, 4.0],  # cannot be 0
        clf__b=[0.0, 0.25],
    ),
}

In [45]:
# Train every configured pipeline with a randomized hyperparameter search and
# collect one row of metrics per classifier.
#
# NOTE: the metrics are computed on the *training* split (X_train / y_train),
# i.e. on data the search has already been fitted on; X_test / y_test are not
# used in this cell.

# One dict per classifier. The rows are gathered in a plain list and turned
# into a DataFrame once at the end, because DataFrame.append is deprecated and
# no longer exists in pandas >= 2.0.
results_records = []

for position, (clf_name, pipeline) in enumerate(
        tqdm(pipelines, desc="Classifiers", total=len(pipelines)), start=1):
    print("Training", clf_name)
    hyperparams = hyperparameters[clf_name]

    # random_state makes the sampled candidates repeatable across runs.
    rs = RandomizedSearchCV(
        pipeline,
        hyperparams,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        random_state=SEED,
    )
    rs.fit(X_train, y_train)

    # Make predictions on train data (uses the refitted best estimator)
    y_pred = rs.predict(X_train)

    # Compute metrics; precision / recall / F1 are macro-averaged over classes
    accuracy = accuracy_score(y_train, y_pred)
    precision, recall, fscore, support = precision_recall_fscore_support(
        y_train,
        y_pred,
        average='macro',
    )

    # Store results for the final table
    results_records.append({
        'Classifier': clf_name,
        'Hyperparameters': rs.best_params_,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-score': fscore,
    })

    # Report how many classifiers are still to be trained
    remaining_iters = len(pipelines) - position
    print(f"{remaining_iters} iterations left")
    print("---------------------------------------------------------")

# Build the results table in one go; the explicit column list fixes the column
# order and keeps the headers even when no classifier was trained.
results_table = pd.DataFrame(results_records, columns=[
    'Classifier',
    'Hyperparameters',
    'Accuracy',
    'Precision',
    'Recall',
    'F1-score'])

# Print final results table and persist it next to the notebook
print("\nResults table:")
print(results_table)
results_table.to_csv('class_cls_hp_results.csv')

Classifiers:   0%|          | 0/1 [00:00<?, ?it/s]

Training Born Rule


  results_table = results_table.append({
Classifiers: 100%|██████████| 1/1 [00:10<00:00, 10.63s/it]

0 iterations left
---------------------------------------------------------

Results table:
  Classifier                  Hyperparameters  Accuracy  Precision   Recall  \
0  Born Rule  {'clf__b': 0.25, 'clf__a': 1.0}   0.93125   0.962923  0.79859   

   F1-score  
0   0.85236  





In [24]:
# Display the collected metrics table as the cell output.
# NOTE(review): the stored output under this cell lists Logistic Regression and an
# "Unnamed: 0" column, so it seems to come from an earlier run — re-execute to refresh.
results_table 

Unnamed: 0,Classifier,Hyperparameters,Accuracy,Precision,Recall,F1-score
0,Logistic Regression,"{'clf__solver': 'saga', 'clf__C': 1}",0.981917,0.981469,0.926994,0.950991
