# (0) Setup

In [12]:
# Colab

# !pip install -U rapidsai
# import cudf


In [13]:
!git clone https://github.com/kkonstantin182/wine-classifier.git
%cd wine-classifier
!pip install -r requirements.txt

Cloning into 'wine-classifier'...
remote: Enumerating objects: 117, done.[K
remote: Counting objects: 100% (117/117), done.[K
remote: Compressing objects: 100% (93/93), done.[K
remote: Total 117 (delta 64), reused 55 (delta 19), pack-reused 0[K
Receiving objects: 100% (117/117), 2.49 MiB | 16.01 MiB/s, done.
Resolving deltas: 100% (64/64), done.
/content/wine-classifier/wine-classifier
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [14]:

# import nltk
# nltk.download()

In [15]:
import sys
from notebooks.path_conf import get_project_root
path_src = get_project_root() / "src"
sys.path.append(str(path_src.resolve()))

from src.text_processing import TextProcessing, Vectorization
from src.dataset import Dataset
from src.constants import SEED


[0m

In [16]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
# from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from tqdm import tqdm
from bornrule import BornClassifier
import multiprocessing as mp
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import numpy as np

In [17]:
nltk.download('punkt')
!python -m spacy download it_core_news_sm

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting it-core-news-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/it_core_news_sm-3.5.0/it_core_news_sm-3.5.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m62.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_sm')


In [18]:
# data_path = get_project_root() / "data" / "dataset1_proc.csv"

# TARGET_MAP = {
#     'Rosato': 0, 
#     'Frizzante': 1, 
#     'Bianco': 2, 
#     'Rosso': 3
# }

# COLUMNS = {
#     'target': ['type'],
#     'text': ['review', 'winery', 'variety'],
#     'numerical': ['price'],
#     'categorical': ['appellation2']
# }

# ds_obj = Dataset(data_path, target_map=TARGET_MAP, columns_names=COLUMNS)

# train_set, test_set = ds_obj()

# tp_obj_clean = TextProcessing(is_lemmatized=True)

# with mp.Pool(mp.cpu_count()) as pool:
#     train_set['text'] = pool.map(tp_obj_clean, train_set['text'])

# with mp.Pool(mp.cpu_count()) as pool:
#     test_set['text'] = pool.map(tp_obj_clean, test_set['text'])

In [19]:
COLUMNS = {
    'target': ['type'],
    'text': ['review', 'winery', 'variety'],
    'numerical': ['price'],
    'categorical': ['appellation2']
}

In [20]:
tp_obj_clean = TextProcessing(is_lemmatized=True)


In [21]:
train_set = pd.read_csv(get_project_root() / "data" / "dataset1_train_text_cleaned.csv", index_col=False)
test_set = pd.read_csv(get_project_root() / "data" / "dataset1_test_text_cleaned.csv", index_col=False)

In [22]:
print(train_set.shape)
print(test_set.shape)


(12000, 4)
(3000, 4)


# (1) Experiments

In [28]:
X_train, X_test = train_set.drop("target", axis=1), test_set.drop("target", axis=1)
y_train, y_test = train_set['target'], test_set['target']

In [146]:
# Feature trans-n

num_tranfsformer = Pipeline(
    steps=[("scaler", MinMaxScaler()),
           
    ]
)

cat_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
        
    ]
)

# For Born Rule
preprocessor1 = ColumnTransformer(
    transformers=[
        ("text", TfidfVectorizer(tokenizer=Vectorization.tokenize_it), "text"),
         ("num", 'passthrough', COLUMNS['numerical']),
        ("cat", cat_transformer, COLUMNS['categorical']),
        
    ]
)

preprocessor2 = ColumnTransformer(
    transformers=[
        ("text", TfidfVectorizer(tokenizer=Vectorization.tokenize_it), "text"),
         ("num", num_tranfsformer, COLUMNS['numerical']),
        ("cat", cat_transformer, COLUMNS['categorical']),
        
    ]
)


In [148]:
# Define pipelines for classifiers
pipelines = [
    ('Logistic Regression', Pipeline([
        ('preprocessor', preprocessor2),
        ('clf', LogisticRegression(random_state=SEED))
    ])),

    ('SVM', Pipeline([
        ('preprocessor', preprocessor2),
        ('clf', SVC(random_state=SEED))
    ])),

    ('Random Forest', Pipeline([
        ('preprocessor', preprocessor2),
        ('clf', RandomForestClassifier(random_state=SEED))
    ])),

    ('Born Rule', Pipeline([
        ('preprocessor', preprocessor1),
        ('clf', BornClassifier())
    ]))

]

# Define hyperparameters for grid search
hyperparameters = {
    'Logistic Regression': {
        'clf__solver': ['saga'],
        'clf__penalty': ['l1', 'l2', 'elasticnet', 'none'],
        'clf__C': [0.1, 1, 10],
    },

    'SVM': {
        'clf__C': [0.01, 0.1, 1, 10, 100],
        'clf__kernel': ['linear', 'rbf']
    },
    
    'Random Forest': {
        'clf__n_estimators': [10, 100, 1000],
        'clf__max_depth': [10, 100, None],
        'clf__bootstrap': [True, False],
        'clf__min_samples_split': [2, 10, 100],
    },

    'Born Rule': {
        'clf__a': [0.25, 0.5, 1.0, 4.0], # Cannot be 0
        'clf__b': [.0, 0.25, 0.5, 1.0, 4.0],
        'clf__h': [.0, 0.25, 0.5, 1.0, 4.0],
        
    }
}

In [81]:
# Define table to store results
results_table = pd.DataFrame(columns=[
    'Classifier', 
    'Hyperparameters', 
    'Accuracy', 
    'Precision', 
    'Recall', 
    'F1-score'])

# Train and evaluate models
for clf_name, pipeline in tqdm(pipelines, desc="Classifiers", total=len(pipelines)):
    print("Training", clf_name)
    clf = pipeline.named_steps['clf']
    hyperparams = hyperparameters[clf_name]
    rs = RandomizedSearchCV(pipeline, hyperparams, cv=5, scoring='accuracy', n_jobs=-1)
    rs.fit(X_train, y_train)
    
    # Make predictions on train data
    y_pred = rs.predict(X_train)
    
    # Compute metrics
    accuracy = accuracy_score(y_train, 
                              y_pred)
    precision,recall,fscore,support=precision_recall_fscore_support(y_train, 
                                                                    y_pred,
                                                                    average='macro')

    # Store results in table
    results_table = results_table.append({
        'Classifier': clf_name,
        'Hyperparameters': rs.best_params_,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-score': fscore
    }, ignore_index=True)
    
    # Print progress bar
    remaining_iters = len(pipelines) - (pipelines.index((clf_name, pipeline)) + 1)
    print(f"{remaining_iters} iterations left")
    print("---------------------------------------------------------")

    # print(report)
    
# Print final results table
print("\nResults table:")
print(results_table)
results_table.to_csv('class_cls_hp_results.csv')

Classifiers:   0%|          | 0/1 [00:00<?, ?it/s]

Training Born Rule


  results_table = results_table.append({
Classifiers: 100%|██████████| 1/1 [00:51<00:00, 51.95s/it]

0 iterations left
---------------------------------------------------------

Results table:
  Classifier                  Hyperparameters  Accuracy  Precision    Recall  \
0  Born Rule  {'clf__b': 0.25, 'clf__a': 4.0}  0.981167   0.966896  0.938009   

   F1-score  
0  0.951584  





In [82]:
results_table 

Unnamed: 0,Classifier,Hyperparameters,Accuracy,Precision,Recall,F1-score
0,Born Rule,"{'clf__b': 0.25, 'clf__a': 4.0}",0.981167,0.966896,0.938009,0.951584


In [83]:
# Set hyperparameters for Born Rule model
pipelines[0][1].set_params(clf__a=1.0, clf__b=0.25)

# Fit the pipeline
pipelines[0][1].fit(X_train, y_train)

# Make predictions on test data
y_pred = pipelines[0][1].predict(X_test)

# Compute accuracy
accuracy = accuracy_score(y_test, y_pred)

# Print accuracy
print("Accuracy:", accuracy)



ValueError: ignored

In [120]:
def tfidf_vectorize(train_set, test_set, tokenizer, lowercase=True):
 
        vectorizer = TfidfVectorizer(tokenizer=tokenizer, lowercase=lowercase)
        vectorizer.fit(train_set)
        tokens_train_vec =  vectorizer.transform((train_set))
        tokens_test_vec = vectorizer.transform((test_set))
        features_names = vectorizer.get_feature_names_out()
        return tokens_train_vec, tokens_test_vec, features_names

In [121]:
X_train_tok, X_test_tok, feat = tfidf_vectorize(X_train['text'], X_test['text'], tokenizer=Vectorization.tokenize_it)



In [122]:
bc = BornClassifier()

In [123]:
bc.fit(X_train_tok, y_train)

In [124]:
bc.predict(X_test_tok)

array([3, 2, 3, ..., 3, 3, 3])