In [None]:
import sys
from path_conf import get_project_root
path_src = get_project_root() / "src"
sys.path.append(str(path_src.resolve()))

from text_processing import TextProcessing
from dataset import Dataset
from constants import SEED


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from tqdm import tqdm
from bornrule import BornClassifier



In [None]:
# Path to the dataset CSV, located in the "data" directory under the project root.
data_path = get_project_root() / "data" / "dataset1_proc.csv"

In [None]:
# Mapping from class label name to the integer code used for it.
TARGET_MAP = dict(Rosato=0, Frizzante=1, Bianco=2, Rosso=3)

# Column names grouped by the role they play (target, text, numerical, categorical).
COLUMNS = dict(
    target=['type'],
    text=['review', 'winery', 'variety'],
    numerical=['price'],
    categorical=['appellation2'],
)

In [None]:
# Build the project's Dataset helper from the CSV path, the label-to-integer
# mapping and the column-role groups.
ds_obj = Dataset(
    data_path,
    target_map=TARGET_MAP,
    columns_names=COLUMNS,
)

In [None]:
# Calling the Dataset instance returns two objects, unpacked here as the
# training set and the test set.
(train_set, test_set) = ds_obj()

In [None]:
# Separate the feature columns from the "type" target column in each partition.
X_train = train_set.drop("type", axis=1)
X_test = test_set.drop("type", axis=1)
y_train = train_set['type']
y_test = test_set['type']

In [None]:
# Define preprocessor for the pipeline.
# Each feature group gets its own transformer; ColumnTransformer applies them to
# the selected columns and concatenates the outputs into one feature matrix.
tfidf_transformer = TfidfVectorizer()
# handle_unknown='ignore': a category that shows up only in a validation fold or
# in the test set is encoded as all zeros instead of raising at transform time.
onehot_transformer = OneHotEncoder(handle_unknown='ignore')
# NOTE(review): standardised values can be negative; confirm that every
# classifier fed by this preprocessor accepts negative features
# (BornClassifier may require non-negative input — verify).
num_transformer = StandardScaler()
preprocessor = ColumnTransformer([
    # The column is given as a scalar string (not a list) so that the
    # vectorizer receives a 1-D sequence of documents.
    # NOTE(review): assumes the frames expose a single 'text' column — confirm
    # against what Dataset produces from COLUMNS['text'].
    ('tfidf', tfidf_transformer, 'text'),
    ('onehot', onehot_transformer, COLUMNS['categorical']),
    ('num', num_transformer, COLUMNS['numerical'])
])


In [None]:
# Define pipelines for classifiers: each one chains the shared preprocessor
# with a different estimator under the step name 'clf'.
pipelines = [
    ('Logistic Regression', Pipeline([
        ('preprocessor', preprocessor),
        ('clf', LogisticRegression(random_state=SEED))
    ])),

    ('SVM', Pipeline([
        ('preprocessor', preprocessor),
        ('clf', SVC(random_state=SEED))
    ])),

    ('Random Forest', Pipeline([
        ('preprocessor', preprocessor),
        ('clf', RandomForestClassifier(random_state=SEED))
    ])),

    ('Born Rule', Pipeline([
        ('preprocessor', preprocessor),
        ('clf', BornClassifier())
    ])),
]

# Define hyperparameter search spaces for the randomized search, keyed by the
# classifier names used in `pipelines`.
hyperparameters = {
    # Two sub-spaces: 'elasticnet' needs an l1_ratio, which the other penalties
    # do not use. Sampling 'elasticnet' without l1_ratio makes the fit fail.
    'Logistic Regression': [
        {
            'clf__solver': ['saga'],
            # NOTE(review): the string 'none' is rejected by recent scikit-learn
            # releases, which expect None instead; adjust to the installed version.
            'clf__penalty': ['l1', 'l2', 'none'],
            'clf__C': [0.1, 1, 10, 100],
        },
        {
            'clf__solver': ['saga'],
            'clf__penalty': ['elasticnet'],
            'clf__l1_ratio': [0.25, 0.5, 0.75],
            'clf__C': [0.1, 1, 10, 100],
        },
    ],

    'SVM': {
        'clf__C': [0.1, 1, 10],
        'clf__kernel': ['linear', 'rbf']
    },

    'Random Forest': {
        'clf__n_estimators': [100, 200],
        'clf__max_depth': [10, 20, None]
    },

    'Born Rule': {
        'clf__a': [0.125, 0.25, 0.5, 1.0, 2.0, 8.0], # Cannot be 0
        'clf__b': [.0, 0.125, 0.25, 0.5, 1.0, 2.0, 8.0],
        'clf__h': [.0, 0.125, 0.25, 0.5, 1.0, 2.0, 8.0],
    }
}

# One record per classifier; the results table is built once after the loop
# instead of being grown row by row (DataFrame.append no longer exists in
# pandas 2.x).
results_columns = ['Classifier', 'Hyperparameters', 'Accuracy', 'Precision', 'Recall', 'F1-score']
results_records = []

# Train and evaluate models
for position, (clf_name, pipeline) in enumerate(pipelines, start=1):
    print("Training", clf_name)
    hyperparams = hyperparameters[clf_name]
    # random_state makes the sampled candidates reproducible across runs.
    rs = RandomizedSearchCV(
        pipeline,
        hyperparams,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        random_state=SEED,
    )
    rs.fit(X_train, y_train)

    # Make predictions on test data
    y_pred = rs.predict(X_test)

    # Compute metrics. The report is requested as a dict so the weighted
    # averages can be read by key; parsing the text report is fragile because
    # the "weighted avg" row label itself contains a space.
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, zero_division=0, output_dict=True)
    weighted_avg = report['weighted avg']
    precision = weighted_avg['precision']
    recall = weighted_avg['recall']
    f1 = weighted_avg['f1-score']

    # Store results for the final table
    results_records.append({
        'Classifier': clf_name,
        'Hyperparameters': rs.best_params_,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1
    })

    # Print results for current iteration
    print("\nResults for", clf_name)
    print("Best hyperparameters:", rs.best_params_)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)

    # Report how many classifiers are still to be trained
    remaining_iters = len(pipelines) - position
    print(f"{remaining_iters} iterations left")
    print("---------------------------------------------------------")

# Build and print final results table
results_table = pd.DataFrame(results_records, columns=results_columns)
print("\nResults table:")
print(results_table)

In [None]:
# # Define pipelines for classifiers
# pipelines = [
#     ('Logistic Regression', Pipeline([
#         ('preprocessor', preprocessor),
#         ('clf', LogisticRegression(random_state=SEED))
#     ])),
#     ('SVM', Pipeline([
#         ('preprocessor', preprocessor),
#         ('clf', SVC(random_state=SEED))
#     ])),
#     ('Random Forest', Pipeline([
#         ('preprocessor', preprocessor),
#         ('clf', RandomForestClassifier(random_state=SEED))
#     ]))
# ]

# # Define hyperparameters for grid search
# hyperparameters = {
#     'Logistic Regression': {
#         'clf__solver': ['saga'],
#         'clf__penalty': ['l1', 'l2', 'elasticnet', 'none'],
#         'clf__C': [0.1, 1, 10, 100],
#     },
#     'SVM': {
#         'clf__C': [0.1, 1, 10],
#         'clf__kernel': ['linear', 'rbf']
#     },
#     'Random Forest': {
#         'clf__n_estimators': [100, 200],
#         'clf__max_depth': [10, 20, None]
#     }
# }

# # Define table to store results
# results_table = pd.DataFrame(columns=['Classifier', 'Hyperparameters', 'Accuracy', 'Precision', 'Recall', 'F1-score'])

# # Train and evaluate models
# for clf_name, pipeline in pipelines:
#     print("Training", clf_name)
#     clf = pipeline.named_steps['clf']
#     hyperparams = hyperparameters[clf_name]
#     rs = RandomizedSearchCV(pipeline, hyperparams, cv=5, scoring='accuracy', n_jobs=-1)
#     rs.fit(X_train, y_train)
    
#     # Make predictions on test data
#     y_pred = rs.predict(X_test)
    
#     # Compute metrics
#     accuracy = accuracy_score(y_test, y_pred)
#     report = classification_report(y_test, y_pred, zero_division=0)
#     precision, recall, f1, _ = map(float, report.split("\n")[-2].split()[1:])
    
#     # Store results in table
#     results_table = results_table.append({
#         'Classifier': clf_name,
#         'Hyperparameters': rs.best_params_,
#         'Accuracy': accuracy,
#         'Precision': precision,
#         'Recall': recall,
#         'F1-score': f1
#     }, ignore_index=True)
    
#     # # Print results for current iteration
#     # print("\nResults for", clf_name)
#     # print("Best hyperparameters:", rs.best_params_)
#     # print("Accuracy:", accuracy)
#     # print("Precision:", precision)
#     # print("Recall:", recall)
#     # print("F1-score:", f1)
    
# # Print final results table
# print("\nResults table:")
# print(results_table)