In [1]:
import sys
from path_conf import get_project_root
# Put <project root>/src on the import path so the project-local modules
# imported in the following cells can be found.
path_src = get_project_root() / "src"
sys.path.append(
    str(path_src.resolve())
)

In [2]:
from text_processing import TextProcessing
from dataset import Dataset

In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from joblib import Parallel, delayed

In [4]:
# Location of the dataset CSV handed to Dataset below.
# Raw string: the Windows backslashes are kept literally instead of being
# parsed as (invalid) escape sequences such as "\O" or "\d".
# NOTE(review): machine-specific absolute path — consider deriving it from
# get_project_root() so the notebook runs on other machines.
data_path = r"E:\OneDrive\Documents\GitHub\wine-classifier\data\dataset1_proc.csv"

In [5]:
# Integer code assigned to each label; passed to Dataset as `target_map`.
TARGET_MAP = dict(Rosato=0, Frizzante=1, Bianco=2, Rosso=3)

# Column names grouped by role; passed to Dataset as `columns_names`.
COLUMNS = dict(
    target=['type'],
    text=['review', 'winery', 'variety'],
    numerical=['price'],
    categorical=['appellation2'],
)

In [6]:
# Build the Dataset object from the CSV path, TARGET_MAP and COLUMNS.
# NOTE(review): the recorded output of this cell is the repr of the bound
# method `Dataset.print_shape`, which suggests the method is referenced
# without being called somewhere inside Dataset — confirm in dataset.py.
ds_obj = Dataset(
    data_path,
    target_map=TARGET_MAP,
    columns_names=COLUMNS,
)

<bound method Dataset.print_shape of <dataset.Dataset object at 0x00000290D8EC3D60>>


In [7]:
# Calling the Dataset instance yields the train and test sets; the recorded
# output shows their shapes as (12000, 4) and (3000, 4).
train_set, test_set = ds_obj()

The train shape: (12000, 4)
The test shape: (3000, 4)


In [12]:
# Inspect the target column of the training set (integer codes per the
# recorded output, dtype int64).
train_set['type']

14109    3
6673     3
11845    3
2738     2
11414    3
        ..
3875     2
14654    3
8081     3
276      0
10863    3
Name: type, Length: 12000, dtype: int64

In [13]:
# Split each set into features (everything except 'type') and target ('type').
X_train = train_set.drop(columns="type")
X_test = test_set.drop(columns="type")
y_train = train_set["type"]
y_test = test_set["type"]

In [14]:
# Define preprocessor for the pipeline
# NOTE(review): the selectors 'text', 'categorical' and 'numerical' are the
# role names used as keys of COLUMNS, not the column names listed there —
# confirm that Dataset really outputs columns with these names.
tfidf_transformer = TfidfVectorizer()
# handle_unknown='ignore': a category that is absent from a training fold
# (or from the training set) is encoded as all zeros instead of raising at
# transform time during cross-validation or test prediction.
onehot_transformer = OneHotEncoder(handle_unknown='ignore')
num_transformer = StandardScaler()
preprocessor = ColumnTransformer([
    # Scalar selector: the vectorizer receives a 1-D column of documents.
    ('tfidf', tfidf_transformer, 'text'),
    # List selectors: these transformers receive 2-D input.
    ('onehot', onehot_transformer, ['categorical']),
    ('num', num_transformer, ['numerical'])
])

In [None]:


# Define pipelines for classifiers
# Each entry is (display name, Pipeline); the display name is also the key
# used to look up the search space in `hyperparameters`.
pipelines = [
    ('Logistic Regression', Pipeline([
        ('preprocessor', preprocessor),
        # 'saga' supports both the 'l1' and 'l2' penalties in the search
        # space; the default 'lbfgs' solver rejects 'l1', so those candidates
        # would fail to fit. max_iter is raised because saga typically needs
        # more iterations than the default 100 to converge.
        ('clf', LogisticRegression(solver='saga', max_iter=1000, random_state=42))
    ])),
    ('SVM', Pipeline([
        ('preprocessor', preprocessor),
        ('clf', SVC(random_state=42))
    ])),
    ('Random Forest', Pipeline([
        ('preprocessor', preprocessor),
        ('clf', RandomForestClassifier(random_state=42))
    ]))
]

# Search spaces handed to RandomizedSearchCV in the training loop, keyed by
# classifier display name. The `clf__` prefix routes each parameter to the
# pipeline step named 'clf'.
hyperparameters = {
    'Logistic Regression': dict(
        clf__C=[0.1, 1, 10],
        clf__penalty=['l1', 'l2'],
    ),
    'SVM': dict(
        clf__C=[0.1, 1, 10],
        clf__kernel=['linear', 'rbf'],
    ),
    'Random Forest': dict(
        clf__n_estimators=[100, 200],
        clf__max_depth=[10, 20, None],
    ),
}

# Empty results table: one row per classifier is added by the training loop.
results_table = pd.DataFrame(
    columns=[
        'Classifier',
        'Hyperparameters',
        'Accuracy',
        'Precision',
        'Recall',
        'F1-score',
    ]
)

# Train and evaluate models
# For each (name, pipeline): run a 5-fold randomized search scored on
# accuracy, predict on the held-out test set, and record one row of metrics.
# Rows are collected in a plain list because DataFrame.append no longer
# exists in pandas >= 2.0.
result_rows = []
for clf_name, pipeline in pipelines:
    print("Training", clf_name)
    hyperparams = hyperparameters[clf_name]
    rs = RandomizedSearchCV(pipeline, hyperparams, cv=5, scoring='accuracy', n_jobs=-1)
    rs.fit(X_train, y_train)

    # Make predictions on test data
    y_pred = rs.predict(X_test)

    # Compute metrics
    accuracy = accuracy_score(y_test, y_pred)
    # Read the weighted averages from the structured report instead of
    # parsing the text report: splitting the "weighted avg" text line yields
    # the token 'avg' first, which cannot be converted to float.
    report = classification_report(y_test, y_pred, zero_division=0, output_dict=True)
    weighted_avg = report['weighted avg']
    precision = weighted_avg['precision']
    recall = weighted_avg['recall']
    f1 = weighted_avg['f1-score']

    # Store results in table
    result_rows.append({
        'Classifier': clf_name,
        'Hyperparameters': rs.best_params_,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1
    })
    # Rebuilt after every classifier so the table stays usable even if a
    # later (slow) fit is interrupted; column order of the table is kept.
    results_table = pd.DataFrame(result_rows, columns=results_table.columns)