In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# read data, remove unused column, drop rows with missing values
data = pd.read_csv("data/wine_quality_classification.csv")
data.dropna(inplace=True)
data

Unnamed: 0,fixed_acidity,residual_sugar,alcohol,density,quality_label
0,9.3,6.4,13.6,1.0005,high
1,11.2,2.0,14.0,0.9912,medium
2,11.6,0.9,8.2,0.9935,low
3,12.9,6.6,12.7,1.0002,low
4,13.9,13.8,10.4,0.9942,medium
...,...,...,...,...,...
995,13.0,4.3,13.1,1.0035,low
996,6.4,5.3,9.5,1.0040,low
997,4.5,9.0,13.8,0.9931,medium
998,5.3,12.2,8.1,0.9955,low


In [3]:
def factorize_objs(df: pd.DataFrame):
    for colname, dtype in zip(df.keys(), df.dtypes.to_list()):
        if dtype == np.dtypes.ObjectDType:
            vals, keys = pd.factorize(df[colname])
            df[colname] = vals
            print(keys)

factorize_objs(data)
data

Index(['high', 'medium', 'low'], dtype='object')


Unnamed: 0,fixed_acidity,residual_sugar,alcohol,density,quality_label
0,9.3,6.4,13.6,1.0005,0
1,11.2,2.0,14.0,0.9912,1
2,11.6,0.9,8.2,0.9935,2
3,12.9,6.6,12.7,1.0002,2
4,13.9,13.8,10.4,0.9942,1
...,...,...,...,...,...
995,13.0,4.3,13.1,1.0035,2
996,6.4,5.3,9.5,1.0040,2
997,4.5,9.0,13.8,0.9931,1
998,5.3,12.2,8.1,0.9955,2


In [4]:
train, test = train_test_split(data, train_size=0.8, random_state=42)

target = "quality_label"
features = data.keys().to_list()
features.remove("quality_label")
features

X_train = train[features]
y_train = train[target]

X_test = train[features]
y_test = train[target]

In [5]:
from sklearn.svm import SVC

model = SVC(class_weight='balanced', decision_function_shape='ovo', gamma='auto', random_state=42)

model.fit(X_train, y_train)

In [6]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.58      0.54      0.56       272
           1       0.62      0.58      0.60       288
           2       0.53      0.62      0.57       240

    accuracy                           0.58       800
   macro avg       0.58      0.58      0.58       800
weighted avg       0.58      0.58      0.58       800



In [7]:
import dill
dill.settings["recurse"] = True

test.to_csv("clean_data/wine_test.csv", index=False)

with open("models/wine_svc.modelfile", 'wb') as f:
    dill.dump(model, f)
