In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
# Read the raw data, remove the unused "Unnamed: 0" column, and drop rows with
# missing values.
# "Unnamed: 0" is just a row counter (presumably a saved DataFrame index). It
# has to go here because every non-target column of `data` is later used as a
# model feature.
data = (
    pd.read_csv("data/wine_quality_classification.csv")
    .drop(columns=["Unnamed: 0"], errors="ignore")  # tolerate files saved without it
    .dropna()
    .copy()  # independent frame, so later column assignments don't raise SettingWithCopyWarning
)
data

Unnamed: 0,fixed_acidity,residual_sugar,alcohol,density,quality_label
0,9.3,6.4,13.6,1.0005,high
1,11.2,2.0,14.0,0.9912,medium
2,11.6,0.9,8.2,0.9935,low
3,12.9,6.6,12.7,1.0002,low
4,13.9,13.8,10.4,0.9942,medium
...,...,...,...,...,...
995,13.0,4.3,13.1,1.0035,low
996,6.4,5.3,9.5,1.0040,low
997,4.5,9.0,13.8,0.9931,medium
998,5.3,12.2,8.1,0.9955,low


In [4]:
# More feature engineering - from anchors.
# Discretize the four measurement columns into equal-width bins; the value is
# the number of bins for that column. Original note: density doesn't really
# matter (it is still binned, into 2 bins).
# labels=False makes pd.cut return integer bin indices instead of intervals.
bin_counts = {
    "fixed_acidity": 10,
    "residual_sugar": 3,
    "alcohol": 3,
    "density": 2,
}
for column, n_bins in bin_counts.items():
    data[column] = pd.cut(data[column], bins=n_bins, labels=False)


In [5]:
def factorize_objs(df: pd.DataFrame) -> dict:
    """Integer-encode every object-dtype column of ``df`` in place.

    Each object column is replaced by the codes from ``pd.factorize``, which
    numbers the distinct values in order of first appearance (missing values
    are encoded as -1). The distinct values of each encoded column are printed
    and also returned, so the codes can be mapped back to labels later.

    Args:
        df: Frame to encode. It is modified in place.

    Returns:
        Mapping of column name -> ``pd.Index`` of distinct values, where
        position ``i`` of the index is the label encoded as ``i``. Empty when
        ``df`` has no object-dtype columns.
    """
    categories = {}
    # Snapshot the dtypes first: columns are reassigned inside the loop.
    for colname, dtype in list(df.dtypes.items()):
        # Public pandas dtype check; comparing against np.dtypes.ObjectDType
        # requires NumPy >= 1.25 and relies on dtype-instance/class equality.
        if pd.api.types.is_object_dtype(dtype):
            codes, uniques = pd.factorize(df[colname])
            df[colname] = codes
            categories[colname] = uniques
            print(uniques)
    return categories

# Encode the object-dtype columns of `data` in place. The printed Index lists
# a column's labels in code order (position i is the label encoded as i).
factorize_objs(data)
data  # display the encoded frame

Index(['high', 'medium', 'low'], dtype='object')


Unnamed: 0,fixed_acidity,residual_sugar,alcohol,density,quality_label
0,4,1,2,1,0
1,5,0,2,0,1
2,6,0,0,0,2
3,7,1,2,1,2
4,8,2,1,0,1
...,...,...,...,...,...
995,7,0,2,1,2
996,1,0,0,1,2
997,0,1,2,0,1
998,1,2,0,0,2


In [6]:
# Every column except the label is used as a model feature.
# To ablate a feature (e.g. "density" or "alcohol"), filter it out of
# `features` here.
target = "quality_label"
features = [column for column in data.columns if column != "quality_label"]

# 80/20 split; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data[features],
    data[target],
    train_size=0.8,
    random_state=42,
)

In [7]:
from sklearn.svm import SVC

# Hyper-parameters for the support-vector classifier, kept in one place so
# they are easy to read and tweak.
svc_params = {
    "class_weight": "balanced",
    "decision_function_shape": "ovo",
    "gamma": "auto",
    "random_state": 42,
}
model = SVC(**svc_params)

# Fit on the training split; the fitted estimator is the cell's output.
model.fit(X_train, y_train)

In [8]:
from sklearn.metrics import classification_report

# Score the held-out split.
y_pred = model.predict(X_test)

# Build the per-class / aggregate metrics as a dict, flip it so each class or
# aggregate becomes a row, and emit the table as LaTeX source.
report = classification_report(y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(report).T
latex_table = report_df.to_latex()
print(latex_table)

\begin{tabular}{lrrrr}
\toprule
 & precision & recall & f1-score & support \\
\midrule
0 & 0.405063 & 0.450704 & 0.426667 & 71.000000 \\
1 & 0.418919 & 0.462687 & 0.439716 & 67.000000 \\
2 & 0.382979 & 0.290323 & 0.330275 & 62.000000 \\
accuracy & 0.405000 & 0.405000 & 0.405000 & 0.405000 \\
macro avg & 0.402320 & 0.401238 & 0.398886 & 200.000000 \\
weighted avg & 0.402859 & 0.405000 & 0.401157 & 200.000000 \\
\bottomrule
\end{tabular}



In [9]:
import dill

# Enable dill's "recurse" setting for the dump below.
dill.settings["recurse"] = True

# Persist only the held-out split (features + label). Writing the full `data`
# frame here would put the training rows into the "test" file as well.
test = X_test.join(y_test)
test.to_csv("clean_data/wine_test_p2.csv", index=False)

# Serialize the fitted classifier next to the test set it should be scored on.
with open("models/wine_svc_p2.modelfile", "wb") as f:
    dill.dump(model, f)