In [2]:
import pandas as pd
df = pd.read_csv("winequality-white.csv",sep=';')

In [3]:
# import the necessary packages
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [14]:
# define the dictionary of models our script can use
# the key to the dictionary is the name of the model
# (supplied via command line argument) and the value is the model itself
models = {
"knn": KNeighborsClassifier(n_neighbors=1),
"naive_bayes": GaussianNB(),
"logit": LogisticRegression(solver="lbfgs", multi_class="auto"),
"svm": SVC(kernel="rbf", gamma="auto"),
"decision_tree": DecisionTreeClassifier(),
"random_forest": RandomForestClassifier(n_estimators=100),
}

In [39]:
# load the white wine dataset and perform a training and testing split
# using 75% of the data for training and 25% for evaluation
print("[INFO] loading data...")
(trainX, testX, trainY, testY) = train_test_split(df.iloc[:,0:11],
df['quality'], random_state=3, test_size=0.25)
# train the modelprint("[INFO] using '{}' model".format(model_name))
model = models['random_forest']
model.fit(trainX, trainY)
# make predictions on our data and show a classification report
print("[INFO] evaluating...")
predictions = model.predict(testX)
print(classification_report(testY, predictions))

[INFO] loading data...
[INFO] evaluating...
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         7
           4       0.54      0.22      0.31        32
           5       0.67      0.67      0.67       353
           6       0.64      0.78      0.70       566
           7       0.69      0.51      0.59       213
           8       0.84      0.31      0.45        52
           9       0.00      0.00      0.00         2

    accuracy                           0.66      1225
   macro avg       0.48      0.35      0.39      1225
weighted avg       0.66      0.66      0.65      1225



  'precision', 'predicted', average, warn_for)


In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
x = df.iloc[:,0:11]
# normalizing: mean=0; std=1
x = StandardScaler().fit_transform(x)
pca = PCA(n_components=2)
pC_df = pca.fit_transform(x)
principal_df = pd.DataFrame(data = pC_df,
columns = ['PC1', 'PC2'])
print('Explained variation per principal component:{}'.format(pca.explained_variance_ratio_))

Explained variation per principal component:[0.29293217 0.14320363]


In [38]:
#memisahkan data training dan data test untuk hasil PCA
(trainX, testX, trainY, testY) = train_test_split(principal_df,
df['quality'], random_state=3, test_size=0.25)
# train the modelprint("[INFO] using '{}' model".format(model_name))
model = models['random_forest']
model.fit(trainX, trainY)
# make predictions on our PCA data and show a classification report
print("[INFO] evaluating...")
predictions = model.predict(testX)
print(classification_report(testY, predictions))

[INFO] evaluating...
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         7
           4       0.20      0.09      0.13        32
           5       0.55      0.56      0.56       353
           6       0.60      0.67      0.63       566
           7       0.50      0.46      0.48       213
           8       0.72      0.35      0.47        52
           9       0.00      0.00      0.00         2

    accuracy                           0.57      1225
   macro avg       0.37      0.30      0.32      1225
weighted avg       0.56      0.57      0.56      1225



  'precision', 'predicted', average, warn_for)


In [20]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
x = df.iloc[:,0:11]
x = StandardScaler().fit_transform(x) # normalizing the features:
mean=0; std=1
y=df['quality']
lda = LinearDiscriminantAnalysis(n_components=2)
np_lda=lda.fit(x,y).transform(x)
df_lda=pd.DataFrame(np_lda)
print('Explained variation per principal component:{}'.format(lda.explained_variance_ratio_))

Explained variation per principal component:[0.83120809 0.11348134]


In [37]:
#memisahkan data training dan data test untuk hasil LDA
(trainX, testX, trainY, testY) = train_test_split(df_lda,
df['quality'], random_state=3, test_size=0.25)

# train the modelprint("[INFO] using '{}' model".format(model_name))
model = models['random_forest']
model.fit(trainX, trainY)

# make predictions on our LDA data and show a classification report
print("[INFO] evaluating...")
predictions = model.predict(testX)
print(classification_report(testY, predictions))

[INFO] evaluating...
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         7
           4       0.26      0.16      0.20        32
           5       0.62      0.60      0.61       353
           6       0.62      0.67      0.65       566
           7       0.48      0.48      0.48       213
           8       0.67      0.38      0.49        52
           9       0.00      0.00      0.00         2

    accuracy                           0.59      1225
   macro avg       0.38      0.33      0.35      1225
weighted avg       0.58      0.59      0.58      1225

