# prepare dataset

In [1]:
# load libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score 

In [None]:
# ======================================== prepare dataset
red_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', sep= ';')
white_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv', sep= ';')

red_wine.astype('int64').dtypes
white_wine.astype('int64').dtypes

## create a new variable 'wine_type'
red_wine['wine_type'] = 'red'

# bucket wine quality scores into qualitative quality labels
red_wine['quality_label'] = red_wine['quality'].apply(lambda value: 'low'
if value <= 5 else 'medium'
if value <= 7 else 'high')
red_wine

red_wine['quality_label'] = pd.Categorical(red_wine['quality_label'],
categories=['low', 'medium', 'high'])
red_wine['quality_label'].dtype

# create a new variable 'wine_type'
white_wine['wine_type'] = 'white'

# bucket wine quality scores into qualitative quality labels

white_wine['quality_label'] = white_wine['quality'].apply(lambda value: 'low'
if value <= 5 else 'medium'
if value <= 7 else 'high')

white_wine['quality_label'] = pd.Categorical(white_wine['quality_label'],
categories=['low', 'medium', 'high'])

white_wine

# combine two dataframe (red_wine) & (white_wine)
wines = pd.concat([red_wine, white_wine])

# re-shuffle records just to randomize data points
wines = wines.sample(frac=1, random_state=42).reset_index(drop=True)
wines

# preprocessing - IV/DV, data split, feature scaling (feature selection)

In [12]:
# preprocessing - categorical DV : string to numeric 
from sklearn.preprocessing import LabelEncoder

X = wines[['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']]

# label encoder: https://www.mygreatlearning.com/blog/label-encoding-in-python/#labelencodingusingpython 
labelencoder = LabelEncoder()
wines['wine_type_N'] = labelencoder.fit_transform(wines['wine_type']) #red: 0, white: 1
wines['wine_quality_N'] = labelencoder.fit_transform(wines['quality_label']) #high:0, low:1, medium:2 

In [28]:
# =======preprocessing : prepare dataset for classification
w_features = wines.iloc[:,:-5]
w_feature_names = w_features.columns

# set IV and DV
X = w_features
y = wines['quality_label']
#y = w_features['wine_type']

# split dataset for predicting wine quality
wq_train_X, wq_test_X, wq_train_y, wq_test_y = train_test_split(X, y, test_size=0.3, random_state=42)
print(Counter(wq_train_y), Counter(wq_test_y))
print('Features:', list(w_feature_names))

Counter({'medium': 2737, 'low': 1666, 'high': 144}) Counter({'medium': 1178, 'low': 718, 'high': 54})
Features: ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']


In [None]:
# standardization
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

#scaler = MinMaxScaler()
sc = StandardScaler()
X = np.array(X)

sc.fit_transform(wq_train_X)
sc.transform(wq_test_X)
# why different transformation for train/test set? https://datascience.stackexchange.com/questions/12321/whats-the-difference-between-fit-and-fit-transform-in-scikit-learn-models

In [48]:
#select top10 features - option2: randomforest 
#Random Forests and decision trees, in general, give preference to features with high cardinality ( Trees are biased to these type of variables ).
#Correlated features will be given equal or similar importance, but overall reduced importance compared to the same tree built without correlated counterparts.
#https://towardsdatascience.com/feature-selection-using-random-forest-26d7b747597f
from sklearn.ensemble.forest import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

sel = SelectFromModel(RandomForestClassifier(n_estimators = 100))
sel.fit(wq_train_X, wq_train_y)

sel.get_support()
selected_feat= wq_train_X.columns[(sel.get_support())]
print(selected_feat) #['volatile acidity', 'density', 'alcohol']

Index(['volatile acidity', 'density', 'alcohol'], dtype='object')


In [None]:
#extract top 10 best features for predicting wine quality - option1:SelectKBest class
# source: https://thecleverprogrammer.com/2020/06/30/feature-selection-techniques-in-machine-learning-with-python/
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Features','Score']  #naming the dataframe columns
print(featureScores.nlargest(10,'Score'))

# top5: total sulfur dioxide, free sulfur dioxide, alcohol, volatile acidity, residual sugar

#output
#Features       Score
#6   total sulfur dioxide  431.709881
#5    free sulfur dioxide  212.255709
#10               alcohol  151.229253
#1       volatile acidity   37.105396
#3         residual sugar   28.531646
#0          fixed acidity    9.439111
#4              chlorides    5.110229
#2            citric acid    2.500478
#9              sulphates    0.572889
#8                     pH    0.019861

# train and predict a model - LogisticRegression

In [37]:
# define parameters of the logisitic regression model 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
verbose=0, warm_start=False)

# =======train a model : fit the Logistic Regression classifier
wq_lr = LogisticRegression()
wq_lr.fit(wq_train_X, wq_train_y)
wq_lr.score(wq_test_X, wq_test_y)

# =======predict the wine type and evaluate the performance
wq_lr_predictions = wq_lr.predict(wq_test_X)
#print(classification_report(wq_test_y,wq_lr_predictions, target_names=['red', 'white']))
print(classification_report(wq_test_y,wq_lr_predictions))

#from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
#confusion_matrix(y_test, ypred)
#classification_report(y_test, ypred)

              precision    recall  f1-score   support

        high       0.00      0.00      0.00        54
         low       0.62      0.56      0.59       718
      medium       0.72      0.79      0.75      1178

    accuracy                           0.68      1950
   macro avg       0.44      0.45      0.45      1950
weighted avg       0.66      0.68      0.67      1950



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [54]:
accuracy_score(wq_test_y,wq_lr_predictions) #0.6815384615384615

0.6815384615384615

# train and preidct a model - random forest

In [51]:
#Train the random forests model
#wq_train_X, wq_test_X, wq_train_y, wq_test_y
from sklearn.ensemble import RandomForestClassifier

random_forest_model = RandomForestClassifier()
random_forest_model.fit(wq_train_X, wq_train_y)
wq_rf_predictions = random_forest_model.predict(wq_test_X)

#Measure the performance of the random forest model

print(classification_report(wq_test_y, wq_rf_predictions))
print(confusion_matrix(wq_test_y, wq_rf_predictions))

              precision    recall  f1-score   support

        high       0.94      0.28      0.43        54
         low       0.78      0.71      0.75       718
      medium       0.81      0.88      0.84      1178

    accuracy                           0.80      1950
   macro avg       0.84      0.62      0.67      1950
weighted avg       0.80      0.80      0.80      1950

[[  15    2   37]
 [   0  513  205]
 [   1  144 1033]]


In [55]:
accuracy_score(wq_test_y,wq_rf_predictions) #0.8005128205128205

0.8005128205128205

# compare algorithms based on accuracy

In [57]:
# Compare Algorithms
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC


In [58]:
# load dataset
X = X
Y = y 

# prepare configuration for cross validation test harness
seed = 7
# prepare models
models = []
models.append(('LR', LogisticRegression()))
#models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier(n_neighbors = 20)))
models.append(('RF', RandomForestClassifier()))
models.append(('CART', DecisionTreeClassifier()))
#models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

In [None]:
# evaluate each model in turn - wine_quality
results = []
names = []
scoring = 'accuracy'
for name, model in models:
	kfold = model_selection.KFold(n_splits=10, random_state=42, shuffle=True)
	cv_results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
	results.append(cv_results)
	names.append(name)
	msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
	print(msg)
 
# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

#Output
#LR: 0.679856 (0.015070)
#KNN: 0.647381 (0.017921)
#RF: 0.820534 (0.011657)
#CART: 0.749419 (0.017380)
#SVM: 0.603201 (0.021367)
#> RF is the winner

# regression - using k-nearest neighbors model

In [None]:
# code source: https://www.datatechnotes.com/2019/04/regression-example-with-k-nearest.html

# prepare dataset

In [64]:
#Train the knn model
from sklearn.metrics import mean_squared_error 
from sklearn.neighbors import KNeighborsRegressor

K = 8 
wq_knn_reg = KNeighborsRegressor(n_neighbors=K)
print(wq_knn_reg)

KNeighborsRegressor(algorithm='auto', leaf_size=30, 
          metric_params=None, n_jobs=1, n_neighbors=8, p=2,
          weights='uniform') 

wq_knn_reg.fit(wq_train_X, wq_train_y)


KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=8, p=2,
                    weights='uniform')


KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=8, p=2,
                    weights='uniform')

In [None]:
# =======predict the wine type and evaluate the performance
wq_knn_reg_pred = wq_knn_reg.predict(wq_test_X)

#i don't know how to solve this problem and have no time anymore today

# regression - using random forest model

In [None]:
# source: https://www.datatechnotes.com/2020/09/regression-example-with-randomforestregressor.html
