In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Exploratory Data Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Load the white-wine quality table; the CSV uses ';' as its field delimiter.
wine_df = pd.read_csv(
    '../input/white-wine-quality/winequality-white.csv',
    delimiter=';',
)
# Show the first rows as the cell output.
wine_df.head()

Description of each feature (excluding the target, `quality`):

*  ***volatile acidity***: The gaseous acids present in wine.
*  ***fixed acidity***: The primary fixed acids found in wine are tartaric, succinic, citric, and malic.
*  ***residual sugar***: Amount of sugar left after fermentation.
*  ***citric acid***: A weak organic acid found naturally in citrus fruits.
*  ***chlorides***: Amount of salt present in wine.
*  ***free sulfur dioxide***: SO₂ is used to protect wine from oxidation and microbial spoilage.
*  ***total sulfur dioxide***
*  ***pH***: In wine, pH is used to measure acidity.
*  ***density***
*  ***sulphates***: Added sulfites preserve freshness and protect wine from oxidation and bacteria.
*  ***alcohol***: Percentage of alcohol present in wine.

In [None]:
wine_df.info()

In [None]:
wine_df.describe().T

In [None]:
wine_df.shape

In [None]:
# Split the table into the predictor columns and the 'quality' target column,
# then preview the first rows of each.
target = wine_df['quality']
feat_df = wine_df.drop('quality', axis=1)

print(feat_df.head())
print(target.head())

In [None]:
# One 20-bin histogram per feature column, drawn on a tall 10x20 figure.
feat_df.hist(figsize=(10, 20), bins=20)
plt.show()

In [None]:
sns.kdeplot(x='residual sugar', fill=True, log_scale=True ,data=feat_df)

In [None]:
# Horizontal boxplot with one box per feature, all on a single 12x8 figure.
plt.figure(figsize=(12, 8))
sns.boxplot(orient='h', data=feat_df, ax=plt.gca())

Outliers occur in almost every feature; let's explore further.

In [None]:
sns.boxplot(x='pH', data=feat_df)

In [None]:
sns.boxplot(x='density', data=feat_df)

In [None]:
sns.boxplot(x='chlorides', data=feat_df)

In [None]:
# Vertical boxplot restricted to the three small-range features.
small_range_cols = ['volatile acidity', 'citric acid', 'sulphates']
small_df = feat_df[small_range_cols]

sns.boxplot(orient='v', data=small_df)
# Tilt the feature names so they do not overlap.
plt.xticks(rotation=45)

Every boxplot shows outliers in its feature. Let's see whether the outliers are affecting the correlations.

In [None]:
# Annotated correlation heatmap over every column of the dataset (target included),
# with coefficients shown to two decimals.
corr_matrix = wine_df.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(corr_matrix, fmt='.2f', annot=True)

The heatmap above shows that residual sugar and density are strongly positively correlated **(0.84)**, and that alcohol and density are strongly negatively correlated **(-0.78)**.

Other feature pairs with a notable correlation (either positive or negative) are:
* total sulfur dioxide and free sulfur dioxide **(0.62)**
* density and total sulfur dioxide **(0.53)**
* quality and alcohol **(0.44)**
* fixed acidity and pH **(-0.43)**
* residual sugar and alcohol **(-0.45)**
* total sulfur dioxide and alcohol **(-0.45)**

These correlation values may be influenced by the outliers present in every feature. Overall, almost every pair of features is only weakly correlated.

In [None]:
# Pair plot 
sns.pairplot(data=wine_df, hue='quality', palette='tab10', corner=True)

In [None]:
# Plot the target 
sns.histplot(data=target)

In [None]:
wine_df['quality'].unique()

In [None]:
target.value_counts()

# Preprocessing

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold
from sklearn.preprocessing import RobustScaler, LabelEncoder

In [None]:
# Scale every feature column with RobustScaler; in this cell the scaler is
# fitted on the whole feature table and then applied to that same table.
scaler = RobustScaler()
scaler.fit(feat_df)
feat_df_norm = scaler.transform(feat_df)
print(feat_df_norm)

In [None]:
# Encode the quality grades as integer class indices and print the mapping
# from each encoded index back to its original grade.
encoder = LabelEncoder()
encoder.fit(target)
target_enc = encoder.transform(target)
print(dict(enumerate(encoder.classes_)))

In [None]:
# Bind the conventional X / y names to the scaled features and the encoded target.
X, y = feat_df_norm, target_enc
print(X.shape, y.shape)

In [None]:
# Split the *unscaled* features first, then fit the scaler on the training rows
# only, so that statistics of the test rows never influence preprocessing
# (fitting the scaler on all rows before splitting leaks test information).
# test_size / stratify / random_state are unchanged, so the same rows are
# assigned to each split as when splitting the pre-scaled array.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    feat_df, y, test_size=.33, stratify=y, random_state=12)

scaler = RobustScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

print('Shape of X_train, y_train: {xtrain}, {ytrain}'.format(xtrain=X_train.shape, ytrain=y_train.shape))
print('Shape of X_test, y_test: {xtest}, {ytest}'.format(xtest=X_test.shape, ytest=y_test.shape))

# Regression Model

We will train each model and use cross-validation to get a more reliable estimate of its overall score.

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.svm import SVR
import tensorflow as tf

from sklearn.model_selection import RandomizedSearchCV

The regression models that will be used, with the corresponding score where available:
* Linear Regression **(74.535%)**
* KNN **(62.987%)**
* SVM **(69.554%)**
* Decision Tree
* Random Forest
* Ridge Regression
* Lasso Regression
* Gaussian Process Regression
* Polynomial Regression
* Neural Network

In [None]:
def evaluate_model(X_train, y_train, model, splits=3):
    """Cross-validate ``model`` on the training data and return the fold scores.

    The folds come from ``RepeatedStratifiedKFold`` with ``splits`` folds,
    5 repeats and a fixed ``random_state`` of 10. Each fold is scored with
    negative mean squared error, so values closer to zero are better.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``cross_val_score``.
    model : the estimator to evaluate.
    splits : number of folds per repeat (default 3).

    Returns
    -------
    The score array produced by ``cross_val_score``.
    """
    folds = RepeatedStratifiedKFold(n_splits=splits, n_repeats=5, random_state=10)
    return cross_val_score(
        model,
        X_train,
        y_train,
        scoring='neg_mean_squared_error',
        cv=folds,
        n_jobs=2,
    )

In [None]:
def test_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and print its RMSE on the test split.

    The RMSE is computed as the square root of ``mean_squared_error`` instead of
    passing ``squared=False``: that keyword was deprecated in scikit-learn 1.4
    and removed in later releases, while the square-root form gives the same
    value on every version.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``.
    X_train, y_train : data the model is fitted on.
    X_test, y_test : held-out data the prediction error is measured on.

    Returns
    -------
    None. The RMSE, multiplied by 100 and rounded to three decimals, is printed.
    """
    model = model.fit(X_train, y_train)
    yhat = model.predict(X_test)

    # Root of the mean squared error between the true and predicted targets.
    rmse_res = np.sqrt(mean_squared_error(y_test, yhat))

    print('RMSE Score: ', round(rmse_res*100, 3))
    return None

## Linear Regression

In [None]:
# Cross-validate a plain linear-regression baseline on the training split.
linreg = LinearRegression()
lr_score = evaluate_model(X_train, y_train, linreg)
print('Score on train test: ', lr_score)

In [None]:
# Result on test set
test_model(linreg, X_train, y_train, X_test, y_test)

## KNN

In [None]:
# Randomized search over the neighbour count (1-50) and the weighting scheme
# for k-NN regression, scored by negative MSE. The number of sampled candidates
# and the CV strategy are left at the library defaults.
param_grid = dict(
    n_neighbors=range(1, 51),
    weights=['uniform', 'distance'],
)

knn_rand = RandomizedSearchCV(
    KNeighborsRegressor(),
    param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    random_state=20,
).fit(X_train, y_train)

In [None]:
print(knn_rand.best_estimator_)

I'll keep the `weights` value found by the search; for `n_neighbors`, I will evaluate each value manually.

In [None]:
# Cross-validate distance-weighted k-NN for every n_neighbors from 1 to 50 and
# record the mean cross-validation score (multiplied by 100) of each setting.
knn_res = []
for n in range(1, 51):
    knn = KNeighborsRegressor(n_neighbors=n, weights='distance')
    knn_scores = evaluate_model(X_train, y_train, model=knn)

    mean_scaled = np.mean(knn_scores) * 100
    knn_res.append(mean_scaled)
    print('>%d %.3f (%.3f)' % (n, mean_scaled, np.std(knn_scores)))
# The message below is Indonesian for "Highest validation score".
print("Skor validasi tertinggi: ", round(max(knn_res),3))

In [None]:
# Plot the cross-validated k-NN score against the number of neighbours.
# knn_res[i] holds the result for k = i + 1 (the sweep starts at k = 1), so the
# x axis has to run from 1 to len(knn_res) rather than from 0.
k_range = list(range(1, len(knn_res) + 1))
plt.plot(k_range, knn_res, color='green');
plt.title('k-NN Learning Curves')
plt.xlabel('Value of K for KNN')
# The plotted values are mean negative-MSE scores scaled by 100, not accuracies.
plt.ylabel('Cross-Validated Negative MSE (x100)')
plt.show()

In [None]:
# Cross-validate the chosen k-NN configuration: 13 neighbours, distance weighting.
knn = KNeighborsRegressor(weights='distance', n_neighbors=13)
knn_score = evaluate_model(X_train, y_train, knn)
print('Score on train test: ', knn_score)

In [None]:
test_model(knn, X_train, y_train, X_test, y_test)

## SVM Regressor

In [None]:
# Randomized search over the SVR kernel and gamma setting, scored by negative MSE.
param_grid = dict(
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    gamma=['scale', 'auto'],
)

svr_rand = RandomizedSearchCV(
    SVR(),
    param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    random_state=20,
).fit(X_train, y_train)

In [None]:
print(svr_rand.best_estimator_)

In [None]:
# Cross-validate an SVR constructed with its default hyper-parameters.
svr = SVR()
svr_score = evaluate_model(X_train, y_train, svr)
print('Score on train test: ', svr_score)

In [None]:
test_model(svr, X_train, y_train, X_test, y_test)

## Decision Tree Regressor

In [None]:
# Randomized search over the split criterion and the maximum depth (1-10) of a
# decision-tree regressor, scored by negative MSE.
# NOTE(review): the result is stored in `svr_rand`, which overwrites the SVR
# search object from the earlier cell. The name is kept because cells outside
# this view may read it; consider renaming it (e.g. `dt_rand`) everywhere.
param_grid = dict(
    criterion=["squared_error", "friedman_mse", "poisson"],
    max_depth=range(1, 11),
)

svr_rand = RandomizedSearchCV(
    DecisionTreeRegressor(random_state=12),
    param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    random_state=20,
).fit(X_train, y_train)