In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")
np.random.seed(42)

  from numpy.core.umath_tests import inner1d


In [7]:
# Loading data in df and removing target variable quality and loading in y
# This is the same dataset used for Decision Tree

df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv", sep = ';')
y = df.pop('quality')


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 11 columns):
fixed acidity           4898 non-null float64
volatile acidity        4898 non-null float64
citric acid             4898 non-null float64
residual sugar          4898 non-null float64
chlorides               4898 non-null float64
free sulfur dioxide     4898 non-null float64
total sulfur dioxide    4898 non-null float64
density                 4898 non-null float64
pH                      4898 non-null float64
sulphates               4898 non-null float64
alcohol                 4898 non-null float64
dtypes: float64(11)
memory usage: 421.0 KB


In [6]:
df.isnull().sum().sum()

0

### Splitting into Train and Test

In [10]:
train, test, y_train, y_test = train_test_split(df, y, test_size = 0.2)

### Logistic regression on the unscaled train data  

In [11]:
lr = LogisticRegression()
lr.fit(train, y_train)
y_pred = lr.predict(test)
print('Accuracy score for Logistic Regression:', accuracy_score(y_test, y_pred))

Accuracy score for Logistic Regression: 0.5142857142857142


## Function to parameterize the Random Forest tree hyper-parameters 

### Run in loop for different values to identify the best value for hyperparameters and use that value in the final model 

In [12]:
def fit_predict(train, test, y_train, y_test,  max_depth = None , 
                n_estimators = 10, max_features = 'auto', min_samples_split = 2,scaler = None):
    if scaler:
        train = scaler.fit_transform(train)
        test = scaler.fit_transform(test)        
    RF = RandomForestClassifier(n_estimators = n_estimators, max_depth=max_depth, 
                                random_state = 42, max_features = max_features,
                               min_samples_split = min_samples_split)
    RF.fit(train, y_train)
    y_pred = RF.predict(test)
    print(accuracy_score(y_test, y_pred))

In [17]:
print('baseline accuracy score: ',end= ": ")
fit_predict(train,test,y_train,y_test)
print('baseline accuracy score with scaler', end = ': ')
fit_predict(train,test,y_train,y_test,scaler=StandardScaler())

baseline accuracy score: : 0.6428571428571429
baseline accuracy score with scaler: 0.6183673469387755


### Run in loop for different hyper parameters

In [15]:
for n_estimators in range(20,100,10):
    print('Accuracy score using n_estimators =', n_estimators, end = ': ')
    fit_predict(train,test,y_train,y_test,n_estimators = n_estimators)


Accuracy score using n_estimators = 20: 0.6591836734693878
Accuracy score using n_estimators = 30: 0.6663265306122449
Accuracy score using n_estimators = 40: 0.6744897959183673
Accuracy score using n_estimators = 50: 0.6816326530612244
Accuracy score using n_estimators = 60: 0.6816326530612244
Accuracy score using n_estimators = 70: 0.6846938775510204
Accuracy score using n_estimators = 80: 0.6877551020408164
Accuracy score using n_estimators = 90: 0.686734693877551


In [16]:
for max_depth in range(1,10):
    print('Accuracy score using max_depth =', max_depth, end = ': ')
    fit_predict(train,test,y_train,y_test,n_estimators = 160,max_depth = max_depth)


Accuracy score using max_depth = 1: 0.44081632653061226
Accuracy score using max_depth = 2: 0.4897959183673469
Accuracy score using max_depth = 3: 0.49387755102040815
Accuracy score using max_depth = 4: 0.5051020408163265
Accuracy score using max_depth = 5: 0.5244897959183673
Accuracy score using max_depth = 6: 0.5357142857142857
Accuracy score using max_depth = 7: 0.563265306122449
Accuracy score using max_depth = 8: 0.5826530612244898
Accuracy score using max_depth = 9: 0.5959183673469388


In [8]:
for max_features in np.linspace(0.1,1,10):
    print('Accuracy score using max_features =', max_features, end = ': ')
    fit_predict(train,test,y_train,y_test,n_estimators = 160,max_features = max_features,max_depth = 18)


Accuracy score using max_features = 0.1: 0.6969387755102041
Accuracy score using max_features = 0.2: 0.7040816326530612
Accuracy score using max_features = 0.30000000000000004: 0.7020408163265306
Accuracy score using max_features = 0.4: 0.6948979591836735
Accuracy score using max_features = 0.5: 0.6969387755102041
Accuracy score using max_features = 0.6: 0.6908163265306122
Accuracy score using max_features = 0.7000000000000001: 0.6969387755102041
Accuracy score using max_features = 0.8: 0.6989795918367347
Accuracy score using max_features = 0.9: 0.6918367346938775
Accuracy score using max_features = 1.0: 0.7020408163265306


In [9]:
for min_samples_split in range(2,10):
    print('Accuracy score using min_samples_split =', min_samples_split, end = ': ')
    fit_predict(train,test,y_train,y_test,n_estimators = 160,max_features = 0.2,min_samples_split=min_samples_split
               ,max_depth = 18)


Accuracy score using min_samples_split = 2: 0.7040816326530612
Accuracy score using min_samples_split = 3: 0.7193877551020408
Accuracy score using min_samples_split = 4: 0.7040816326530612
Accuracy score using min_samples_split = 5: 0.6938775510204082
Accuracy score using min_samples_split = 6: 0.6938775510204082
Accuracy score using min_samples_split = 7: 0.6857142857142857
Accuracy score using min_samples_split = 8: 0.6806122448979591
Accuracy score using min_samples_split = 9: 0.6714285714285714


In [11]:
print('tuned accuracy score', end = ': ')
fit_predict(train,test,y_train,y_test,n_estimators = 160,max_features = 0.2,min_samples_split=3,max_depth = 18)
print('tuned accuracy score with scaler', end = ': ')

fit_predict(train,test,y_train,y_test,n_estimators = 160,max_features = 0.2,min_samples_split=3,
            max_depth = 18,scaler=StandardScaler())

tuned accuracy score: 0.6806122448979591
tuned accuracy score with scaler: 0.6806122448979591


### Compare the new accuracy with the original value 

#### NOTE: These accuracy values will be slightly different in your run based on data samples 

In [11]:
original_score = 0.514285714286
best_score = 0.7193877551020408
improvement = np.abs(np.round(100*(original_score - best_score)/original_score,2))
print('overall improvement is {} %'.format(improvement))

overall improvement is 39.88 %


In [12]:
original_score = 0.6428571428571429
best_score = 0.7193877551020408
improvement = np.abs(np.round(100*(original_score - best_score)/original_score,2))
print('overall improvement compare to non tuned model is {} %'.format(improvement))

overall improvement compare to non tuned model is 11.9 %
