# Polynomial Features

> ## Library

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

> ## Data

In [4]:
# Read the white-wine quality table from a local CSV file.
# The path is a raw string so the Windows backslashes are taken literally.
wine = pd.read_csv(
    filepath_or_buffer=r'C:\Users\user\Documents\Data Science\MODUL 3\Scaling\white_wine.csv'
)
# Bare last expression: render the loaded frame as the cell output.
wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.0010,3.00,0.45,8.8,6.0
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.9940,3.30,0.49,9.5,6.0
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.9951,3.26,0.44,10.1,6.0
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.40,9.9,6.0
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.40,9.9,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...
515,6.1,0.31,0.26,2.2,0.051,28.0,167.0,0.9926,3.37,0.47,10.4,6.0
516,6.8,0.18,0.37,1.6,0.055,47.0,154.0,0.9934,3.08,0.45,9.1,5.0
517,7.4,0.15,0.42,1.7,0.045,49.0,154.0,0.9920,3.00,0.60,10.4,6.0
518,5.9,0.13,0.28,1.9,0.050,20.0,78.0,0.9918,3.43,0.64,10.8,6.0


In [8]:
# Binary target column: 1 where quality is strictly greater than 6, else 0.
wine['label'] = np.where(wine['quality'].gt(6), 1, 0)

# Count missing values in the two columns that are used as features below.
wine[['alcohol', 'density']].isna().sum()

alcohol    1
density    0
dtype: int64

In [10]:
# Replace the missing 'alcohol' entry (the check above reports one) with 0.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `wine['alcohol']`: that form mutates a column
# selection (chained assignment), which pandas 2.x warns about and which does
# not update `wine` once Copy-on-Write is enabled.
# NOTE(review): 0 is far below the alcohol values visible in the table preview
# (roughly 8.7-12.6); a median-based fill may be a better choice -- confirm.
wine['alcohol'] = wine['alcohol'].fillna(0)

# Re-check: both feature columns should now report zero missing values.
wine[['alcohol', 'density']].isnull().sum()

alcohol    0
density    0
dtype: int64

> ## Data Splitting

In [11]:
# Feature matrix: the two predictors used throughout this notebook.
x = wine.loc[:, ['alcohol', 'density']]
# Target vector: the binary label column created earlier.
y = wine.loc[:, 'label']

In [13]:
# Split into train and test sets, stratified on the label so both parts keep
# the same class proportions; the fixed seed makes the split reproducible.
# No test_size is given, so scikit-learn's default split ratio applies.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=2020,
    stratify=y,
)

> ## Model without Polynomial

In [14]:
# Baseline: logistic regression with default settings on the two raw features.
logreg = LogisticRegression()
# Last expression of the cell, so the fitted estimator is shown as output.
logreg.fit(X=x_train, y=y_train)

LogisticRegression()

In [15]:
# Predict labels for the held-out test rows with the baseline model.
y_pred = logreg.predict(X=x_test)
# Fraction of test rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8538461538461538

> ## Model with Polynomial

X0 = alcohol    
X1 = density

In [22]:
# Display the training features (alcohol, density) before polynomial expansion.
x_train

Unnamed: 0,alcohol,density
428,9.1,0.9936
229,8.7,0.9998
243,9.7,0.9934
176,12.6,0.9910
368,10.6,0.9931
...,...,...
367,9.4,0.9927
25,10.4,0.9955
273,9.0,0.9949
453,10.5,0.9974


In [23]:
# Degree-3 polynomial expansion of the two features: all powers and cross
# terms are kept (interaction_only=False) and no constant column is added
# (include_bias=False).
polinom = PolynomialFeatures(
    degree=3,
    interaction_only=False,
    include_bias=False,
)
# Fit the transformer on the training split only, then apply the same fitted
# transformer to both splits.
x_train_poll = polinom.fit(x_train).transform(x_train)
x_test_poll = polinom.transform(x_test)

In [24]:
# Show the expanded training matrix with one named column per generated term.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. The old method is used only
# when the installed release predates the new one.
# NOTE(review): with the newer method the labels are presumably derived from
# the training frame's column names (alcohol, density) rather than x0 / x1 --
# confirm against the installed scikit-learn version.
if hasattr(polinom, 'get_feature_names_out'):
    poly_columns = polinom.get_feature_names_out()
else:
    poly_columns = polinom.get_feature_names()
pd.DataFrame(x_train_poll, columns = poly_columns)

Unnamed: 0,x0,x1,x0^2,x0 x1,x1^2,x0^3,x0^2 x1,x0 x1^2,x1^3
0,9.1,0.9936,82.81,9.04176,0.987241,753.571,82.280016,8.983893,0.980923
1,8.7,0.9998,75.69,8.69826,0.999600,658.503,75.674862,8.696520,0.999400
2,9.7,0.9934,94.09,9.63598,0.986844,912.673,93.469006,9.572383,0.980330
3,12.6,0.9910,158.76,12.48660,0.982081,2000.376,157.331160,12.374221,0.973242
4,10.6,0.9931,112.36,10.52686,0.986248,1191.016,111.584716,10.454225,0.979443
...,...,...,...,...,...,...,...,...,...
385,9.4,0.9927,88.36,9.33138,0.985453,830.584,87.714972,9.263261,0.978259
386,10.4,0.9955,108.16,10.35320,0.991020,1124.864,107.673280,10.306611,0.986561
387,9.0,0.9949,81.00,8.95410,0.989826,729.000,80.586900,8.908434,0.984778
388,10.5,0.9974,110.25,10.47270,0.994807,1157.625,109.963350,10.445471,0.992220


In [26]:
# Same default logistic regression, now trained on the polynomial features.
# This rebinds `logreg`, so the baseline model is no longer reachable by name.
logreg = LogisticRegression()
# Last expression of the cell, so the fitted estimator is shown as output.
logreg.fit(X=x_train_poll, y=y_train)

LogisticRegression()

In [27]:
# Predict on the polynomial-expanded test features.
y_pred = logreg.predict(X=x_test_poll)
# Test accuracy of the polynomial model, for comparison with the baseline.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9769230769230769

Test accuracy increased significantly with the polynomial features: from 85.38 % --> 97.69 %