In [101]:
#import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Import models
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest,f_classif, chi2
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot
from collections import Counter
from sklearn.feature_selection import RFE
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso


# metrics evaluation
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import cohen_kappa_score
import statsmodels.api as sm
from sklearn.inspection import permutation_importance
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

# To standardise data
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from imblearn.under_sampling import RandomUnderSampler

# For train/test split
from sklearn.model_selection import train_test_split


from sklearn.feature_selection import VarianceThreshold as vt
from imblearn.pipeline import make_pipeline, Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

In [102]:
#import CSVs, assign headers and remove quotations from column headers
white_wine = pd.read_csv(r'C:\Users\Killian\Projects\Wine analysis\Wine data\winequality-white.csv', sep=';',header=0, engine='python')

In [103]:
white_wine['label'] = white_wine['quality'].apply(lambda value: 'low' if value <= 5 else 'medium' if value <=6 else 'high')
white_wine.head()
white_wine["label"].value_counts()

medium    2198
low       1640
high      1060
Name: label, dtype: int64

In [104]:
white_wine = pd.get_dummies(white_wine)

#print(y)

#white_wine['category'] = white_wine.label.map({"low" : "0", "medium" : "1", "high" : "2"}).astype('int')
#white_wine= white_wine.drop(columns=["quality", "label"])
#print(white_wine.category.value_counts())
#white_wine.category.dtypes
#As seen from previous results, Medium = 2, Low = 1, High = 0.
#Can't use OrdinalEncoder as it is 1D data. Ordinal Encoder is used on 2D array. 

In [105]:
white_wine.tail()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,label_high,label_low,label_medium
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.5,11.2,6,0,0,1
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.9949,3.15,0.46,9.6,5,0,1,0
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,0,0,1
4896,5.5,0.29,0.3,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,1,0,0
4897,6.0,0.21,0.38,0.8,0.02,22.0,98.0,0.98941,3.26,0.32,11.8,6,0,0,1


In [106]:
min_max_scaler = MinMaxScaler()
white_wine = white_wine.drop(columns=['citric acid', 'total sulfur dioxide', 'density', 'pH', 'quality'])
white_wine[white_wine.columns] = min_max_scaler.fit_transform(white_wine[white_wine.columns])
white_wine = pd.concat([white_wine],axis=1, sort=False)
white_wine.tail()

Unnamed: 0,fixed acidity,volatile acidity,residual sugar,chlorides,free sulfur dioxide,sulphates,alcohol,label_high,label_low,label_medium
4893,0.230769,0.127451,0.015337,0.089021,0.076655,0.325581,0.516129,0.0,0.0,1.0
4894,0.269231,0.235294,0.113497,0.11276,0.191638,0.27907,0.258065,0.0,1.0,0.0
4895,0.259615,0.156863,0.009202,0.094955,0.097561,0.27907,0.225806,0.0,0.0,1.0
4896,0.163462,0.205882,0.007669,0.038576,0.062718,0.186047,0.774194,1.0,0.0,0.0
4897,0.211538,0.127451,0.003067,0.032641,0.069686,0.116279,0.612903,0.0,0.0,1.0


In [110]:
#Create target and features for algorithm
#Full feature list for ease of access while doing Backward Eliination and refining model: 'fixed acidity', 'volatile acidity', 'residual sugar', 'density', 'pH', 
# 'sulphates','citric acid', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'alcohol'
X = white_wine.copy()
y = X.columns['label_high', 'label_low', 'label_medium'] #target
X =white_wine.drop(columns=['label_high', 'label_low', 'label_medium'],axis=1,inplace=True) # features we want to test, except target and correlated values, i.e. Quality and Label. 
# Final list will include features removed due to results from Backward Elimination.
X_list = list(X.columns)
print(X.columns.tolist())
print(y)

TypeError: 'Index' object is not callable

In [111]:
sns.countplot(x='category', data=white_wine, palette = 'hls')
plt.show()

#reasonably balanced data. No need to apply SMOTE. 

ValueError: Could not interpret input 'category'

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) #, stratify=y) # random state controls the shuffling applied before the split. 42 is a popular random seed.
#stratification ensures equitable split of low/medium/high quality across training and test data
print(Counter(y_train), Counter(y_test))
print('Features:', list(X))

In [None]:
#SMOTE (synthetic minority oversampling technique) is one of the most commonly used oversampling methods to solve the imbalance problem. 
# It aims to balance class distribution by randomly increasing minority class examples by replicating them. 
# SMOTE synthesizes new minority instances between existing minority instances.

oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

In [None]:
lr = LogisticRegression(max_iter = 100)
lr.fit(X_train, y_train)