### ML

In [497]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [498]:
winelist = pd.read_csv('winelistall.csv')

## Encoding

In [499]:
winelist.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,quality_label,winetype
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,low,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,low,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,low,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,medium,red
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5,low,red



[**Encoding**](https://www.analyticsvidhya.com/blog/2020/03/one-hot-encoding-vs-label-encoding-using-scikit-learn/) is the process of converting a categorical value into a numerical value. Machine learning algorithms can only process numerical data, and depending on what your category is communicating, there are a few ways you can transform it to optimize how it is processed. 

## Label / Ordinal Encoding

**Label and ordinal encoding** will replace any categorical value with an integer. Label encoding works best for **boolean** or 2-option categories, but will also be necessary for your **target classes**. If your categories can also be considered "ranked", you should use ordinal encoding to preserve the order of the labels. There are many ways you can achieve this, the most practical is probably to use [`LabelEncoder()`](https://scikit-learn.org/dev/modules/generated/sklearn.preprocessing.LabelEncoder.html#sklearn.preprocessing.LabelEncoder) or [`OrdinalEncoder()`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html#sklearn.preprocessing.OrdinalEncoder) from Scikit Learn.

[Here](https://www.geeksforgeeks.org/label-encoding-across-multiple-columns-in-scikit-learn/) is a good article demonstrating various multi-column approaches. 

<!-- Make sure to assign the dtype for the encoded column as [Categorical dtype](https://pandas.pydata.org/docs/user_guide/categorical.html#categoricaldtype).  -->

In [500]:
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

oe = OrdinalEncoder(categories=[["red", "white"]], dtype=int) # this is how you can define the label order
winelist['winetype'] = oe.fit_transform(winelist[['winetype']])

#le = LabelEncoder()
#dfm['ExerciseAngina'] = le.fit_transform(dfm['ExerciseAngina'])

#dfm[['Sex', 'ExerciseAngina']].head()


In [501]:
winelist

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,quality_label,winetype
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,low,0
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,low,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,low,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,medium,0
4,7.4,0.66,0.00,1.8,0.075,13.0,40.0,0.99780,3.51,0.56,9.4,5,low,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5315,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,medium,1
5316,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,low,1
5317,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,medium,1
5318,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,medium,1


In [502]:
winelist.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,quality_label,winetype
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,low,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,low,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,low,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,medium,0
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5,low,0


### Zeigt dir welche Spalte noch Objecte hat und unbedingt geändert werden sollte

In [503]:
cat_cols = winelist.select_dtypes('object')
for label, values in cat_cols.items():
    print(label, values.unique())

quality_label ['low' 'medium' 'high']


In [504]:
cat_col_encode = pd.get_dummies(cat_cols, drop_first=True, dtype='int')
cat_col_encode

Unnamed: 0,quality_label_low,quality_label_medium
0,1,0
1,1,0
2,1,0
3,0,1
4,1,0
...,...,...
5315,0,1
5316,1,0
5317,0,1
5318,0,1


In [505]:
winelist.head()



Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,quality_label,winetype
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,low,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,low,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,low,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,medium,0
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5,low,0


## Wichtiger Hinweis 
Die dritte spalte high wurde entfernt weil wenn alles null ist ist es sowieso high

In [506]:
# # drop categorical columns
# winelist.drop(columns=cat_cols, axis=1, inplace=True)

# # combine new columns with original dfm
# dfml = pd.concat([winelist, cat_col_encode], axis=1)

# # view first 5 rows
# winelist.head()

In [507]:
# Erstelle die One-Hot-Encoding-Spalten (wie du es bereits gemacht hast)
cat_col_encode = pd.get_dummies(cat_cols, drop_first=True, dtype='int')

# Füge die neuen Spalten zum ursprünglichen DataFrame hinzu
winelist = pd.concat([winelist, cat_col_encode], axis=1)

# Zeige das DataFrame an, um zu sehen, wie es aussieht
winelist.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,quality_label,winetype,quality_label_low,quality_label_medium
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,low,0,1,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,low,0,1,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,low,0,1,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,medium,0,0,1
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5,low,0,1,0


Noch quality label entfernen


In [508]:
winelist.drop(columns=['quality_label'], inplace=True)


In [509]:
winelist.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,winetype,quality_label_low,quality_label_medium
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0,1,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0,1,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0,1,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,0,0,1
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5,0,1,0


## Split Data

In [510]:
from sklearn.model_selection import train_test_split

X = winelist.drop('winetype', axis=1)
y = winelist['winetype']

print("original:", winelist.shape, "\nX:", X.shape, "\ny:", y.shape)

original: (5320, 15) 
X: (5320, 14) 
y: (5320,)


In [511]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=33)
print(X_train, X_test)

      fixed_acidity  volatile_acidity  citric_acid  residual_sugar  chlorides  \
2979            8.8              0.34         0.33             9.7      0.036   
1796            6.9              0.25         0.40             1.3      0.038   
3123            6.4              0.20         0.32             3.1      0.041   
114             8.2              1.33         0.00             1.7      0.081   
5108            5.9              0.18         0.28             1.0      0.037   
...             ...               ...          ...             ...        ...   
57              7.7              0.69         0.49             1.8      0.115   
3273            6.8              0.32         0.30             3.3      0.029   
2706            6.3              0.24         0.74             1.4      0.172   
578            11.3              0.34         0.45             2.0      0.082   
2439            7.8              0.16         0.41             1.7      0.026   

      free_sulfur_dioxide  

# Building Model

Choose your model. There are two examples here, [`LogisticRegression`](https://scikit-learn.org/1.5/modules/generated/sklearn.linear_model.LogisticRegression.html) and [`RandomForest`](https://scikit-learn.org/1.5/modules/generated/sklearn.ensemble.RandomForestClassifier.html). Feel free to experiment with other classification models.

## Logistic Regression

You will need to import your model from SciKit Learn and set it to a variable. There are _many_ customization options, but start with the simplest setup and we will cover **hyperparameter tuning** in a later spike. 

Then use the [`.fit()`](https://scikit-learn.org/1.5/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression.fit) method to "fit" or train your model using your training data.

In [512]:
from sklearn.linear_model import LogisticRegression

LR_model = LogisticRegression(max_iter=1000) 

In [513]:
LR_model.fit(X_train, y_train)

## Prediction

In [514]:
preds = LR_model.predict(X_test)
preds

array([1, 1, 1, ..., 0, 1, 1])

In [515]:
y_test

2838    1
4697    1
2800    1
4530    1
1361    1
       ..
799     0
4432    1
1269    0
4454    1
3544    1
Name: winetype, Length: 1064, dtype: int32

In [516]:
from sklearn.metrics import accuracy_score 

acc = accuracy_score(y_test, preds)
acc

0.9802631578947368

## Random Forest

In [517]:
from sklearn.ensemble import RandomForestClassifier

In [518]:
RF_clf = RandomForestClassifier(random_state=33)

In [519]:
RF_clf.fit(X_train, y_train)

In [520]:
preds = RF_clf.predict(X_test)
preds

array([1, 1, 1, ..., 0, 1, 1])

In [521]:
y_test

2838    1
4697    1
2800    1
4530    1
1361    1
       ..
799     0
4432    1
1269    0
4454    1
3544    1
Name: winetype, Length: 1064, dtype: int32

In [522]:
print(y_test)

2838    1
4697    1
2800    1
4530    1
1361    1
       ..
799     0
4432    1
1269    0
4454    1
3544    1
Name: winetype, Length: 1064, dtype: int32


In [523]:
acc = accuracy_score(y_test, preds)
acc

0.9896616541353384

## Decision Tree for RED oder WHITE TYPE

Video: https://www.youtube.com/watch?v=YkYpGhsCx4c

In [524]:
winelist.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,winetype,quality_label_low,quality_label_medium
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0,1,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0,1,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0,1,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,0,0,1
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5,0,1,0


In [525]:
winelist = winelist.drop(columns=['quality_label_low','quality_label_medium','quality'])

In [526]:
winelist.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,winetype
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,0
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,0


In [527]:
X = winelist.iloc[:, 0:11]
y = winelist.iloc[:, 11]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=17, test_size =0.2)


In [528]:
X_train.shape

(4256, 11)

In [529]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()

In [530]:
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': None,
 'splitter': 'best'}

In [531]:
dtc.fit(X_train, y_train)

In [532]:
y_pred = dtc.predict(X_test)
from sklearn.metrics import confusion_matrix

In [533]:
print(confusion_matrix(y_test, y_pred))

[[266   8]
 [ 15 775]]


In [534]:
from sklearn.metrics import classification_report

In [535]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.97      0.96       274
           1       0.99      0.98      0.99       790

    accuracy                           0.98      1064
   macro avg       0.97      0.98      0.97      1064
weighted avg       0.98      0.98      0.98      1064



In [536]:
dtc.feature_importances_

array([0.02980808, 0.01850458, 0.00145336, 0.00918019, 0.65798321,
       0.0038251 , 0.21520869, 0.03297051, 0.01315698, 0.00937885,
       0.00853044])

Zeigt uns die verwendete Werte für X

In [537]:
X.columns

Index(['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
       'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
       'pH', 'sulphates', 'alcohol'],
      dtype='object')

Hier sieht man welche werte wie wichtig für das Model sind

In [538]:
features = pd.DataFrame(dtc.feature_importances_, index = X.columns)
features.head(15)

Unnamed: 0,0
fixed_acidity,0.029808
volatile_acidity,0.018505
citric_acid,0.001453
residual_sugar,0.00918
chlorides,0.657983
free_sulfur_dioxide,0.003825
total_sulfur_dioxide,0.215209
density,0.032971
pH,0.013157
sulphates,0.009379


weitere Unterteilung

In [539]:
dtc2 = DecisionTreeClassifier(criterion = 'entropy', ccp_alpha= 0.04)

In [540]:
dtc2.fit(X_train,y_train)

In [541]:
y_pred2 = dtc2.predict(X_test)

In [542]:
print(confusion_matrix(y_test, y_pred2))

[[236  38]
 [ 14 776]]


In [543]:
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.94      0.86      0.90       274
           1       0.95      0.98      0.97       790

    accuracy                           0.95      1064
   macro avg       0.95      0.92      0.93      1064
weighted avg       0.95      0.95      0.95      1064



In [544]:
features2 = pd.DataFrame(dtc2.feature_importances_, index = X.columns)
features2.head(15)

Unnamed: 0,0
fixed_acidity,0.0
volatile_acidity,0.0
citric_acid,0.0
residual_sugar,0.0
chlorides,0.716283
free_sulfur_dioxide,0.0
total_sulfur_dioxide,0.283717
density,0.0
pH,0.0
sulphates,0.0


Der Baum hat Teile des Baumes entfernt die nicht so wichtig waren

## Random Forest for Quality Label

In [545]:
winelist = pd.read_csv('winelistall.csv')
winelist.head(9)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,quality_label,winetype
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,low,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,low,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,low,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,medium,red
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5,low,red
5,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5,low,red
6,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7,medium,red
7,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7,medium,red
8,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5,low,red


In [546]:
# Erstelle die One-Hot-Encoding-Spalten (wie du es bereits gemacht hast)
cat_col_encode = pd.get_dummies(cat_cols, drop_first=True, dtype='int')

# Füge die neuen Spalten zum ursprünglichen DataFrame hinzu
winelist = pd.concat([winelist, cat_col_encode], axis=1)

# Zeige das DataFrame an, um zu sehen, wie es aussieht
winelist.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,quality_label,winetype,quality_label_low,quality_label_medium
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,low,red,1,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,low,red,1,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,low,red,1,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,medium,red,0,1
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5,low,red,1,0


In [547]:
reducedlist = winelist.drop(columns=['quality', 'winetype','quality_label'])

In [548]:
reducedlist

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality_label_low,quality_label_medium
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,1,0
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,1,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,1,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,0,1
4,7.4,0.66,0.00,1.8,0.075,13.0,40.0,0.99780,3.51,0.56,9.4,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5315,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,0,1
5316,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,1,0
5317,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,0,1
5318,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,0,1


In [549]:
from sklearn.model_selection import train_test_split

X = reducedlist.iloc[:, :11]
y = reducedlist.iloc[:, 11:]

print("original:", reducedlist.shape, "\nX:", X.shape, "\ny:", y.shape)

original: (5320, 13) 
X: (5320, 11) 
y: (5320, 2)


In [550]:
reducedlist.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality_label_low,quality_label_medium
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,1,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,1,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,1,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,0,1
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,1,0


In [551]:
print(reducedlist.iloc[:, :11])

      fixed_acidity  volatile_acidity  citric_acid  residual_sugar  chlorides  \
0               7.4              0.70         0.00             1.9      0.076   
1               7.8              0.88         0.00             2.6      0.098   
2               7.8              0.76         0.04             2.3      0.092   
3              11.2              0.28         0.56             1.9      0.075   
4               7.4              0.66         0.00             1.8      0.075   
...             ...               ...          ...             ...        ...   
5315            6.2              0.21         0.29             1.6      0.039   
5316            6.6              0.32         0.36             8.0      0.047   
5317            6.5              0.24         0.19             1.2      0.041   
5318            5.5              0.29         0.30             1.1      0.022   
5319            6.0              0.21         0.38             0.8      0.020   

      free_sulfur_dioxide  

In [552]:
print(reducedlist.iloc[:, 11:])

      quality_label_low  quality_label_medium
0                     1                     0
1                     1                     0
2                     1                     0
3                     0                     1
4                     1                     0
...                 ...                   ...
5315                  0                     1
5316                  1                     0
5317                  0                     1
5318                  0                     1
5319                  0                     1

[5320 rows x 2 columns]


In [553]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=420)
print(X_train, X_test)

      fixed_acidity  volatile_acidity  citric_acid  residual_sugar  chlorides  \
786             6.8              0.41         0.31            8.80      0.084   
1465            7.4              0.39         0.23            7.00      0.033   
2016            6.9              0.22         0.37           15.00      0.053   
4618            7.3              0.33         0.22            1.40      0.041   
1318            7.1              0.68         0.00            2.30      0.087   
...             ...               ...          ...             ...        ...   
4895            6.4              0.18         0.28           17.05      0.047   
4671            6.8              0.30         0.26           20.30      0.037   
4486            5.9              0.18         0.29            4.60      0.032   
5192            5.1              0.25         0.36            1.30      0.035   
2993            5.7              0.45         0.42            1.10      0.051   

      free_sulfur_dioxide  

In [554]:
from sklearn.ensemble import RandomForestClassifier

In [555]:
RF_clf = RandomForestClassifier(random_state=40)


In [556]:
RF_clf.fit(X_train, y_train)

In [557]:
preds = RF_clf.predict(X_test)
preds

array([[0, 1],
       [1, 0],
       [1, 0],
       ...,
       [0, 1],
       [0, 1],
       [1, 0]])

In [558]:
acc = accuracy_score(y_test, preds)
acc

0.7537593984962406

## Random Forest Again


In [559]:
winelist = pd.read_csv('winelistall.csv')
winelist.head(9)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,quality_label,winetype
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,low,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,low,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,low,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,medium,red
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5,low,red
5,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5,low,red
6,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7,medium,red
7,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7,medium,red
8,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5,low,red


In [560]:
sizes = winelist['quality_label'].value_counts(sort=1)
print(sizes)

quality_label
medium    3179
low       1988
high       153
Name: count, dtype: int64


In [561]:
# Erstelle die One-Hot-Encoding-Spalten (wie du es bereits gemacht hast)
cat_col_encode = pd.get_dummies(cat_cols, drop_first=True, dtype='int')

# Füge die neuen Spalten zum ursprünglichen DataFrame hinzu
winelist = pd.concat([winelist, cat_col_encode], axis=1)

# Zeige das DataFrame an, um zu sehen, wie es aussieht
winelist.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,quality_label,winetype,quality_label_low,quality_label_medium
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,low,red,1,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,low,red,1,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,low,red,1,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,medium,red,0,1
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5,low,red,1,0


In [562]:
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

oe = OrdinalEncoder(categories=[["red", "white"]], dtype=int) # this is how you can define the label order
winelist['winetype'] = oe.fit_transform(winelist[['winetype']])

Unnötige Spalten entfernen

In [563]:
sizes = winelist['quality_label_low'].value_counts(sort=1)
print(sizes)

quality_label_low
0    3332
1    1988
Name: count, dtype: int64


In [564]:
sizes = winelist['quality_label_medium'].value_counts(sort=1)
print(sizes)

quality_label_medium
1    3179
0    2141
Name: count, dtype: int64


In [565]:
winelist.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,quality_label,winetype,quality_label_low,quality_label_medium
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,low,0,1,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,low,0,1,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,low,0,1,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,medium,0,0,1
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5,low,0,1,0


# Drop some Columns

In [None]:
winelist.drop(['quality','quality_label'], axis = 1, inplace =True) #inplace true updatet das Dataframe

In [568]:
winelist.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,winetype,quality_label_low,quality_label_medium
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0,1,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0,1,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0,1,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,0,0,1
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,0,1,0


In [None]:
y = winelist[['quality_label_low', 'quality_label_medium']]
X = winelist.drop(labels=['quality_label_low', 'quality_label_medium'], axis=1)

In [576]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=20)


In [577]:
print(X_train)

      fixed_acidity  volatile_acidity  citric_acid  residual_sugar  chlorides  \
624             8.9             0.480         0.24            2.85      0.094   
4490            6.2             0.220         0.30           12.40      0.054   
1121            5.4             0.740         0.00            1.20      0.041   
2416            5.7             0.245         0.33            1.10      0.049   
5076            5.8             0.340         0.21            7.20      0.041   
...             ...               ...          ...             ...        ...   
1607            9.1             0.590         0.38            1.60      0.066   
3915            7.6             0.230         0.34            1.60      0.043   
1428            6.8             0.220         0.31            1.40      0.053   
4367            7.6             0.300         0.37            1.60      0.087   
2522            6.9             0.230         0.34            2.70      0.032   

      free_sulfur_dioxide  

In [578]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=10, random_state=30)

model.fit(X_train, y_train)


In [579]:
predicition_test=model.predict(X_test)
print(predicition_test)

[[0 1]
 [0 1]
 [0 1]
 ...
 [0 1]
 [0 1]
 [0 0]]


In [580]:
from sklearn import metrics
print("Accuracy", metrics.accuracy_score(y_test,predicition_test))

Accuracy 0.6625939849624061
