In [3]:

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split 

data = pd.read_csv('./wine_data.csv', index_col=0)

print(data['type'].value_counts())
data.info()


type
0    4898
1    1599
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 6497 entries, 0 to 1598
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         6497 non-null   float64
 1   volatile acidity      6497 non-null   float64
 2   citric acid           6497 non-null   float64
 3   residual sugar        6497 non-null   float64
 4   chlorides             6497 non-null   float64
 5   free sulfur dioxide   6497 non-null   float64
 6   total sulfur dioxide  6497 non-null   float64
 7   density               6497 non-null   float64
 8   pH                    6497 non-null   float64
 9   sulphates             6497 non-null   float64
 10  alcohol               6497 non-null   float64
 11  quality               6497 non-null   int64  
 12  type                  6497 non-null   int64  
dtypes: float64(11), int64(2)
memory usage: 710.6 KB


In [5]:

# target
y = data['type']
# features
X = data.drop('type', axis=1)

# split dataset for training && test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier(
     n_estimators=100,
     criterion='gini',
     max_depth=None,
     min_samples_split=2,
     min_samples_leaf=1,
     min_weight_fraction_leaf=0.0,
     max_features='sqrt',
     max_leaf_nodes=None,
     min_impurity_decrease=0.0,
     bootstrap=True,
     oob_score=False,
     n_jobs=None,
     random_state=None,
     verbose=0,
     warm_start=False,
     class_weight=None,
     ccp_alpha=0.0,
     max_samples=None
)

rf_model.fit(X_train, y_train)

pd.DataFrame(rf_model.feature_importances_, index=X_train.columns, columns=['importance']).sort_values('importance', ascending=False)


Unnamed: 0,importance
total sulfur dioxide,0.288161
chlorides,0.279517
volatile acidity,0.11484
density,0.065451
sulphates,0.056161
free sulfur dioxide,0.052918
residual sugar,0.04753
fixed acidity,0.043158
pH,0.024963
citric acid,0.014905


In [8]:

from sklearn.metrics import accuracy_score, confusion_matrix

print(f"Le pourcentage de vins bien classés est de : {accuracy_score(y_test, rf_model.predict(X_test))*100} %")

pd.DataFrame(confusion_matrix(y_test, rf_model.predict(X_test)), index=['blanc_données', 'rouge_données'], columns=['blanc_predit', 'rouge_predit'])


Le pourcentage de vins bien classés est de : 99.6923076923077 %


Unnamed: 0,blanc_predit,rouge_predit
blanc_données,986,0
rouge_données,4,310
