In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [8]:
# Load the labelled training set; the bare `data` on the last line renders a
# preview of the frame as this cell's output.
train_csv_path = 'train.csv'
data = pd.read_csv(train_csv_path)
data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,6.6,0.3,0.36,1.2,0.035,43,126,0.9909,3.01,0.63,11.4,6,white
1,7.7,0.5,0.26,1.9,0.062,9,31,0.9966,3.39,0.64,9.6,5,red
2,8.4,0.5,0.35,2.9,0.076,21,127,0.9976,3.23,0.63,9.2,5,red
3,7.5,0.4,0.33,5.0,0.045,30,131,0.9942,3.32,0.44,10.9,6,white
4,6.4,0.2,0.25,20.2,0.083,35,157,0.9998,3.17,0.50,9.1,5,white
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6709,7.2,0.2,0.19,7.7,0.045,53,176,0.9958,3.17,0.38,9.5,5,white
6710,6.7,0.3,0.34,7.5,0.036,39,124,0.9912,2.99,0.32,12.4,8,white
6711,6.6,0.3,0.24,3.3,0.034,29,99,0.9903,3.10,0.40,12.3,7,white
6712,8.0,0.2,0.31,5.6,0.049,24,97,0.9930,3.10,0.42,10.9,5,white


In [9]:
# Encode the categorical wine `type` as an integer and drop the feature
# columns that are not used for modelling.
#
# The cell rebinds `data` from itself, so it is written to be safe to re-run:
# integer codes produced by an earlier run map to themselves, and columns that
# have already been removed are skipped instead of raising KeyError.
type_codes = {'white': 1, 'red': 2}
unused_columns = ['sulphates', 'pH', 'fixed acidity', 'residual sugar',
                  'free sulfur dioxide', 'total sulfur dioxide']

# Identity entries (1 -> 1, 2 -> 2) keep a second run from turning every
# already-encoded value into NaN.
rerun_safe_codes = dict(type_codes)
rerun_safe_codes.update({code: code for code in type_codes.values()})
encoded_type = data['type'].map(rerun_safe_codes)

# Series.map yields NaN for labels missing from the mapping; fail loudly here
# rather than letting NaN flow silently into scaling and model fitting.
if encoded_type.isna().any():
    unknown_types = sorted(
        data.loc[encoded_type.isna(), 'type'].astype(str).unique()
    )
    raise ValueError("Unexpected wine type value(s): {}".format(unknown_types))

# assign() builds a new frame instead of mutating the loaded one in place.
data = data.assign(type=encoded_type).drop(columns=unused_columns, errors='ignore')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6714 entries, 0 to 6713
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   volatile acidity  6714 non-null   float64
 1   citric acid       6714 non-null   float64
 2   chlorides         6714 non-null   float64
 3   density           6714 non-null   float64
 4   alcohol           6714 non-null   float64
 5   quality           6714 non-null   int64  
 6   type              6714 non-null   int64  
dtypes: float64(5), int64(2)
memory usage: 367.3 KB


In [10]:
# Separate the features from the label column, hold out 20% of the rows for
# evaluation, and standardise the features.
target = 'quality'
y = data[target]
x = data.drop(columns=target)

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# The scaler learns its statistics from the training split only; the same
# fitted transform is then applied to both splits.
scaler = StandardScaler().fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)


In [11]:
# Exhaustive 5-fold cross-validated search over random-forest hyperparameters,
# scored by accuracy.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
clf = RandomForestClassifier(random_state=42)

# n_jobs=-1 parallelises the fits; verbose=2 prints progress while fitting.
search_options = dict(cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search = GridSearchCV(clf, param_grid, **search_options)
grid_search.fit(x_train, y_train)

print("Best parameters found: ", grid_search.best_params_)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters found:  {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


In [12]:
# Use the best model found by the grid search to predict on the held-out split.
best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(x_test)

# Evaluate the model: overall accuracy, per-class precision/recall/F1, and the
# confusion matrix of true vs. predicted quality classes.
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

report_sections = (
    ("Accuracy:", accuracy),
    ("Classification Report:\n", class_report),
    ("Confusion Matrix:\n", conf_matrix),
)
for heading, body in report_sections:
    print(heading, body)


Accuracy: 0.6664184661206255
Classification Report:
               precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.45      0.20      0.28        44
           5       0.68      0.70      0.69       464
           6       0.67      0.72      0.69       582
           7       0.68      0.61      0.64       212
           8       0.50      0.33      0.40        33
           9       0.00      0.00      0.00         4

    accuracy                           0.67      1343
   macro avg       0.43      0.37      0.39      1343
weighted avg       0.66      0.67      0.66      1343

Confusion Matrix:
 [[  0   1   2   0   1   0   0]
 [  1   9  23  11   0   0   0]
 [  0   7 327 118  10   2   0]
 [  0   3 121 419  38   1   0]
 [  0   0   8  69 129   6   0]
 [  0   0   1  10  10  11   1]
 [  0   0   0   1   1   2   0]]


In [14]:
# Load the test set and prepare it the same way as the training features:
# keep the ids aside, encode `type`, drop the unused columns (plus `id`), and
# apply the already-fitted scaler.
data_test = pd.read_csv('test.csv')

# NOTE(review): `id` shadows the Python builtin; the name is kept because the
# cell that writes the result file reads it.
id = data_test['id'].values

columns_to_drop = ['sulphates', 'pH', 'fixed acidity', 'residual sugar',
                   'free sulfur dioxide', 'total sulfur dioxide', "id"]
data_test = (
    data_test
    .assign(type=data_test['type'].map({'white': 1, 'red': 2}))
    .drop(columns=columns_to_drop)
)

# From here on `data_test` is the scaled NumPy array, not the DataFrame.
data_test = scaler.transform(data_test)
data_test

array([[-0.58132483,  0.36129652,  0.13002844, -0.13028639,  1.62428381,
         1.65694515],
       [-0.46454893,  0.49627069, -0.48120292,  0.93101544, -0.80653145,
        -0.60352028],
       [-0.17260918,  1.17114152, -0.20105521,  0.26976819, -0.89035267,
        -0.60352028],
       ...,
       [-0.52293688,  0.36129652,  0.4611121 ,  0.86489072, -0.72271024,
         1.65694515],
       [-0.34777303, -0.17860015, -0.53213887, -1.74703591,  1.95956867,
        -0.60352028],
       [-0.87326459,  0.29380943, -0.76135063, -0.32535433,  0.11550192,
        -0.60352028]])

In [15]:
# Predict a quality class for every row of the preprocessed test set; the bare
# name on the last line shows the predictions as the cell output.
data_predict = best_clf.predict(X=data_test)
data_predict

array([7, 6, 5, 8, 6, 5, 5, 7, 6, 7, 5, 6, 6, 7, 6, 6, 6, 6, 6, 5, 6, 6,
       5, 7, 5, 5, 6, 6, 5, 6, 6, 6, 6, 7, 6, 5, 6, 5, 6, 7, 5, 6, 5, 5,
       5, 5, 5, 6, 5, 6, 6, 5, 5, 5, 5, 7, 5, 5, 6, 5, 7, 6, 5, 5, 4, 6,
       5, 6, 5, 6, 5, 6, 5, 6, 6, 6, 5, 6, 5, 6, 5, 5, 7, 6, 5, 6, 7, 5,
       6, 5, 5, 5, 5, 6, 5, 5, 6, 5, 7, 5, 6, 6, 5, 6, 5, 6, 6, 7, 6, 5,
       5, 6, 5, 5, 5, 6, 5, 6, 5, 5, 5, 6, 6, 6, 5, 5, 5, 5, 6, 5, 5, 5,
       6, 5, 5, 6, 5, 7, 5, 7, 7, 6, 5, 6, 5, 7, 7, 5, 5, 5, 5, 5, 6, 5,
       6, 6, 6, 6, 6, 5, 6, 6, 5, 6, 5, 6, 5, 5, 6, 5, 5, 6, 8, 6, 6, 6,
       6, 7, 5, 5, 5, 6, 6, 6, 7, 6, 5, 7, 5, 6, 5, 7, 6, 5, 5, 5, 5, 7,
       5, 5, 6, 6, 6, 5, 5, 5, 6, 5, 8, 6, 5, 6, 7, 5, 6, 6, 5, 5, 7, 5,
       6, 6, 5, 5, 5, 7, 6, 6, 6, 5, 5, 5, 6, 6, 5, 6, 6, 7, 7, 4, 5, 5,
       5, 6, 6, 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 5, 6, 5, 6, 5, 5, 5, 7, 5,
       5, 5, 6, 5, 7, 6, 5, 6, 5, 7, 6, 5, 7, 6, 6, 7, 5, 6, 6, 5, 5, 8,
       6, 6, 6, 6, 5, 5, 5, 6, 5, 6, 6, 6, 6, 5, 6,

In [16]:
import csv

# Write the test-set predictions to `result.csv` as (id, quality) rows under a
# header line.
file_name = "result.csv"
column_names = ["id", "quality"]

# np.round is idempotent, so re-running the cell leaves both arrays unchanged.
id = np.round(id)
data_predict = np.round(data_predict)

# zip() silently truncates to the shorter input, which would drop rows from the
# output without any error; refuse to write a misaligned file instead.
if len(id) != len(data_predict):
    raise ValueError(
        "id and prediction counts differ: {} != {}".format(len(id), len(data_predict))
    )

# Use a dedicated name for the rows so the training DataFrame bound to `data`
# is not overwritten by a one-shot iterator (which would break re-running the
# earlier preprocessing and split cells).
submission_rows = zip(id, data_predict)

with open(file_name, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(column_names)
    writer.writerows(submission_rows)

print("File CSV đã được tạo và dữ liệu đã được ghi vào thành công!")


File CSV đã được tạo và dữ liệu đã được ghi vào thành công!
