In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, ExtraTreesRegressor, ExtraTreesClassifier, StackingRegressor, GradientBoostingRegressor
from sklearn.datasets import make_regression
from sklearn.linear_model import Lasso
from sklearn.neural_network import MLPRegressor
import random
import seaborn as sns
import warnings as wr
wr.filterwarnings('ignore')


In [22]:
train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test.csv')

In [23]:
def convert_color(color):
    if color == 'white':
        return 1000
    else:
        return 0

# Áp dụng hàm chuyển đổi cho mỗi giá trị trong cột 'colors'
train_data['type'] = train_data['type'].apply(convert_color)
test_data['type'] = test_data['type'].apply(convert_color)

In [24]:
train_data['feature_axit'] = train_data['fixed acidity'] * train_data['citric acid'] * train_data['volatile acidity'] * train_data['chlorides']
# train_data['feature_dioxide'] = train_data['free sulfur dioxide'] + train_data['total sulfur dioxide'] + train_data['sulphates']
train_data['feature_new'] = train_data['total sulfur dioxide'] * train_data['alcohol']
train_data['total acidity'] = train_data['fixed acidity'] + train_data['volatile acidity']
train_data['citric/total'] = train_data['citric acid'] / train_data['total acidity']
train_data['feature_fsd'] = train_data['free sulfur dioxide'] / (1 + 10**(train_data['pH'] - 1.81))
train_data['alcohol_times_ph'] = train_data['pH'] * train_data['alcohol']

In [25]:
test_data['feature_axit'] = test_data['fixed acidity'] * test_data['citric acid'] * test_data['volatile acidity'] * test_data['chlorides']
# test_data['feature_dioxide'] = test_data['free sulfur dioxide'] + test_data['total sulfur dioxide'] + test_data['sulphates']
test_data['feature_new'] = test_data['total sulfur dioxide'] * test_data['alcohol']
test_data['total acidity'] = test_data['fixed acidity'] + test_data['volatile acidity']
test_data['citric/total'] = test_data['citric acid'] / test_data['total acidity']
test_data['feature_fsd'] = test_data['free sulfur dioxide'] / (1 + 10**(test_data['pH'] - 1.81))
test_data['alcohol_times_ph'] = test_data['pH'] * test_data['alcohol']

In [7]:
col_name = train_data.columns.tolist()

In [8]:
def add_quality(x,y):
    global train_data
    ql = train_data.quality
    list_ql = []
    for i in range(len(train_data)):
        if ql[i] == x: list_ql.append(train_data.iloc[i].values)
    matrix = np.array(list_ql)
    m = 0 
    while m < y:
        for i in range(train_data.shape[1]): random.shuffle(matrix[:,i])
        train_data = list(train_data.values) + list(matrix)
        train_data = pd.DataFrame(train_data)
        m += 1
    train_data.columns = col_name

In [9]:
train_data.quality.value_counts()

quality
6    2809
5    2365
7    1062
4     230
8     212
3      23
9      13
Name: count, dtype: int64

In [10]:
add_quality(7,18)
add_quality(5,8)
add_quality(6,7)
add_quality(4,40)
add_quality(8,40)

In [11]:
train_data.quality.value_counts()

quality
6.0    22472
5.0    21285
7.0    20178
4.0     9430
8.0     8692
3.0       23
9.0       13
Name: count, dtype: int64

In [12]:
col = ['alcohol','alcohol_times_ph','feature_fsd','feature_axit','sulphates','citric/total','feature_axit','feature_new','type','density','chlorides','volatile acidity','free sulfur dioxide','total sulfur dioxide','citric acid','fixed acidity']

In [13]:
train_data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type,feature_axit,feature_new,total acidity,citric/total,feature_fsd,alcohol_times_ph
0,6.6,0.3,0.36,1.2,0.035,43.0,126.0,0.9909,3.01,0.63,11.4,6.0,1000.0,0.024948,1436.4,6.9,0.052174,2.552091,34.314
1,7.7,0.5,0.26,1.9,0.062,9.0,31.0,0.9966,3.39,0.64,9.6,5.0,0.0,0.062062,297.6,8.2,0.031707,0.230657,32.544
2,8.4,0.5,0.35,2.9,0.076,21.0,127.0,0.9976,3.23,0.63,9.2,5.0,0.0,0.111720,1168.4,8.9,0.039326,0.769155,29.716
3,7.5,0.4,0.33,5.0,0.045,30.0,131.0,0.9942,3.32,0.44,10.9,6.0,1000.0,0.044550,1427.9,7.9,0.041772,0.899298,36.188
4,6.4,0.2,0.25,20.2,0.083,35.0,157.0,0.9998,3.17,0.50,9.1,5.0,1000.0,0.026560,1428.7,6.6,0.037879,1.463904,28.847
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82088,6.2,0.3,0.32,1.7,0.033,30.0,128.0,0.9926,3.25,0.73,13.4,8.0,1000.0,0.011206,655.5,7.0,0.036000,0.407486,40.176
82089,6.5,0.4,0.30,4.1,0.087,3.0,88.0,0.9904,3.08,0.46,12.7,8.0,1000.0,0.020155,1522.8,8.1,0.082979,0.850113,41.363
82090,6.2,0.3,0.32,1.4,0.041,39.0,213.0,0.9936,3.24,0.46,13.0,8.0,1000.0,0.025536,1650.0,8.3,0.047368,0.501516,25.872
82091,12.6,0.4,0.28,9.1,0.052,96.0,110.0,0.9981,3.23,0.48,11.2,8.0,1000.0,0.022469,2022.7,7.2,0.036620,1.692633,40.440


In [14]:
X = train_data .drop(columns = ['quality'])
y = train_data['quality']
print(X.shape, y.shape)

(82093, 18) (82093,)


In [15]:
X_test = test_data.drop(columns = ['id'])
print(X_test.shape)

(820, 18)


In [None]:
while True:
    ex_reg = ExtraTreesRegressor()
    ex_reg.fit(X, y)
    y_pred = ex_reg.predict(X_test)
    demo_data = pd.read_csv("./predicted_scores.csv")
    y_test = demo_data['quality']
    rmse = np.sqrt(mean_squared_error(y_pred, y_test))
    print('RMSE: ', rmse)
    if rmse < 0.74: break
    else: continue

RMSE:  0.7507829652939721
RMSE:  0.7553977106263376
RMSE:  0.7542463731854564
RMSE:  0.7434089656736071
RMSE:  0.7496114440649707
RMSE:  0.7573788843044437
RMSE:  0.7490717222816192
RMSE:  0.7477544228082293


In [None]:
stt = test_data.id.values
df1 = pd.DataFrame(stt.T, columns = ['id'])
df2 = pd.DataFrame(y_pred.T, columns = ['quality'])
df = pd.concat([df1,df2], axis = 1)
df.to_csv("/Users/daoxuanbac/Desktop/Python/SU23_AIL/Wine_Quality/Result/EXT83.csv", index = False)

In [26]:
df = pd.read_csv("predicted_scores.csv")
np.unique(df["quality"], return_counts=True)

(array([3, 4, 5, 6, 7, 8], dtype=int64),
 array([ 11,  49, 198, 356, 168,  38], dtype=int64))