In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the wine-quality table. Forward slashes keep the relative path valid
# on both Windows and POSIX systems (a backslash-separated path only
# resolves on Windows).
df = pd.read_csv('../Data/wine.csv', sep=';')
# Column names, in file order; used later to label the feature ranking.
headers = df.columns.tolist()
# Plain 2-D array of the table for the NumPy routines in the cells below.
data = df.to_numpy()
# Show the frame as the cell output.
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1194,7.0,0.745,0.12,1.8,0.114,15.0,64,0.99588,3.22,0.59,9.5,6
1195,6.2,0.430,0.22,1.8,0.078,21.0,56,0.99633,3.52,0.60,9.5,6
1196,7.9,0.580,0.23,2.3,0.076,23.0,94,0.99686,3.21,0.58,9.5,6
1197,7.7,0.570,0.21,1.5,0.069,4.0,9,0.99458,3.16,0.54,9.8,6


In [3]:
# a. Use all 11 features provided by the assignment.
def calc_x_hat(data):
    """Return least-squares coefficients mapping the feature columns to the target.

    Every column of ``data`` except the last is treated as a feature and the
    last column as the target. The coefficients are obtained by applying the
    pseudo-inverse of the feature matrix to the target vector; no intercept
    column is added.

    Parameters
    ----------
    data : 2-D array whose last column is the target.

    Returns
    -------
    1-D array holding one coefficient per feature column.
    """
    features, target = data[:, :-1], data[:, -1]
    return np.linalg.pinv(features).dot(target)

# Fit on the whole data set and list the coefficients as x1, x2, ...
x_hat = calc_x_hat(data)
for position, coefficient in enumerate(x_hat, start=1):
    print('x' + str(position) + ' = ' + str(coefficient))

x1 = 0.0059251613739160075
x2 = -1.1080375422598763
x3 = -0.26304628369886207
x4 = 0.015322283066628407
x5 = -1.7305027430592603
x6 = 0.003801419076872639
x7 = -0.003898998694544565
x8 = 4.338587684504782
x9 = -0.45853547521854976
x10 = 0.7297186624705582
x11 = 0.30885864844890126


In [9]:
# b. Use the single feature that gives the best result (cross-validation method).
def sub_calc_avg_residual(train, test):
    """Mean absolute residual on ``test`` of a model fitted on ``train``.

    The coefficients come from ``calc_x_hat(train)``. Each test row is
    predicted as the dot product of its leading feature columns with those
    coefficients, and the prediction is compared with the row's last column.

    Parameters
    ----------
    train : 2-D array used for fitting; last column is the target.
    test : 2-D array (or nested sequence) with the same column layout.

    Returns
    -------
    The average of ``|prediction - target|`` over all test rows.

    Raises
    ------
    ValueError
        If ``test`` has no rows, since the average would be undefined.
    """
    test = np.asarray(test)
    if len(test) == 0:
        raise ValueError('test set is empty; cannot average residuals')

    x_hat = calc_x_hat(train)
    # Only the first len(x_hat) columns are features; the last one is the target.
    predictions = test[:, :len(x_hat)] @ x_hat
    return np.mean(np.abs(predictions - test[:, -1]))


def calc_avg_residual(data, model_num):
    """Cross-validate the least-squares model over ``model_num`` contiguous folds.

    The rows are cut, in order, into ``model_num`` folds: the first
    ``model_num - 1`` folds have ``round(len(data) / model_num)`` rows each and
    the last fold takes whatever remains. Every fold serves as the test set
    exactly once while the other folds form the training set.

    Parameters
    ----------
    data : 2-D array whose last column is the target.
    model_num : number of folds; must be at least 2.

    Returns
    -------
    (mean, per_fold) where ``per_fold`` is the list of mean absolute residuals
    of the individual folds and ``mean`` is their average. ``per_fold`` is
    ordered by evaluation: the last fold first, then fold 0, fold 1, ... up to
    fold ``model_num - 2``.

    Raises
    ------
    ValueError
        If ``model_num`` is below 2, or if the rounding of the fold size leaves
        any fold without rows (e.g. 6 rows split into 4 folds).
    """
    if model_num < 2:
        raise ValueError('model_num must be at least 2, got ' + str(model_num))

    n = int(np.round(len(data) / model_num))
    folds = [data[i * n:(i + 1) * n, :] for i in range(model_num - 1)]
    folds.append(data[(model_num - 1) * n:, :])

    # Rounding the fold size up can push the boundaries past the end of the
    # data; fail early instead of dividing by zero on an empty test fold.
    if any(len(fold) == 0 for fold in folds):
        raise ValueError(
            'cannot split ' + str(len(data)) + ' rows into '
            + str(model_num) + ' non-empty folds'
        )

    avg_residuals = []
    for i in range(model_num):
        # The fold currently sitting in the last slot is held out for testing.
        train = np.concatenate(folds[:-1], axis=0)
        test = folds[-1]
        avg_residuals.append(sub_calc_avg_residual(train, test))

        # Rotate: move fold i into the held-out slot for the next round.
        folds[-1], folds[i] = folds[i], folds[-1]

    return np.average(avg_residuals), avg_residuals


def feature_ranking(data, model_num=4):
    """Rank each feature column by its single-feature cross-validated error.

    For every column except the last, a two-column array (that feature plus
    the target column) is passed to ``calc_avg_residual``.

    Parameters
    ----------
    data : 2-D array whose last column is the target.
    model_num : number of folds forwarded to ``calc_avg_residual``.

    Returns
    -------
    (order, avg_residuals, sub_avg_residuals) where ``avg_residuals[i]`` and
    ``sub_avg_residuals[i]`` are the mean and per-fold residuals of feature
    ``i``, and ``order`` is ``np.argsort(avg_residuals)``, i.e. feature
    indices from lowest to highest mean residual.
    """
    target = data[:, -1]
    per_feature = [
        calc_avg_residual(np.column_stack((data[:, col], target)), model_num)
        for col in range(len(data[0]) - 1)
    ]

    avg_residuals = [mean for mean, _ in per_feature]
    sub_avg_residuals = [fold_values for _, fold_values in per_feature]
    return np.argsort(avg_residuals), avg_residuals, sub_avg_residuals


def display_ranking(data, headers, model_num=4):
    """Print a table ranking the features by single-feature cross-validated error.

    Parameters
    ----------
    data : 2-D array whose last column is the target.
    headers : column names of ``data``; ``headers[i]`` labels feature ``i``.
    model_num : number of cross-validation folds (default 4). One error
        column is printed per fold, labelled A, B, C, ... in the order the
        folds are returned by ``feature_ranking``.

    The table has one row per feature, best (lowest mean residual) first,
    with the feature name, the per-fold residuals, their mean and the rank.
    """
    indices, avg_residuals, sub_avg_residuals = feature_ranking(data, model_num)

    # With the default of 4 folds this yields "Sai số A" .. "Sai số D".
    fold_labels = ["Sai số " + chr(ord("A") + k) for k in range(model_num)]
    columns = ["Tính chất", *fold_labels, "Sai số TB", "Xếp hạng"]

    ranking_data = [
        [headers[idx], *sub_avg_residuals[idx], avg_residuals[idx], rank]
        for rank, idx in enumerate(indices, start=1)
    ]

    tab = pd.DataFrame(ranking_data, columns=columns)
    print(tab)


# Print the single-feature ranking table for the loaded data set.
display_ranking(data, headers)

               Tính chất  Sai số A  Sai số B  Sai số C  Sai số D  Sai số TB  \
0                alcohol  0.556146  0.525760  0.596566  0.512212   0.547671   
1                density  0.782727  0.696102  0.685753  0.694613   0.714799   
2                     pH  0.791555  0.722449  0.736420  0.720151   0.742644   
3          fixed acidity  1.221617  0.856488  1.239303  0.991560   1.077242   
4              sulphates  1.200110  1.253365  0.935362  1.004242   1.098270   
5       volatile acidity  2.296810  1.618290  1.841731  1.700559   1.864348   
6         residual sugar  2.545658  1.988977  1.894290  1.628064   2.014247   
7              chlorides  2.536739  1.932540  2.099208  1.869860   2.109587   
8            citric acid  2.488746  2.948598  2.370226  2.786764   2.648583   
9    free sulfur dioxide  3.255725  2.518466  3.123854  2.476097   2.843535   
10  total sulfur dioxide  3.647015  2.798878  3.230203  2.880968   3.139266   

    Xếp hạng  
0          1  
1          2  
2     

In [5]:
# c. Build your own model that gives the best result (take the 6 best-performing features).
def my_cross_validation(data, model_num=4, n_features=6):
    """Cross-validated error of a model restricted to the top-ranked features.

    The features are ranked with ``feature_ranking`` using the same number of
    folds as the final evaluation, the ``n_features`` best ones are kept
    (in ranking order) together with the target column, and the reduced data
    set is evaluated with ``calc_avg_residual``.

    Parameters
    ----------
    data : 2-D array whose last column is the target.
    model_num : number of cross-validation folds for ranking and evaluation.
    n_features : how many top-ranked features to keep (default 6).

    Returns
    -------
    The mean absolute residual averaged over the folds.
    """
    # NOTE(review): the ranking is computed on the full data set, including
    # the rows later used as test folds, so the reported error may be
    # optimistic.
    indices, _, _ = feature_ranking(data, model_num)
    best_indices = indices[:n_features]

    # Keep each selected feature as an (n, 1) column, then append the target.
    columns = [data[:, i:i + 1] for i in best_indices]
    columns.append(data[:, -1:])
    best_data = np.concatenate(columns, axis=1)

    return calc_avg_residual(best_data, model_num)[0]


def original_cross_validation(data, model_num=4):
    """Cross-validated mean absolute residual using every feature column.

    Delegates to ``calc_avg_residual`` on the unmodified ``data`` and returns
    only the overall mean, discarding the per-fold values.
    """
    mean_residual, _ = calc_avg_residual(data, model_num)
    return mean_residual


# Compare the reduced-feature model with the all-feature model from part a.
my_residual = my_cross_validation(data)
original_residual = original_cross_validation(data)
residual_gap = np.abs(my_residual - original_residual)

for label, value in (
    ('Sai số theo phương pháp của em: ', my_residual),
    ('Sai số theo phương pháp của câu a: ', original_residual),
    ('Chênh lệch giữa 2 sai số: ', residual_gap),
):
    print(label + str(value))

Sai số theo phương pháp của em: 0.5312640513600049
Sai số theo phương pháp của câu a: 0.5150558048747267
Chênh lệch giữa 2 sai số: 0.016208246485278188
