In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the semicolon-separated wine table. Forward slashes keep this relative
# path valid on Windows, Linux and macOS; a backslash-separated literal is only
# interpreted as a path on Windows.
df = pd.read_csv('../Data/wine.csv', sep=';')
# Column names, used further down to label the per-feature ranking.
headers = df.columns.tolist()
# Independent 2-D array copy of the table; the fitting code below treats the
# last column as the regression target.
data = np.array(df)

In [4]:
# a. Use all 11 features provided by the assignment.
def calc_x_hat(data):
    """Fit least-squares linear weights (no intercept term) via the pseudo-inverse.

    Parameters
    ----------
    data : 2-D numpy array
        Every column except the last is a feature column; the last column
        is the target.

    Returns
    -------
    1-D numpy array holding one weight per feature column, computed as
    ``pinv(features) @ target``.
    """
    feature_matrix = data[:, :-1]
    # Flatten the (n, 1) target column into a length-n vector.
    target = data[:, -1:].reshape(-1)
    pseudo_inverse = np.linalg.pinv(feature_matrix)
    return np.matmul(pseudo_inverse, target)

# Fit on the complete data set and print every weight as x1, x2, ... (1-based).
x_hat = calc_x_hat(data)
for position, weight in enumerate(x_hat, start=1):
    print('x' + str(position) + ' = ' + str(weight))

x1 = 0.0059251613739160075
x2 = -1.1080375422598763
x3 = -0.26304628369886207
x4 = 0.015322283066628407
x5 = -1.7305027430592603
x6 = 0.003801419076872639
x7 = -0.003898998694544565
x8 = 4.338587684504782
x9 = -0.45853547521854976
x10 = 0.7297186624705582
x11 = 0.30885864844890126


In [8]:
# b. Use the single feature that gives the best result (cross-validation method).
def sub_calc_avg_residual(train, test):
    """Fit weights on ``train`` and return the mean absolute error on ``test``.

    Parameters
    ----------
    train : 2-D numpy array
        Rows used for fitting; feature columns first, target in the last
        column (the layout ``calc_x_hat`` expects).
    test : 2-D array-like
        Held-out rows with the same column layout as ``train``.

    Returns
    -------
    Mean of ``|prediction - target|`` over the rows of ``test``.

    Raises
    ------
    ValueError
        If ``test`` has no rows, since the mean error would be undefined.
    """
    test = np.asarray(test)
    if len(test) == 0:
        raise ValueError('test must contain at least one row')

    x_hat = calc_x_hat(train)
    # One matrix-vector product replaces the per-row, per-feature Python loops.
    predictions = test[:, :-1] @ x_hat
    return np.mean(np.abs(predictions - test[:, -1]))


def calc_avg_residual(data, model_num):
    """Estimate the held-out mean absolute error with k-fold cross-validation.

    The rows of ``data`` are cut, in their existing order (no shuffling), into
    ``model_num`` contiguous folds of near-equal size. Each fold is held out
    once while the model is fitted on the remaining folds, and the per-fold
    errors are averaged.

    Parameters
    ----------
    data : 2-D numpy array
        Feature columns first, target in the last column.
    model_num : int
        Number of folds; must satisfy ``2 <= model_num <= len(data)``.

    Returns
    -------
    Average over all folds of the value returned by ``sub_calc_avg_residual``.

    Raises
    ------
    ValueError
        If ``model_num`` is outside the valid range, because that would leave
        either no training rows or an empty fold.
    """
    if model_num < 2 or model_num > len(data):
        raise ValueError(
            'model_num must be between 2 and the number of rows ('
            + str(len(data)) + '), got ' + str(model_num)
        )

    # array_split spreads any remainder over the leading folds, so no fold can
    # end up empty (rounding the fold size could leave the last one empty).
    folds = np.array_split(data, model_num, axis=0)

    fold_errors = []
    for held_out in range(model_num):
        train = np.concatenate(folds[:held_out] + folds[held_out + 1:], axis=0)
        fold_errors.append(sub_calc_avg_residual(train, folds[held_out]))

    return sum(fold_errors) / model_num


def cross_validation(data, model_num=4):
    """Rank each feature by the cross-validated error of a one-feature model.

    For every feature column, a two-column array (that feature plus the
    target from the last column) is scored with ``calc_avg_residual``.

    Parameters
    ----------
    data : 2-D numpy array
        Feature columns first, target in the last column.
    model_num : int, default 4
        Number of folds forwarded to ``calc_avg_residual``.

    Returns
    -------
    (order, scores) : tuple of numpy arrays
        ``order`` lists feature column indices from lowest to highest score;
        ``scores`` holds the scores sorted in ascending order.
    """
    feature_count = len(data[0]) - 1
    target_column = data[:, -1:]
    scores = [
        calc_avg_residual(
            np.hstack((data[:, col:col + 1], target_column)), model_num
        )
        for col in range(feature_count)
    ]
    return np.argsort(scores), np.sort(scores)


# Print the features from lowest to highest cross-validated error.
indices, values = cross_validation(data)
for rank, (column, score) in enumerate(zip(indices, values), start=1):
    print('#' + str(rank) + ': ' + headers[column] + ' (' + str(score) + ')')

#1: alcohol (0.547671112842504)
#2: density (0.7147989733781712)
#3: pH (0.7426436192032513)
#4: fixed acidity (1.07724212539448)
#5: sulphates (1.0982698120183385)
#6: volatile acidity (1.864347660175296)
#7: residual sugar (2.014247435112465)
#8: chlorides (2.1095866742816884)
#9: citric acid (2.648583394468612)
#10: free sulfur dioxide (2.843535296019925)
#11: total sulfur dioxide (3.1392660467790603)
