## Wczytaj potrzebne biblioteki

In [2]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy import signal
%matplotlib inline

from sklearn import linear_model
from sklearn.metrics import r2_score
from scipy.optimize import differential_evolution
from sklearn import preprocessing

print("pandas version: {}".format(pd.__version__))
print("numpy version: {}".format(np.__version__))
print("matplotlib version: {}".format(mpl.__version__))

pandas version: 1.0.3
numpy version: 1.18.4
matplotlib version: 3.2.0


## Wczytaj dane

In [3]:
def changeDateToSeconds(df):
    """Replace the ``date`` column with seconds elapsed since the first row.

    The frame is modified in place and also returned, so existing callers
    that use either the argument or the return value keep working.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-typed ``date`` column.

    Returns
    -------
    pandas.DataFrame
        The same frame; ``date`` now holds float seconds relative to the
        first row. An empty frame is returned unchanged.
    """
    if len(df) == 0:
        return df

    # Positional access: the first row is the reference even when the index
    # does not contain the label 0 (e.g. after slicing or filtering).
    first = df["date"].iloc[0]

    # total_seconds() gives the full elapsed time. Timedelta.seconds is only
    # the within-day component and wraps around after 24 hours.
    df["date"] = (df["date"] - first).dt.total_seconds()
    return df

def readDataFromExcel(path, sheet):
    """Load one worksheet and convert its ``date`` column to elapsed seconds.

    Cells containing "Bad Data" (with or without a leading space) are read
    as missing values.

    Parameters
    ----------
    path : str
        Path of the Excel workbook.
    sheet : str
        Name of the worksheet to read.

    Returns
    -------
    pandas.DataFrame
        The sheet contents after ``changeDateToSeconds`` has been applied.
    """
    missingMarkers = [" Bad Data", "Bad Data"]
    frame = pd.read_excel(path, sheet_name=sheet, na_values=missingMarkers)
    frame["date"] = pd.to_datetime(frame["date"])
    return changeDateToSeconds(frame)

# Every recording is a separate sheet of the same workbook.
df2, df3, df5, df6 = [
    readDataFromExcel("./data/K-1_MI.xlsx", sheetName)
    for sheetName in ("d2", "d3", "d5", "d6")
]

# Keep every 5th row of d6 to match the sampling periods of the other sheets.
df6 = df6.iloc[::5, :]

dataFrames = [df2, df3, df5, df6]

## Skalowanie i oczyszczanie danych

In [4]:
# Fit a single StandardScaler on all recordings stacked together, so every
# data set is later transformed with the same mean and variance.
# pd.concat replaces the chained DataFrame.append calls (append is deprecated
# and no longer exists in pandas 2.x); row order and index labels are the same.
# Note: despite its name, scaledDf holds the *unscaled* stacked data; it is
# only used to fit the scaler and to remember the column order.
scaledDf = pd.concat(dataFrames).drop(["date"], axis=1)
scaler = preprocessing.StandardScaler()
columns = scaledDf.columns

scaler.fit(scaledDf)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [5]:
# Apply the fitted scaler to each recording separately. transform() returns a
# plain array, so every result is wrapped back into a DataFrame that carries
# the original signal names (the "date" column is dropped before scaling).
scaledDataFrames = [
    pd.DataFrame(scaler.transform(frame.drop(["date"], axis=1)), columns=columns)
    for frame in dataFrames
]

### Tworzenie macierzy regresji

In [6]:
def prepareSignals(df, inputs, output, medfilt=False, kernelSize=11):
    """Build the regressor array and the target array for one data set.

    The regressors are the ``inputs`` columns followed by the ``output``
    column(s) shifted down by one row (the previous sample of the output).
    The first row is dropped from both arrays because the shift leaves it
    without a previous output value.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    inputs : list of str
        Names of the input columns.
    output : list of str
        Name(s) of the output column(s).
    medfilt : bool, optional
        When equal to ``True`` both arrays are passed through
        ``applyMedianFilter``.
        NOTE(review): ``applyMedianFilter`` is not defined in the visible part
        of this notebook - confirm it exists before enabling this option.
    kernelSize : int, optional
        Kernel size forwarded to ``applyMedianFilter``.

    Returns
    -------
    tuple
        ``(u_signals, y_signal)`` as NumPy arrays.
    """
    target = df[output]
    previousTarget = target.shift(1)

    regressors = pd.concat([df[inputs], previousTarget], axis=1).iloc[1:].to_numpy()
    targets = target.iloc[1:].to_numpy()

    if medfilt == True:
        return (
            applyMedianFilter(regressors, kernelSize),
            applyMedianFilter(targets, kernelSize),
        )
    return (regressors, targets)

In [7]:
def createModelMatrixForSingleInput(data, order, delay, exponent):
    """Build the regression sub-matrix of a single signal.

    The result has ``len(data) - order - delay`` rows and
    ``(order + 1) * exponent`` columns. For every lag ``j`` in ``0..order``
    and every power ``p`` in ``1..exponent`` the column
    ``(order - j) * exponent + (p - 1)`` holds ``data[j : j + rows] ** p``.
    ``delay`` only reduces the number of rows; the row shift that realises
    the delay is applied by the caller.

    Parameters
    ----------
    data : numpy.ndarray
        One-dimensional samples of the signal.
    order : int
        Number of additional lagged copies (must be >= 0).
    delay : int
        Delay in samples (must be >= 0).
    exponent : int
        Highest power generated for each lag (must be > 0).

    Returns
    -------
    numpy.ndarray
        The regression sub-matrix.

    Raises
    ------
    AssertionError
        If any structure parameter is outside its allowed range.
    """
    invalid = order < 0 or delay < 0 or exponent <= 0
    if invalid:
        raise AssertionError("Invalid structure parameter")

    rowCount = data.shape[0] - (order + delay)
    A = np.zeros([rowCount, (order + 1) * exponent])

    for lag in range(order + 1):
        window = data[lag : rowCount + lag]
        # Larger lags (newer samples within a row) go to the leftmost columns.
        firstColumn = (order - lag) * exponent
        for power in range(1, exponent + 1):
            A[:, firstColumn + power - 1] = np.power(window, power)

    return A

In [8]:
def findMaxDelayAndOrder(M):
    """Return the largest delay and the largest order found in ``M``.

    ``M`` has three rows - order, delay, exponent - and one column per
    signal. Both maxima are floored at 0, which is also the result when
    ``M`` has no columns.

    Returns
    -------
    tuple
        ``(maxDelay, maxOrder)`` - note that the delay comes first.
    """
    orders, delays, _exponents = M
    # The leading 0 keeps the floor and makes max() safe for an empty row.
    maxDelay = max([0, *delays])
    maxOrder = max([0, *orders])
    return maxDelay, maxOrder

In [9]:
def createModelMatrix(data, M):
    """Assemble the full regression matrix from the per-signal sub-matrices.

    Parameters
    ----------
    data : numpy.ndarray
        Two-dimensional array with one column per signal.
    M : numpy.ndarray
        Structure matrix with three rows (order, delay, exponent) and one
        column per column of ``data``.

    Returns
    -------
    numpy.ndarray
        Matrix with ``data.shape[0] - maxOrder - maxDelay`` rows; the
        sub-matrices of the individual signals are placed side by side in
        column order.

    Raises
    ------
    AssertionError
        If ``M`` does not have three rows or its column count differs from
        the column count of ``data``.
    """
    if M.shape[0] != 3:
        raise AssertionError("Invalid parameter vector size")

    if M.shape[1] != data.shape[1]:
        raise AssertionError("Mismatched size of data: {} and M: {} vector".format(data.shape, M.shape))

    maxDelay, maxOrder = findMaxDelayAndOrder(M)
    commonRows = data.shape[0] - maxOrder - maxDelay

    # Start with a zero-width block so the result has the right height even
    # when there are no signals at all.
    blocks = [np.empty(shape=(commonRows, 0))]
    for column, (order, delay, exponent) in enumerate(M.T):
        # Build the sub-matrix of this signal.
        Ap = createModelMatrixForSingleInput(data[:, column], order, delay, exponent)

        # Drop leading rows so every signal contributes `commonRows` rows.
        # NOTE(review): the original comment described trimming "delay from
        # the top and order from the bottom", but the code removes both
        # differences from the top - confirm which alignment is intended.
        leadingRows = np.abs(maxOrder - order) + np.abs(maxDelay - delay)
        blocks.append(Ap[leadingRows:][:commonRows])

    return np.concatenate(blocks, axis=1)

## Oblicz wynik modelu

In [10]:
def evalModelScore(model, M, u_verif, y_verif):
    """Score a fitted model in a recursive (free-run) simulation.

    A window of ``maxOrder + maxDelay + 1`` rows slides over ``u_verif``.
    After each prediction the window advances by one row and the last
    column of the newest row is overwritten with the model's own
    prediction, so the model is driven by its previous outputs, not by
    the measured ones. ``prepareSignals`` puts the previous output sample
    in that last column.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    M : numpy.ndarray
        Structure matrix (rows: order, delay, exponent).
    u_verif : numpy.ndarray
        Regressor signals; not modified (the window is a copy).
    y_verif : numpy.ndarray
        Measured output.

    Returns
    -------
    float
        R^2 between ``y_verif[maxDelay : len(y_verif) - maxOrder]`` and the
        simulated outputs.

    NOTE(review): every regression row spans ``maxOrder + maxDelay + 1``
    samples while the reference slice starts at ``maxDelay`` - confirm that
    this target alignment is the intended one.
    """
    maxDelay, maxOrder = findMaxDelayAndOrder(M)
    numberOfSamples = y_verif.shape[0]
    output_verif_cut = y_verif[maxDelay : numberOfSamples - maxOrder]

    windowLength = maxOrder + maxDelay + 1
    u_verif_wip = np.array(u_verif[0:windowLength], copy=True)

    # Collect predictions in a list and join once at the end; np.append in
    # the loop copied the whole result array on every step.
    predictions = []
    for step in range(len(output_verif_cut)):
        A_verif = createModelMatrix(u_verif_wip, M)
        prediction = np.ravel(model.predict(A_verif))
        predictions.append(prediction)

        nextRow = u_verif[windowLength + step : windowLength + step + 1]
        if nextRow.shape[0] == 0:
            # Input data exhausted: nothing left to simulate, so skip the
            # window update that would otherwise shrink the window.
            break

        u_verif_wip = np.vstack((u_verif_wip[1:], nextRow))
        # Feed the prediction back as a true scalar (assigning a size-1
        # array to a single element is deprecated in recent NumPy).
        u_verif_wip[-1][-1] = prediction[-1]

    model_outputs = np.concatenate(predictions) if predictions else np.array([])
    return r2_score(output_verif_cut, model_outputs)

In [11]:
def crossValidate(M, dataFrames, inputs, outputs):
    """Leave-one-data-set-out cross-validation of a model structure.

    Each data set is used once for verification while a linear regression
    is fitted on the regression matrices of all the others; the returned
    value is the mean of the verification scores from ``evalModelScore``.

    Parameters
    ----------
    M : numpy.ndarray
        Structure matrix (rows: order, delay, exponent).
    dataFrames : list of pandas.DataFrame
        At least two data sets.
    inputs : list of str
        Input column names.
    outputs : list of str
        Output column name(s).

    Returns
    -------
    float
        Mean verification score over all folds.

    Raises
    ------
    ValueError
        If fewer than two data sets are given (nothing to learn from).
    """
    if len(dataFrames) < 2:
        raise ValueError("crossValidate needs at least two data frames")

    maxDelay, maxOrder = findMaxDelayAndOrder(M)

    # Signals and regression matrices depend only on the data set, not on the
    # fold, so build them once and not once per (verif, learn) pair.
    prepared = [prepareSignals(df, inputs, outputs) for df in dataFrames]
    learnBlocks = [
        (createModelMatrix(u, M), y[maxDelay : y.shape[0] - maxOrder])
        for u, y in prepared
    ]

    scores = []
    for verifIndex, (u_verif, y_verif) in enumerate(prepared):
        # Select the learning folds by position. Comparing frame contents
        # with DataFrame.equals is costly and would also drop any data set
        # that happens to be an exact duplicate of the verification one.
        learnIndices = [i for i in range(len(prepared)) if i != verifIndex]
        A_learn = np.vstack([learnBlocks[i][0] for i in learnIndices])
        y_learn = np.vstack([learnBlocks[i][1] for i in learnIndices])

        model = linear_model.LinearRegression().fit(A_learn, y_learn)
        scores.append(evalModelScore(model, M, u_verif, y_verif))

    return sum(scores) / len(scores)

## Ręczne szukanie struktury modelu

## Uruchom algorytm genetyczny do znalezienia struktury modelu

In [12]:
inputs = ["FWF", "PP", "DP"]
output = ["LT01"]

# Scores already computed, keyed by the integer structure parameters.
cache = {}

def goalFunction(x):
    """Objective for the structure search: negated cross-validation score.

    ``x`` is a flat vector of ``3 * (len(inputs) + 1)`` numbers holding
    (order, delay, exponent) for each input followed by the fed-back output.
    Every value is truncated to an integer with ``int()``.
    NOTE(review): because of the truncation an upper bound of the search
    range is only reached when the optimizer proposes it exactly - confirm
    this is intended.

    Parameters
    ----------
    x : sequence of float
        Candidate structure proposed by the optimizer.

    Returns
    -------
    float
        ``-score``; the score is negated because the optimizer minimises.

    Raises
    ------
    ValueError
        If ``x`` does not have ``3 * (len(inputs) + 1)`` elements.
    """
    # The tuple of integers is the cache key itself. Hashing the printed
    # matrix could collide and return the score of a different structure.
    key = tuple(int(value) for value in x)

    if key not in cache:
        # Build M from x: x[3*col + row] -> M[row, col]. reshape validates
        # the length, so M can never contain uninitialised entries.
        M = np.array(key, dtype=np.int32).reshape(len(inputs) + 1, 3).T
        cache[key] = crossValidate(M, scaledDataFrames, inputs, output)

    return (-1) * cache[key]

In [15]:
# Search range per signal: order in [0, 5], delay in [0, 30], exponent in [1, 2];
# one triple for every input plus one for the fed-back output.
bounds = [(0, 5), (0, 30), (1, 2)] * (len(inputs) + 1)
# Second-to-last entry = delay of the last signal (the fed-back output): fixed to 1.
bounds[-2] = (1, 1)

# seed makes the stochastic search reproducible between runs.
# updating="deferred" is stated explicitly because it is the mode used with
# parallel workers.
# NOTE(review): tol=10.0 is a very loose convergence tolerance (the recorded
# output shows a single step) - confirm it is intended.
# NOTE(review): with workers=-1 the module-level `cache` is presumably not
# shared between worker processes - verify it is still useful.
result = differential_evolution(
    goalFunction,
    bounds,
    disp=True,
    polish=False,
    tol=10.0,
    seed=0,
    updating="deferred",
    workers=-1,
)

differential_evolution step 1: f(x)= 0.0139437


In [13]:
# Display the optimizer's result object. The name only exists after the
# differential_evolution cell has been run in the current kernel.
result

NameError: name 'result' is not defined

In [1]:
# Decode the optimizer's flat vector into the structure matrix:
# rows = order, delay, exponent; one column per input plus the fed-back output.
optimM = np.empty(shape=(3, len(inputs) + 1), dtype=np.int32)
for index, value in enumerate(result.x):
    row = index % 3; col = index // 3
    optimM[row, col] = int(value)
print(optimM)

# NOTE(review): learning and verification signals both come from
# scaledDataFrames[0], so the "verification" plot is drawn on the training
# data - confirm whether a different data set was meant for verification.
u_learn, y_learn = prepareSignals(scaledDataFrames[0], inputs, output)
u_verif, y_verif = prepareSignals(scaledDataFrames[0], inputs, output)

# Cut the measured outputs to the rows covered by the regression matrix.
# Each signal is cut with its own length, so the cell stays correct if the
# verification data set differs from the learning one.
maxDelay, maxOrder = findMaxDelayAndOrder(optimM)
output_learn_cut = y_learn[maxDelay : y_learn.shape[0] - maxOrder]
output_verif_cut = y_verif[maxDelay : y_verif.shape[0] - maxOrder]

# Fit on the learning data and keep the one-step-ahead predictions.
A_learn = createModelMatrix(u_learn, optimM)
model = linear_model.LinearRegression().fit(A_learn, output_learn_cut)
print(model.coef_)
y_model_learn = model.predict(A_learn)

# Free-run simulation on the verification data: the window slides one row at
# a time and the last column of the newest row is replaced by the model's own
# prediction. This is the same procedure as evalModelScore, repeated here
# because the simulated trajectory is needed for the plot.
windowLength = maxOrder + maxDelay + 1
u_verif_wip = np.array(u_verif[0:windowLength], copy=True)

predictions = []
for step in range(len(output_verif_cut)):
    A_verif = createModelMatrix(u_verif_wip, optimM)
    output_model_verif = np.ravel(model.predict(A_verif))
    predictions.append(output_model_verif)

    nextRow = u_verif[windowLength + step : windowLength + step + 1]
    if nextRow.shape[0] == 0:
        # No more input rows: the simulation is complete.
        break

    u_verif_wip = np.vstack((u_verif_wip[1:], nextRow))
    u_verif_wip[-1][-1] = output_model_verif[-1]

model_outputs = np.concatenate(predictions) if predictions else np.array([])

verif_score = r2_score(output_verif_cut, model_outputs)
# `score` used to come from a second, identical simulation run through
# evalModelScore; reuse the value so the simulation runs only once.
score = verif_score
print(verif_score)

# plot data
fig, axs = plt.subplots(2,figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
axs[0].plot(output_learn_cut)
axs[0].plot(y_model_learn)
axs[0].set_title("Dane uczące")
axs[1].plot(output_verif_cut)
axs[1].plot(model_outputs)
axs[1].set_title("Dane weryfikacyjne")
# Default size for figures created after this point; the figure above keeps
# the size passed to plt.subplots.
plt.rcParams['figure.figsize'] = [15, 20]

NameError: name 'np' is not defined