In [12]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize, scale, minmax_scale, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from sklearn.model_selection import RandomizedSearchCV
from pandas.plotting import register_matplotlib_converters
import matplotlib.pyplot as plt
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from scipy.stats import gaussian_kde
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel

In [111]:
def create_data(data_X, data_Y, number_X, number_Y):
    '''
    Basic idea:
    create several sub-dataframes and concat them together.
    For instance:
    Samples : 1,2,3,4,5,6,7
    number X = 3
    number Y = 2
    
    for measurements, create data frames:
    1    |    2    |    3
    2    |    3    |    4
    3    |    4    |    5
    
    Then concat them together, we can get 3 new sample measurements:
    
    Sample 1: 1, 2, 3
    Sample 2: 2, 3, 4
    Sample 3: 3, 4, 5
    '''
    
    # to get the coloumn names
    features_name = [col for col in data_X]
    target_name = [col for col in data_Y]
    X_shape = data_X.shape
    index_X = data_X.index
    number_samples = X_shape[0]
    
    number_to_delete = number_X + number_Y -1
    number_measurements = number_samples - number_to_delete
    temp_X = []
    rst_df = None
    rst_index = None
    # create number_X dataframes and concat them together
    # for each dataframe, we should delete number_X -1 entries
    # create the delete lish
    for i in range(number_X):
        data_copy = data_X.copy()
        '''
        del_list contains the index of samples which should be deleted for each sub-dataframes
        '''
        del_list = []
        
        for j in range(i):
            del_list.append(index_X[j])
        for k in range(number_to_delete - i):
            del_list.append(index_X[number_samples - k -1])
        if i == 0:
            # the first sub-dataframe, create it directly
            rst_df = data_copy.drop(del_list, axis=0)
            rst_df.columns = [j+str(i) for j in features_name]
            rst_index = rst_df.index
        else:
            # concat the new sub-dataframe with the previous result
#             print(del_list)
            temp_df = data_copy.drop(del_list, axis=0)
            temp_df.index = rst_index
            temp_df.columns = [j+str(i) for j in features_name]
            rst_df = pd.concat([rst_df, temp_df], axis=1)
#             print(rst_df.shape)
            
    for i in range(number_Y):
        target_copy = data_Y.copy()
        del_list = []
        for j in range(number_X + i):
            del_list.append(index_X[j])
        for k in range(number_Y -1 - i):
            del_list.append(index_X[number_samples - k -1])
#         print(del_list)
        if i == 0:
            tgt_df = target_copy.drop(del_list, axis=0)
            tgt_df.columns = [j+str(i) for j in target_name]
            tgt_index = tgt_df.index
        else:
#             print(del_list)
            temp_df = target_copy.drop(del_list, axis=0)
            temp_df.index = tgt_index
            temp_df.columns = [j+str(i) for j in target_name]
            tgt_df = pd.concat([tgt_df, temp_df], axis=1)
            print(tgt_df.shape)
    return rst_df, tgt_df

In [13]:
# input data should be np array, only outliers in X will be considered
def OutlierRemoval(InputX, InputY, Threhold):
    # combine X and Y
    combine_data = np.c_[InputX, InputY]
    
    # remove outlier in X
    removed_data = combine_data[np.all(np.abs(np.delete(combine_data, -1, 1)) < Threhold ,axis=1)]
    
    # spilit removed_data into X and Y
    removed_x = np.delete(removed_data, -1, 1)
    removed_y = removed_data[:,[-1]]
    
    return removed_x, removed_y

# All input data should be np array
def CalculateNMAE(PredictData, TestData):
    return mean_absolute_error(PredictData, TestData)/TestData.mean()


# feature selection, only return X since we don't change Y
def TreeBasedSelection(InputX, InputY, FeatureNumber):
    # create and fit selector
    clf = ExtraTreesRegressor()
    clf = clf.fit(InputX, InputY)
    
    # only number of sorted features will be selected, and we disable threshold
    model = SelectFromModel(clf, prefit=True, max_features = FeatureNumber, threshold=-np.inf)
    OutputX = model.transform(InputX)
    return OutputX

# standardize column value
def ColumnStandardize(Input):
    Scaler = StandardScaler()
    return Scaler.fit_transform(Input)

In [97]:
'''
To defince the dataset we use
'''
# fileName_X = './VoD_Periodic_2017/X.csv'
# fileName_Y = './VoD_Periodic_2017/Y.csv'
# fileName_X = './KV_flash/X.csv'
# fileName_Y = './KV_flash/Y.csv'
fileName_X = '../X.csv'
fileName_Y = '../Y.csv'
# fileName_X = './VoD_flash/X.csv'
# fileName_Y = './VoD_flash/Y.csv'

In [98]:
'''
Read data from csv file.
X and Y are raw data
Y_notime and Y_notime are data without timestamp
X_features are a list which contains the name of all of the features in X
'''
X = pd.read_csv(fileName_X)
Y = pd.read_csv(fileName_Y)
X_notime = X.drop('TimeStamp',axis=1)
Y_notime = Y.drop('TimeStamp',axis=1)
timeIndex_x=pd.to_datetime(X['TimeStamp'])
timeIndex_y=pd.to_datetime(Y['TimeStamp'])
X.index=timeIndex_x
Y.index=timeIndex_y
X_notime.index = timeIndex_x
Y_notime.index = timeIndex_y
X_features = [col for col in X_notime]
np.random.seed(0)

In [14]:
# change from dataframe to np array
X_npArray = X_notime.to_numpy()
Y_npArray = Y_notime.to_numpy()

X_Standard = ColumnStandardize(X_npArray)

# remove outlier with threshold 100
X_NoOutlier, Y_NoOutlier = OutlierRemoval(X_Standard, Y_npArray, 100)

X_FeatureSelection = TreeBasedSelection(X_NoOutlier, Y_NoOutlier, 16)



In [29]:
X_train, X_test, Y_train, Y_test = train_test_split(X_FeatureSelection, Y_NoOutlier, test_size=0.3, random_state=1)

19363
19363
[[-0.86472227 -1.11893714 -1.11894039 ... -0.91523354 -1.00006808
  59.3928    ]
 [-1.00679757 -0.60290643 -0.60290889 ... -0.58983232 -0.43823877
  58.3325    ]
 [-0.71412245 -0.63531416 -0.63531667 ... -0.60188422 -0.60784762
  57.6333    ]
 ...
 [-1.65750247 -1.72471232 -1.7247165  ... -1.56603597 -1.62550071
  52.4355    ]
 [-1.67739301 -1.74714844 -1.74715265 ... -1.51782838 -1.62550071
  52.6049    ]
 [-1.65466096 -1.71723362 -1.71723778 ... -1.54193218 -1.6042996
  52.1467    ]]


In [99]:
'''
Data Information
'''

X_shape = X_notime.shape
print("X_shape is " + str(X_shape))

X_shape is (3600, 12)


In [110]:
for i in range(3):
#     target_copy = data_Y.copy()
    del_list = []
    for j in range(3 + i):
        del_list.append(j)
    for k in range(3 -1 - i):
        del_list.append(10 - k -1)
    print(del_list)

[0, 1, 2, 9, 8]
[0, 1, 2, 3, 9]
[0, 1, 2, 3, 4]


In [116]:
A, B = create_data(X_notime, Y_notime, 3, 2)

[Timestamp('1970-01-01 00:00:01.409232371'), Timestamp('1970-01-01 00:00:01.409235971'), Timestamp('1970-01-01 00:00:01.409235970'), Timestamp('1970-01-01 00:00:01.409235969')]
(3596, 24)
[Timestamp('1970-01-01 00:00:01.409232371'), Timestamp('1970-01-01 00:00:01.409232372'), Timestamp('1970-01-01 00:00:01.409235971'), Timestamp('1970-01-01 00:00:01.409235970')]
(3596, 36)
(3596, 2)


In [117]:
Y_notime

Unnamed: 0_level_0,DispFrames
TimeStamp,Unnamed: 1_level_1
1970-01-01 00:00:01.409232371,18.510001
1970-01-01 00:00:01.409232372,14.390000
1970-01-01 00:00:01.409232373,13.390000
1970-01-01 00:00:01.409232374,16.659999
1970-01-01 00:00:01.409232375,22.049999
1970-01-01 00:00:01.409232376,19.120001
1970-01-01 00:00:01.409232377,14.170000
1970-01-01 00:00:01.409232378,13.000000
1970-01-01 00:00:01.409232379,16.049999
1970-01-01 00:00:01.409232380,21.659999


In [118]:
B

Unnamed: 0_level_0,DispFrames0,DispFrames1
TimeStamp,Unnamed: 1_level_1,Unnamed: 2_level_1
1970-01-01 00:00:01.409232374,16.659999,22.049999
1970-01-01 00:00:01.409232375,22.049999,19.120001
1970-01-01 00:00:01.409232376,19.120001,14.170000
1970-01-01 00:00:01.409232377,14.170000,13.000000
1970-01-01 00:00:01.409232378,13.000000,16.049999
1970-01-01 00:00:01.409232379,16.049999,21.659999
1970-01-01 00:00:01.409232380,21.659999,19.120001
1970-01-01 00:00:01.409232381,19.120001,14.170000
1970-01-01 00:00:01.409232382,14.170000,13.000000
1970-01-01 00:00:01.409232383,13.000000,16.659999


In [119]:
A

Unnamed: 0_level_0,runq-sz0,%%memused0,proc/s0,cswch/s0,all_%%usr0,ldavg-10,totsck0,pgfree/s0,plist-sz0,file-nr0,...,proc/s2,cswch/s2,all_%%usr2,ldavg-12,totsck2,pgfree/s2,plist-sz2,file-nr2,idel/s2,tps2
TimeStamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1970-01-01 00:00:01.409232371,57,13.99,0.0,65212.0,96.58,63.17,439,168695.0,792,2640,...,14.0,63519.0,96.88,63.17,448,160585.0,810,2640,42.0,0.0
1970-01-01 00:00:01.409232372,69,13.99,4.0,67083.0,97.08,63.17,444,163626.0,796,2640,...,0.0,65632.0,97.54,63.17,448,156157.0,810,2640,14.0,0.0
1970-01-01 00:00:01.409232373,62,14.07,14.0,63519.0,96.88,63.17,448,160585.0,810,2640,...,21.0,62134.0,96.87,62.83,457,167108.0,827,2640,50.0,17.0
1970-01-01 00:00:01.409232374,78,14.07,0.0,65632.0,97.54,63.17,448,156157.0,810,2640,...,9.0,56859.0,97.04,62.83,462,156169.0,836,2640,31.0,0.0
1970-01-01 00:00:01.409232375,67,14.13,21.0,62134.0,96.87,62.83,457,167108.0,827,2640,...,29.0,60452.0,97.12,62.83,472,154543.0,855,2640,46.0,2.0
1970-01-01 00:00:01.409232376,77,14.16,9.0,56859.0,97.04,62.83,462,156169.0,836,2640,...,8.0,61419.0,97.33,62.83,475,161833.0,863,2640,26.0,0.0
1970-01-01 00:00:01.409232377,76,14.24,29.0,60452.0,97.12,62.83,472,154543.0,855,2640,...,10.0,63284.0,97.00,62.83,481,158087.0,873,2640,29.0,0.0
1970-01-01 00:00:01.409232378,67,14.25,8.0,61419.0,97.33,62.83,475,161833.0,863,2640,...,0.0,61254.0,97.21,63.89,481,164472.0,873,2640,18.0,11.0
1970-01-01 00:00:01.409232379,71,14.30,10.0,63284.0,97.00,62.83,481,158087.0,873,2640,...,21.0,62336.0,96.96,63.89,479,169497.0,873,2640,26.0,10.0
1970-01-01 00:00:01.409232380,84,14.32,0.0,61254.0,97.21,63.89,481,164472.0,873,2640,...,0.0,61240.0,97.50,63.89,479,166856.0,873,2640,13.0,0.0


In [120]:
X_notime

Unnamed: 0_level_0,runq-sz,%%memused,proc/s,cswch/s,all_%%usr,ldavg-1,totsck,pgfree/s,plist-sz,file-nr,idel/s,tps
TimeStamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1970-01-01 00:00:01.409232371,57,13.99,0.0,65212.0,96.58,63.17,439,168695.0,792,2640,22.0,0.0
1970-01-01 00:00:01.409232372,69,13.99,4.0,67083.0,97.08,63.17,444,163626.0,796,2640,36.0,0.0
1970-01-01 00:00:01.409232373,62,14.07,14.0,63519.0,96.88,63.17,448,160585.0,810,2640,42.0,0.0
1970-01-01 00:00:01.409232374,78,14.07,0.0,65632.0,97.54,63.17,448,156157.0,810,2640,14.0,0.0
1970-01-01 00:00:01.409232375,67,14.13,21.0,62134.0,96.87,62.83,457,167108.0,827,2640,50.0,17.0
1970-01-01 00:00:01.409232376,77,14.16,9.0,56859.0,97.04,62.83,462,156169.0,836,2640,31.0,0.0
1970-01-01 00:00:01.409232377,76,14.24,29.0,60452.0,97.12,62.83,472,154543.0,855,2640,46.0,2.0
1970-01-01 00:00:01.409232378,67,14.25,8.0,61419.0,97.33,62.83,475,161833.0,863,2640,26.0,0.0
1970-01-01 00:00:01.409232379,71,14.30,10.0,63284.0,97.00,62.83,481,158087.0,873,2640,29.0,0.0
1970-01-01 00:00:01.409232380,84,14.32,0.0,61254.0,97.21,63.89,481,164472.0,873,2640,18.0,11.0
