In [29]:
import xgboost
import os
import xgboost_util_py3 as xgboost_util
import math

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

random.seed(0)

NUMBER_OF_TREES = 50
WINDOW_SIZE = 5

#TEST_NAME = 'PageRank'
#TEST_NAME = 'KMeans'
TEST_NAME = 'SGD'
#TEST_NAME = 'tensorflow'
#TEST_NAME = 'web_server'

TARGET_COLUMN = 'flow_size'

TRAINING_PATH = '../data/ml/' + TEST_NAME + '/training/'
TEST_PATH = '../data/ml/' + TEST_NAME + '/test/'
VALIDATION_PATH = '../data/ml/' + TEST_NAME + '/validation/'

training_files = [os.path.join(TRAINING_PATH, f) for f in os.listdir(TRAINING_PATH)]
test_files = [os.path.join(TEST_PATH, f) for f in os.listdir(TEST_PATH)]
validation_files = [os.path.join(VALIDATION_PATH, f) for f in os.listdir(VALIDATION_PATH)]


In [30]:
def calculate_scaling(training_paths):
    scaling = {}
    #calculate scaling factors
    for f in training_paths:
        df = pd.read_csv(f, index_col=False)

        for column in df.columns:
            if column not in scaling:
               scaling[column] = 0.
            scaling[column] = max(scaling[column], float(df[column].max().split()[-1]))
    return scaling


def prepare_files(files, window_size, scaling, target_column='flow_size'):
    result = []

    for f in files:
        df = pd.read_csv(f, index_col=False)

        print(df)
        df = df.drop("index", axis=1)
        

        df = df.apply((lambda x: resize(x, scaling)), axis=0)
        flow_size = df[target_column]
        df[target_column] = flow_size
        #extend the window
        columns = list(df)
        final_df = df.copy()
        for sample_num in range(1, window_size):
            shifted = df.shift(sample_num)
            shifted.columns = map(lambda x: x+str(sample_num), shifted.columns)
            final_df = concat([shifted, final_df], axis=1)

        final_df = final_df.fillna(0)
        final_df = final_df.drop(target_column, axis=1)

        result.append((final_df, flow_size))

    return result


scaling = calculate_scaling(training_files)
#prepare_files(training_files, WINDOW_SIZE, scaling, TARGET_COLUMN)

In [31]:
#scaling = xgboost_util.calculate_scaling(training_files)
#data = xgboost_util.prepare_files(training_files, WINDOW_SIZE, scaling, TARGET_COLUMN)
data = scaling

inputs, outputs = xgboost_util.make_io(data)

# fit model no training data
param = {
    'num_epochs' : NUMBER_OF_TREES,
    'max_depth' : 10,
    'objective' : 'reg:linear',
    'booster' : 'gbtree',
    'base_score' : 2,
    'silent': 1,
    'eval_metric': 'mae'
}

training = xgboost.DMatrix(inputs, outputs, feature_names = data[0][0].columns)
print(len(outputs))
print('Training started')
model = xgboost.train(param, training, param['num_epochs'])

AttributeError: 'str' object has no attribute 'as_matrix'

In [None]:
def print_performance(files, write_to_simulator=False):
    real = []
    predicted = []
    for f in files:
        data = xgboost_util.prepare_files([f], WINDOW_SIZE, scaling, TARGET_COLUMN)
        inputs, outputs = xgboost_util.make_io(data)

        y_pred = model.predict(xgboost.DMatrix(inputs, feature_names = data[0][0].columns))
        pred = y_pred.tolist()

        real += outputs
        predicted += pred

    xgboost_util.print_metrics(real, predicted)

print 'TRAINING'
print_performance(training_files)
print

print 'TEST'
print_performance(test_files)
print

print 'VALIDATION'
print_performance(validation_files)