In this project we apply different forecasting techniques to the given CPU data and analyze which of them provides the best accuracy, and why.

We will be using the following forecasting techniques:

1) Simple Moving Average

2) Weighted Moving average

3) Exponential Smoothing

4) Exponential Smoothing w/ trend

5) Linear Regression

6) Non-linear Regression

7) Neural Network

Importing all the required libraries

In [1]:
import pandas as pd
import numpy as np
import cPickle
import time
import pyprind
import plotly.plotly as py
import plotly.graph_objs as go

Initialising required parameters

In [2]:
# Directory holding both the input CSV and the pickle output of this notebook.
DATA_DIR = "D:\\course_related_data\\CSEE5690\\"

# Raw CPU statistics export that the following cells read.
src_file = DATA_DIR + "VM-CPU-Stats-1-Day.csv"

# Results file for this run; the timestamp in the name keeps earlier runs from
# being overwritten. open() is used instead of the Python-2-only file()
# constructor. The handle stays open across the forecasting cells (each one
# appends a pickle to it) and is released by the store_file.close() cell.
store_file = open(DATA_DIR + 'cpu_analytics_' + time.strftime("%Y%m%d-%H%M%S") + '.pkl', 'wb')

Read Data from csv file

In [3]:
# Load the CSV named by src_file into a DataFrame; all later cells read from cpu_data.
cpu_data = pd.read_csv(src_file)

In [4]:
# Summary statistics of the numeric columns.
# NOTE(review): the "Instance" column has no non-null values (count 0.0 in the
# output), which is presumably what triggers the percentile warning shown below.
cpu_data.describe()


Invalid value encountered in percentile



Unnamed: 0,Value,IntervalSecs,Instance
count,146004.0,146004.0,0.0
mean,298.213316,300.0,
std,628.43086,0.0,
min,3.0,300.0,
25%,48.0,300.0,
50%,106.0,300.0,
75%,296.0,300.0,
max,12986.0,300.0,


In [5]:
# Column names, dtypes, non-null counts and memory usage of the loaded frame.
cpu_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146004 entries, 0 to 146003
Data columns (total 9 columns):
Value           146004 non-null int64
Timestamp       146004 non-null object
MetricId        146004 non-null object
Unit            146004 non-null object
Description     146004 non-null object
Entity          146004 non-null object
EntityId        146004 non-null object
IntervalSecs    146004 non-null int64
Instance        0 non-null float64
dtypes: float64(1), int64(2), object(6)
memory usage: 10.0+ MB


In [6]:
# Distinct entities present in the data; each forecasting cell loops over these.
E_id_list = cpu_data['Entity'].unique()

# The measured series that every forecasting technique below is applied to.
cpu_freq = cpu_data['Value']

In [7]:
def _normalised_weights(raw_weights):
    """Return raw_weights as a float32 array rescaled so its entries sum to 1."""
    as_array = np.array(raw_weights, dtype=np.float32)
    return as_array / np.sum(as_array)


# Triangular weight profiles for the 7-, 9- and 11-sample weighted moving averages.
# NOTE(review): each profile is symmetric around the window centre, so the newest
# sample is weighted the same as the oldest -- confirm this is intended for forecasting.
weight_7 = _normalised_weights([1, 2, 3, 4, 3, 2, 1])
weight_9 = _normalised_weights([1, 2, 3, 4, 5, 4, 3, 2, 1])
weight_11 = _normalised_weights([1, 2, 3, 4, 5, 6, 5, 4, 3, 2, 1])

In [8]:
def _weight_trace(weights, colour, label):
    """Build one markers+lines trace showing a weight profile."""
    marker_style = {'color': colour, 'symbol': 104, 'size': "10"}
    return go.Scatter(y=weights, marker=marker_style, mode="markers+lines", name=label)


# One trace per weight profile, each in its own colour so they can be compared.
trace1 = _weight_trace(weight_9, 'red', 'Weight 9 array')
trace2 = _weight_trace(weight_11, 'green', 'Weight 11 array')
trace3 = _weight_trace(weight_7, 'blue', 'Weight 7 array')

# Assemble the figure and render it inline in the notebook.
data = go.Data([trace1, trace2, trace3])
layout = go.Layout(title="Three Different weight distribution for Weighted averaging")
figure = go.Figure(data=data, layout=layout)
py.iplot(figure)

# 1) Moving Average

In [9]:
# Simple moving average forecast.
# For every entity and every window size w, the forecast for that entity's k-th
# sample (k >= w) is the plain mean of its w preceding samples, so each result
# array has len(entity samples) - w entries.
#
# The windows are taken from the entity's OWN samples. Slicing the global
# cpu_freq series by row position would pull in other entities' readings
# whenever an entity's rows are not stored contiguously in the CSV.
MA_WINDOWS = (7, 9, 11)
mo_avg_by_window = {7: [], 9: [], 11: []}

bar = pyprind.ProgBar(len(E_id_list), stream=1)
for E_id in E_id_list:
    # This entity's readings, in file order, as a plain NumPy array.
    entity_values = cpu_freq[cpu_data['Entity'] == E_id].values
    n_samples = len(entity_values)
    for window in MA_WINDOWS:
        # max(..., 0): an entity with fewer samples than the window yields an
        # empty forecast instead of np.empty() raising on a negative length.
        x = np.empty(max(n_samples - window, 0))
        for k in range(window, n_samples):
            x[k - window] = np.mean(entity_values[k - window:k])
        mo_avg_by_window[window].append(x)
    bar.update()

# One list per window size; entry j belongs to E_id_list[j].
mo_avg_7 = mo_avg_by_window[7]
mo_avg_9 = mo_avg_by_window[9]
mo_avg_11 = mo_avg_by_window[11]

# First object written to the results file.
cPickle.dump((mo_avg_7, mo_avg_9, mo_avg_11), store_file, protocol=cPickle.HIGHEST_PROTOCOL)

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:48


# 2) Weighted Moving Average

In [10]:
# Weighted moving average forecast.
# Same scheme as the simple moving average, except that the w preceding samples
# are combined with the normalised weight profile of matching length
# (weight_7 / weight_9 / weight_11) instead of a plain mean.
#
# The windows are taken from the entity's OWN samples. Slicing the global
# cpu_freq series by row position would pull in other entities' readings
# whenever an entity's rows are not stored contiguously in the CSV.
weights_by_window = {7: weight_7, 9: weight_9, 11: weight_11}
wei_mo_avg_by_window = {7: [], 9: [], 11: []}

bar = pyprind.ProgBar(len(E_id_list), stream=1)
for E_id in E_id_list:
    # This entity's readings, in file order, as a plain NumPy array.
    entity_values = cpu_freq[cpu_data['Entity'] == E_id].values
    n_samples = len(entity_values)
    for window in [7, 9, 11]:
        weight_ = weights_by_window[window]
        # max(..., 0): an entity with fewer samples than the window yields an
        # empty forecast instead of np.empty() raising on a negative length.
        x = np.empty(max(n_samples - window, 0))
        for k in range(window, n_samples):
            x[k - window] = np.sum(entity_values[k - window:k] * weight_)
        wei_mo_avg_by_window[window].append(x)
    bar.update()

# One list per window size; entry j belongs to E_id_list[j].
wei_mo_avg_7 = wei_mo_avg_by_window[7]
wei_mo_avg_9 = wei_mo_avg_by_window[9]
wei_mo_avg_11 = wei_mo_avg_by_window[11]

# Appended to the results file after the simple moving averages.
cPickle.dump((wei_mo_avg_7, wei_mo_avg_9, wei_mo_avg_11), store_file, protocol=cPickle.HIGHEST_PROTOCOL)

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:01:22


# 3) Exponential Smoothing

In [11]:
# Exponential smoothing forecast for alpha = 0.1, 0.2, ..., 0.9.
# Recurrence per entity (x = forecast, A = that entity's actual readings):
#   x_0 = A_0
#   x_t = x_{t-1} + alpha * (A_{t-1} - x_{t-1})
# ES_pred[a - 1][j] is the forecast array for alpha = a / 10 and E_id_list[j].

# Each entity's readings do not depend on alpha, so extract them once instead
# of re-filtering the whole frame for every alpha. Using the entity's own
# series also guarantees that A_{t-1} is this entity's previous sample rather
# than whatever row happens to precede it in the CSV.
entity_series = [cpu_freq[cpu_data['Entity'] == E_id].values for E_id in E_id_list]

ES_pred = []
bar = pyprind.ProgBar(9*len(E_id_list), stream=1)
for a in range(1,10):
    alpha = a / 10.0
    E_id_array = []
    for actual in entity_series:
        x = np.empty(len(actual))
        # Seed the forecast with the first observation.
        x[0] = actual[0]
        for t in range(1, len(actual)):
            x[t] = x[t-1] + alpha * (actual[t-1] - x[t-1])
        E_id_array.append(x)
        bar.update()

    ES_pred.append(E_id_array)

# Appended to the results file after the moving-average results.
cPickle.dump(ES_pred, store_file, protocol=cPickle.HIGHEST_PROTOCOL)

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:46


# 4) Exponential Smoothing with trend

In [12]:
# Exponential smoothing with trend, evaluated on the 9 x 9 grid
# alpha, delta in {0.1, ..., 0.9}.
# Recurrence per entity (x = trend-adjusted forecast, A = actual readings):
#   x_0 = A_0,  T_0 = 0
#   F_t = x_{t-1} + alpha * (A_{t-1} - x_{t-1})    smoothed forecast
#   T_t = T_{t-1} + delta * (F_t - x_{t-1})        trend estimate
#   x_t = F_t + T_t
# EST_pred[(a - 1) * 9 + (d - 1)][j] is the forecast array for
# alpha = a / 10, delta = d / 10 and E_id_list[j].

# Each entity's readings depend on neither alpha nor delta, so extract them
# once instead of re-filtering the whole frame for all 81 parameter pairs.
# Using the entity's own series also guarantees that A_{t-1} is this entity's
# previous sample rather than whatever row happens to precede it in the CSV.
entity_series = [cpu_freq[cpu_data['Entity'] == E_id].values for E_id in E_id_list]

EST_pred = []
bar = pyprind.ProgBar(9*9*len(E_id_list), stream=1)
for a in range(1,10):
    alpha = a / 10.0
    for d in range(1,10):
        delta = d / 10.0
        E_id_array = []
        for actual in entity_series:
            x = np.empty(len(actual))
            # Seed the forecast with the first observation; start with no trend.
            x[0] = actual[0]
            T = 0.0
            for t in range(1, len(actual)):
                F = x[t-1] + alpha * (actual[t-1] - x[t-1])
                T = T + delta * (F - x[t-1])
                x[t] = F + T
            E_id_array.append(x)
            bar.update()
        EST_pred.append(E_id_array)

# Last object appended to the results file.
cPickle.dump(EST_pred, store_file, protocol=cPickle.HIGHEST_PROTOCOL)

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:06:49


In [13]:
# Release the results file opened in the parameter cell; every forecasting cell
# above has appended its pickle to it by this point.
store_file.close()

In [12]:
# Read back the first two pickled objects from a results file of an earlier run
# (the timestamp in the name is fixed, so this is not the file written above).
# NOTE(review): presumably these are the simple and weighted moving-average
# tuples, going by the dump order in this notebook -- confirm against the run
# that produced the file. Any further objects in the file are not read here.
# NOTE: unpickling can execute arbitrary code; only load files this notebook wrote.
# The with-block closes the file even if one of the loads raises.
with open('D:\\course_related_data\\CSEE5690\\cpu_analytics_20160831-143731.pkl', 'rb') as f:
    value1 = cPickle.load(f)
    value2 = cPickle.load(f)