In [1]:
import nag4py.g13 as g13
import pandas as pd
import numpy as np
import nag4py.util as util
import matplotlib.pyplot as plt
import nag4py.e01 as e01

### Read in the data from the JSON file

In [2]:
# Load the dataset from the local file 'yahoo.json'; later cells use its
# index as the dates and its `Close` column as the prices.
data = pd.read_json('yahoo.json')

### Extract the closing prices and the dates, both sorted by date

In [3]:
# Closing prices ordered by the index (the dates), as a NumPy array.
price = data.Close.sort_index().values
# The index in the same sorted order. `Index.order()` was deprecated and then
# removed from pandas; `Index.sort_values()` is its replacement.
date = data.index.sort_values()

### Determine all the dates between the first and last data points

In [4]:
# Every calendar date from the first to the last observation
# (pd.date_range uses a daily frequency by default).
all_dates = pd.date_range(start=date[0], end=date[-1])

### Reindex the data to include the missing dates

In [5]:
# Re-align the data onto the full date range; dates that were absent from
# `data` become rows filled with NaN.
all_data = data.reindex(index=all_dates)

### Determine the missing dates

In [6]:
# Dates that are in the full range but not in the original index,
# i.e. the dates whose prices have to be interpolated.
missing_dates = all_dates.difference(date)

### Define a function to reorder the data so that the points closest to `xi` come first

In [7]:
def reorder(x, y, xi, n_candidates=14, n_points=4):
    """Return the valid data points nearest to ``xi``, closest first.

    Candidates are visited in order of increasing ``|x - xi|``. A candidate
    whose ``y`` value is null is skipped (not collected). The scan ends early
    when it runs into either end of the data (index ``0`` or ``len(x) - 1``):

    * if more than one point has been collected once the end point has been
      processed, the scan stops immediately;
    * otherwise it stops right after the next interior (non-end) candidate
      has been processed.

    Parameters
    ----------
    x : numpy.ndarray
        Abscissae of the known data; must support ``x - xi`` and ``argsort``.
    y : sequence
        Ordinates matching ``x`` position by position; may contain nulls.
    xi : scalar
        Location at which a value is wanted.
    n_candidates : int, optional
        Maximum number of nearest candidates to examine (default 14). When
        fewer points are available, all of them are examined.
    n_points : int, optional
        Maximum number of points returned (default 4); kept small to reduce
        oscillations in the interpolant built from them.

    Returns
    -------
    tuple of (list, list)
        The selected ``x`` values and the corresponding ``y`` values.
    """
    nearest_first = abs(x - xi).argsort()
    last_index = len(x) - 1
    sorted_x = []
    sorted_y = []
    stop_next = False

    # Slicing (rather than indexing 0..13) keeps this safe when fewer than
    # ``n_candidates`` points exist.
    for idx in nearest_first[:n_candidates]:
        if not pd.isnull(y[idx]):
            sorted_x.append(x[idx])
            sorted_y.append(y[idx])

        if idx == 0 or idx == last_index:
            # Reached an end of the data set.
            if len(sorted_x) > 1:
                break
            stop_next = True
        elif stop_next:
            break

    return sorted_x[:n_points], sorted_y[:n_points]

### Define a function to interpolate the data at the missing points

In [8]:
def interpolate(x, y, xi):
    """Interpolate the data ``(x, y)`` at ``xi`` with NAG's Aitken routine.

    The points nearest to ``xi`` are selected with ``reorder`` and handed to
    ``e01.nag_1d_aitken_interp``.

    Parameters
    ----------
    x : numpy.ndarray
        Abscissae of the known data.
    y : sequence
        Ordinates matching ``x``; may contain nulls (``reorder`` skips them).
    xi : float
        Point at which to interpolate.

    Returns
    -------
    float
        The last element of the work array filled by the NAG routine, which
        is taken as the interpolated value at ``xi``.
    """
    x, y = reorder(x, y, xi)
    # The routine is given n + 1 data points.
    n = len(x) - 1
    a = np.array(x, dtype=float)
    b = np.array(y, dtype=float)
    # Work array of length n(n+1)/2. Floor division keeps the size an int:
    # under Python 3 ``/`` yields a float, which np.zeros rejects.
    c = np.zeros(n * (n + 1) // 2)
    fail = util.noisy_fail()
    e01.nag_1d_aitken_interp(n, a, b, c, xi, fail)
    return c[-1]

### Convert the dates to floats and collect the interpolated prices for the missing dates in a list

In [9]:
# Known dates as floats, so they can be used as interpolation abscissae.
x = date.values.astype(float)
# Convert the missing dates once, instead of re-converting the whole array
# on every loop iteration.
missing_x = missing_dates.values.astype(float)
# One interpolated price per missing date, in the order of `missing_dates`.
c = [interpolate(x, price, xi) for xi in missing_x]

### Insert the interpolated values into the data

In [10]:
# Label each interpolated price with the date it was computed for, then fill
# the gaps in `Close` by date. Writing to the rows yielded by `iterrows()` is
# not guaranteed to reach the DataFrame (the row may be a copy), and popping
# from `c` made this cell impossible to re-run.
interpolated_close = pd.Series(c, index=missing_dates, dtype=float)
all_data['Close'] = all_data['Close'].fillna(interpolated_close)

### Plot the original and modified datasets together for comparison

In [11]:
# Original series as a line, with the re-indexed series overlaid as crosses.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(date, price)
ax.plot(all_data.index, all_data['Close'], 'x')
plt.show()

### Calculate the change points for the modified dataset

In [12]:
# NOTE(review): this first assignment is overwritten by the very next line,
# so only the integer code below reaches the NAG call.
ctype = g13.Nag_TS_ChangeType()
# NOTE(review): 2991 is presumably the integer value of one member of
# Nag_TS_ChangeType -- confirm against the NAG headers and prefer the named
# constant if nag4py exposes it.
ctype = 2991
# Length of the series being analysed.
n = len(all_data['Close'])
# Passed as `beta` (presumably the penalty per change point -- TODO confirm).
beta = 1 * np.log(n)
# Passed as `minss` (presumably the minimum segment size -- TODO confirm).
minss = 2
# One-element parameter array; np.ndarray(1) is uninitialised, so the value
# is set explicitly on the next line.
param = np.ndarray(1)
param[0] = 0.1
# Arrays handed to the routine; the following cell reads ntau[0], the leading
# entries of tau, and consecutive pairs from sparam.
ntau = np.zeros(n, dtype=int)
tau = np.zeros(n, dtype=int)
sparam = np.zeros(2*n+2)
fail = util.noisy_fail()
g13.nag_tsa_cp_pelt(ctype, n, np.array(all_data['Close'].values), beta, minss, param, ntau, tau, sparam, fail)

### Get the change points, means and standard deviations

In [13]:
# Number of entries of `tau` (and of parameter pairs in `sparam`) in use.
m = ntau[0]

# `sparam` is read as consecutive pairs: even slots go to `mean`, odd slots
# to `std`.
segment_params = sparam[:2 * m]
mean = list(segment_params[0::2])
std = list(segment_params[1::2])

# Segment start positions: 0 followed by the reported change points, with the
# final reported entry dropped.
change_points = [0] + list(tau[:m])
change_points.pop()

# Positions for the step plot: the segment starts plus the last index of the
# series. Built as a NEW list -- a plain assignment would alias
# `change_points`, so the final index would be added to it as well.
change_points_std = change_points + [len(all_data['Close'].values) - 1]

# Repeat the last segment's values so each list has one entry per position in
# `change_points_std`.
std.append(std[-1])
mean.append(mean[-1])

### Plot the results

In [14]:
# Series in green, a red vertical line at every entry of `change_points`,
# and the per-segment `std` values drawn as a step function.
ax = plt.gca()
ax.plot(all_dates, all_data['Close'].values, 'g')
for cp_index in change_points:
    ax.axvline(all_dates[cp_index], color='r')
ax.step(all_dates[change_points_std], std, where='post')
plt.show()