# Binary Segmentation - converts a frequency histogram CSV into time segments

In [61]:
### Import necessary modules
import numpy as np
import matplotlib.pylab as plt
import ruptures as rpt
import os
import csv

In [62]:
##### Set parameters
# Participant and dataset identifiers; both are spliced into the input and
# output file paths further down.
PID = 'p4'
dataset = '3'

# Synthetic piecewise-constant demo signal. NOTE: `signal` is reassigned when
# the words CSV is loaded below, so this generated data is not what gets
# segmented.
n = 500  # number of samples
dim = 3  # number of dimensions per sample
n_bkps = 3  # number of change points in the synthetic signal
sigma = 5  # noise standard deviation
signal, bkps = rpt.pw_constant(n, dim, n_bkps, noise_std=sigma)

In [63]:
### Open file
# Load the words CSV but drop the header row: we don't care about the word
# itself, just the likelihood of finding the word in a document.
words_path = "../data/Dataset_" + dataset + "/User Words/" + PID + ".csv"
# A context manager guarantees the handle is closed even if parsing fails
# (the bare open() used previously was never closed).
with open(words_path) as file:
    # ndmin=2 keeps the array two-dimensional even when the CSV holds a single
    # data row, so the row/column indexing done on `signal` later stays valid.
    signal = np.loadtxt(file, delimiter=",", skiprows=1, ndmin=2)
print(np.shape(signal))
#(numEvents, numWordsInVocab)

In [64]:
# Arbitrary heuristic for choosing the breakpoint count; could be modified in
# the future. The count grows with the width of the signal (the number of
# values in its first row).
column_count = len(signal[0])
if column_count <= 100:
    number_of_breakpoints = 3
elif column_count <= 500:
    number_of_breakpoints = 6
else:
    number_of_breakpoints = 11
print("segmenting into", number_of_breakpoints, "segments")

In [None]:
### Generate segments
# Change point detection with binary segmentation.
# Segment model to use; other options: "l1", "rbf", "linear", "normal", "ar"
model = "l2"
algo = rpt.Binseg(model=model, jump=1)
algo.fit(signal)
# Ask for exactly `number_of_breakpoints` change points.
my_bkps = algo.predict(n_bkps=number_of_breakpoints)

# Show the detected breakpoint indices.
print(my_bkps)

In [None]:
### Convert breakpoint indices into times:
# For every breakpoint, look up row `bkp - 1` of the signal matrix and take its
# first column (the time of the event), scaled down by 10.
# NOTE(review): the division by 10 presumably converts time units - confirm.
timeArr = [signal[bkp - 1][0] / 10 for bkp in my_bkps]
print(timeArr)

In [None]:
### Export segments to CSV
# Filename prefix for each dataset, looked up by the 1-based dataset number.
startText = ['Arms', 'Terrorist', 'Disappearance']
# PID[1:] keeps everything after the first character of PID, so multi-digit
# participant ids (e.g. 'p10') no longer collapse onto a single digit.
outFilename = (
    '../data/Dataset_' + dataset + '/Segmentation/'
    + startText[int(dataset) - 1] + '_P' + PID[1:] + '_'
    + str(number_of_breakpoints) + '_Prov_Segments.csv'
)
os.makedirs(os.path.dirname(outFilename), exist_ok=True)

header = ['id', 'start', 'end', 'length', 'bkps']
# The context manager closes (and flushes) the file even if a row fails to
# write; previously an exception would have leaked the open handle.
with open(outFilename, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)

    # The first segment starts at time 0; every later segment starts where the
    # previous one ended.
    # NOTE(review): only `number_of_breakpoints` rows are written. ruptures'
    # predict() appears to append the total sample count as a final entry, so
    # timeArr likely holds one more value and the last segment is never
    # exported - confirm whether that is intended.
    start = 0
    for i in range(number_of_breakpoints):
        end = timeArr[i]
        writer.writerow([i, start, end, end - start, my_bkps[i]])
        start = end
