In [None]:
### Import libraries
import json
import re
import numpy as np
import os
import csv
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
# Fetch the NLTK 'stopwords' corpus, which stopwords.words('english') reads
# later in this notebook. NOTE(review): presumably needs network access the
# first time it runs on a machine - confirm for offline environments.
nltk.download('stopwords')

In [None]:
### Load data
# Run configuration: participant id, dataset number (1-based index into
# startText) and the breakpoint count that appears in the segmentation file name.
PID = 1
dataset = 1
numBreakpoints = 11
startText = ['Arms', 'Terrorist', 'Disappearance']
filename = '../data/Dataset_' + str(dataset) + '/User Words/p' + str(PID) + '.csv'
breakpointsFile = '../data/Dataset_' + str(dataset) + '/Segmentation/' + startText[dataset - 1] + '_P' + str(PID) +'_'+str(numBreakpoints)+ '_Prov_Segments.csv'

print(">>> getting words in "+filename)
# The with-statement closes the file on exit, so no explicit close() is needed.
with open(filename, newline='') as f:
    data = list(csv.reader(f))
if not data:
    # Fail with a clear message instead of an IndexError on data[0] below.
    raise ValueError("no rows found in " + filename)
# Row 0 is treated as the vocabulary header; the remaining rows are events.
print(len(data[0]),"words in vocabulary")
print(len(data))

print(">>> getting the breakpoint values from segmentation:",breakpointsFile)
with open(breakpointsFile, newline='') as f2:
    # Read the fifth column (index 4) as integers, skipping one header row.
    # genfromtxt returns a 0-d array when the file has a single data row;
    # atleast_1d keeps `breakpoints` iterable and len()-able in that case.
    breakpoints = np.atleast_1d(
        np.genfromtxt(f2, delimiter=',',dtype=int,usecols=4,skip_header=1)
    )
print(breakpoints)

In [None]:
### Aggregate histograms
print(np.shape(data))
# Convert every event row to floats. Row 0 is skipped because it holds the
# vocabulary words (the header), not numeric values.
for row in data[1:]:
    row[:] = [float(value) for value in row]

# Build one summed histogram per segment. A segment covers the rows from
# currentDoc up to, but not including, the breakpoint value.
aggregateHist = []
currentDoc = 1
for segment in breakpoints:
    # Copy the first row of the segment: summing into data[currentDoc] itself
    # would overwrite the source data and make re-running this cell add the
    # same rows a second time.
    currentHist = list(data[currentDoc])
    # Number of rows merged into this segment (one increment per row).
    numDocs = 1
    for docPos in range(currentDoc + 1, segment):
        for word, count in enumerate(data[docPos]):
            currentHist[word] += count
        numDocs += 1
    # The histogram is kept as a raw sum; it is not divided by numDocs.
    aggregateHist.append(currentHist)
    currentDoc = segment
print(len(aggregateHist))

In [None]:
### Identify top words
# How many candidate words to keep per segment before any later filtering.
NUM_CANDIDATES = 25

topFivesIndices = []
for hist in aggregateHist:
    # Column 0 holds the event time rather than a word count, so only columns
    # 1.. are ranked. sorted() is stable, so equal counts keep the lower column
    # index first, which matches repeatedly taking the first maximum.
    ranked = sorted(range(1, len(hist)), key=lambda col: -hist[col])
    # Keep only words that actually occurred in the segment. aggregateHist is
    # left untouched, so this cell gives the same result when it is re-run.
    indexList = [col for col in ranked[:NUM_CANDIDATES] if hist[col] > 0]
    topFivesIndices.append(indexList)

# Translate column indices back into words using the header row.
topFives = [[data[0][col] for col in indexList] for indexList in topFivesIndices]

# Segments may now yield fewer than NUM_CANDIDATES words, so report the
# per-segment sizes instead of assuming a rectangular shape.
print(len(topFives), [len(wordList) for wordList in topFives])
if len(topFives) > 1:
    print(topFives[1])

In [None]:
### Remove non-words and stems
stop_words = set(stopwords.words('english'))
# NOTE(review): "intTime" is presumably the header of the time column; confirm
# against the input CSV.
stop_words.add("intTime")
ps = PorterStemmer()

cleanedTopFives = []
for words in topFives:
    keptWords = []
    seenStems = set()
    for word in words:
        # Drop tokens with non-alphabetic characters, tokens of two characters
        # or fewer, and stop words. Dropped tokens do not register a stem, so
        # they never suppress a later word.
        if not word.isalpha() or len(word) <= 2 or word in stop_words:
            continue
        # Keep only the first word seen for each stem (e.g., textbook and
        # textbooks). This also removes exact repeats of the same word.
        stem = ps.stem(word)
        if stem in seenStems:
            continue
        seenStems.add(stem)
        keptWords.append(word)
    cleanedTopFives.append(keptWords)
# Rebind rather than edit in place so the cell is safe to re-run.
topFives = cleanedTopFives

# The first five surviving words of each segment are its keywords.
topFivesOnly = [words[:5] for words in topFives]
print(topFivesOnly)

In [None]:
# Write one CSV row of keywords per segment, creating the output folder first.
outFilename = f'../data/Dataset_{dataset}/segKeys/{startText[dataset - 1]}_P{PID}_{len(breakpoints)}_keys.csv'
outputDir = os.path.dirname(outFilename)
os.makedirs(outputDir, exist_ok=True)
with open(outFilename, 'w', newline='') as keysFile:
    csv.writer(keysFile).writerows(topFivesOnly)