In [None]:
import codecs
import re
import pandas as pd
import os.path
from multiprocessing import Pool

# Define constants

In [None]:
# Column labels used in the DataFrames built by the processing functions below.
# POINT: position of a point inside its stroke (set in pointprocess)
POINT = 'POINT'
# STROKE: position of a stroke inside its character (set in strokeprocess)
STROKE = 'STROKE'
# CHARACTER: serial number of the character (set in characterprocess or stringprocessA)
CHARACTER = 'CHARACTER'
# SAMPLE: number of the sample file the rows came from (set in stringprocessA/B)
SAMPLE = 'SAMPLE'
# SIF: order in which the character appeared in the file (set in stringprocessB)
SIF = 'Serial_in_File'
# Prefix prepended to '<number>.txt' when input files are opened;
# an empty string makes the file names relative to the working directory.
PATH = '' #'C:/Users/mengqili/Documents/txt_files/'

# Define functions

## Level Zero: Route choosing

* Communicate upwards:
 * Get: start and end filenumber
 * Send: one saved file per file number

* Communicate downwards: 
 * Object: choose the correct function 
 * Send: full strings and filenumber
 * Get: dataframe (with filenumber inside)

In [None]:
def pendoanalysis(start, end):
    """Process every sample file numbered ``start`` .. ``end - 1``.

    For each number ``i`` the file ``PATH + str(i) + '.txt'`` is read and
    routed to ``stringprocessA`` (the text contains exactly four '[[[['
    occurrences) or to ``stringprocessB`` (any other count). A successful
    result is handed to ``savedf``.

    Parameters
    ----------
    start : int
        First file number to process (inclusive).
    end : int
        Stop value (exclusive), as in ``range(start, end)``.

    Returns
    -------
    None
        Progress and problems are reported with ``print`` only.
    """
    for i in range(start, end):
        filename = PATH + str(i) + '.txt'

        if not os.path.isfile(filename):
            print ('Sample number '+str(i) + ' does NOT exist')
            continue

        # The context manager closes the file even when read() raises.
        with open(filename, 'r') as handle:
            pendodata = handle.read()

        if len(pendodata) < 10:
            # Too short to hold any stroke data: report it and skip the file
            # instead of letting the parsers fail on it.
            print ('file error')
            continue

        # Exactly FOUR '[[[[' in the string -> route A, anything else -> route B
        if len(re.findall(r'\[\[\[\[', pendodata)) == 4:
            dfraw = stringprocessA(pendodata, i)
        else:
            dfraw = stringprocessB(pendodata, i)

        # Both routes return the integer 0 when validation fails. Calling
        # len() on that integer raises TypeError, so test the type instead.
        if not isinstance(dfraw, pd.DataFrame):
            print ('Validation failed')
            continue

        # Save the dataframe
        savedf(dfraw, i)
        print (str(i)+' has been processed')

In [None]:
# Adjust the DataFrame and save it into a pkl file

def savedf(df,filenumber):
    """Rename and reorder the columns of ``df`` and pickle the result.

    The frame must have 7 columns (no SIF column) or 8 columns (with SIF).
    Its columns are renamed in place, then a reordered view is written to
    ``str(filenumber) + ".pkl"`` (relative to the working directory, PATH is
    not used here). Any other column count only prints a message.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame produced by one of the stringprocess routes.
    filenumber : int
        Sample number, used to build the output file name.
    """
    columncount = len(df.columns)
    target = str(filenumber)+".pkl"

    if columncount == 8:
        # layout with SIF
        df.columns = ['X', 'Y', 'Z', POINT, STROKE, CHARACTER, SIF, SAMPLE]
        ordered = df[[SAMPLE, CHARACTER, STROKE, POINT,'X', 'Y', 'Z', SIF]]
        ordered.to_pickle(target)
        return

    if columncount == 7:
        # layout without SIF
        df.columns = ['X', 'Y', 'Z', POINT, STROKE, CHARACTER, SAMPLE]
        ordered = df[[SAMPLE, CHARACTER, STROKE, POINT,'X', 'Y', 'Z']]
        ordered.to_pickle(target)
        return

    print ('df columns are not 7 or 8')

## Level 1 functions

* Communicate upwards:
 * Get: full string
 * Send: dataframe

### Function 1: Character number Not in Text

* Communicate downwards: 
 * Object: the character-processing function of the route
 * Send: 
 * Get: dataframe

In [None]:
# receive the string for one sample and the serial number of the sample;
# split the sample into strings of characters and use FUNCTION characterprocess to process each character
# return a dataframe for the sample
def stringprocessA(fullstring, filenumber):
    """Build the DataFrame of one sample whose text carries no character serials.

    The first '[[[[ ... ]]]]' group of ``fullstring`` is located, split into
    one '[[[ ... ]]]' string per character, and each character string is
    parsed with ``characterprocess``. Characters are numbered by their
    position in that list.

    Parameters
    ----------
    fullstring : str
        Complete text of the sample file.
    filenumber : int
        Sample number, stored in the SAMPLE column.

    Returns
    -------
    pandas.DataFrame or int
        The concatenated frame when ``validation`` succeeds; the integer 0
        when validation fails or when no sample/character data is found.
    """
    print ('Processing '+str(filenumber)+' in Route A')

    # search for '[[[[ * ]]]]' and keep the first match without the outer []
    samplematches = re.findall(r'\[(\[\[\[.+?\]\]\])\]', fullstring)
    if not samplematches:
        # indexing [0] on an empty list would raise IndexError
        print ('No sample data found in '+str(filenumber))
        return 0

    # then search for '[[[ * ]]]' to get one string per character
    charlist = re.findall(r'\[(\[\[.+?\]\])\]', samplematches[0])
    if not charlist:
        # pd.concat refuses an empty list, so report failure instead
        print ('No character data found in '+str(filenumber))
        return 0

    dflist = [characterprocess(char) for char in charlist]

    # Assign the serial of character by the sequence in the list
    for counter, singledf in enumerate(dflist):
        singledf[CHARACTER] = counter

    # validate before concatenating so a failed sample skips that work
    if validation(fullstring, dflist) != 1:
        return 0

    returndf = pd.concat(dflist)

    # build a new column to store the sample number
    returndf[SAMPLE] = filenumber
    return returndf






### Function 2 : Character number in Text

In [None]:
# changed 
# Changed RE to get character string instead of sample string
# should have 180 characters, only use the first 90 characters.

# NOTE: stringprocessB takes no Numberofpools argument; the characters are processed one after another.

def stringprocessB(fullstring ,filenumber):
    """Build the DataFrame of one sample whose text carries character serials.

    Only the part of ``fullstring`` before the first '&quot;virtual&quot' is
    searched. Every '&quot;<digits>&quot;:[[[ ... ]]]' group found there is
    parsed with ``characterprocess``; the position of the group in the text
    is stored in the SIF column.

    Parameters
    ----------
    fullstring : str
        Complete text of the sample file.
    filenumber : int
        Sample number, stored in the SAMPLE column.

    Returns
    -------
    pandas.DataFrame or int
        The concatenated frame sorted by CHARACTER when ``validation``
        succeeds; the integer 0 when validation fails or no character data
        is found.
    """
    print ('Processing '+str(filenumber)+' in Route B')

    beforevirtual = re.split('&quot;virtual&quot',fullstring)[0]
    pendodata = re.findall(r"""(&quot;[0-9]+&quot;:\[\[\[.+?\]\]\])""", beforevirtual)
    if not pendodata:
        # pd.concat refuses an empty list, so report failure instead
        print ('No character data found in '+str(filenumber))
        return 0

    dflist = [characterprocess(da) for da in pendodata]

    # Record the order in which each character appeared in the file.
    # Iterating over the list itself works for any number of characters.
    for counter, singledf in enumerate(dflist):
        singledf[SIF] = counter

    # validate before concatenating so a failed sample skips that work
    if validation(fullstring, dflist) != 1:
        return 0

    # Concatenate all DataFrames and sort by the serial of character
    # instead of the sequence in the file
    returndf = pd.concat(dflist).sort_values(by=[CHARACTER])

    # build a new column to store the sample number
    returndf[SAMPLE] = filenumber
    return returndf

# Changed input: 
# no longer need serial input; get serial from re.findall instead



## Low level functions

In [None]:
##################################################################################################
# receive the string for one character;
# split the character into strings of strokes and use FUNCTION strokeprocess to process each stroke
# if there is a serial number in the string record it in the dataframe, if not do nothing;
# return a dataframe for the character

def characterprocess(characterstr):
    """Parse the string of one character into a DataFrame of its points.

    Every '[[<digit> ... ]]' group in ``characterstr`` is treated as one
    stroke and parsed with ``strokeprocess``; strokes are numbered from 0 in
    order of appearance.

    Parameters
    ----------
    characterstr : str
        Text of one character, optionally prefixed by
        '&quot;<serial>&quot;:'.

    Returns
    -------
    pandas.DataFrame
        Columns 0, 1, 2, POINT and STROKE, plus CHARACTER when the string
        carries a serial. Without a serial the CHARACTER column is left for
        the caller to assign.

    Raises
    ------
    ValueError
        If the text between the '&quot;' markers is not an integer.
    """
    template = pd.DataFrame(columns = [0,1,2,POINT,STROKE])

    # Collect all stroke frames first and concatenate once, instead of
    # re-copying the growing frame for every stroke.
    strokeframes = [
        strokeprocess(singlestroke, i)
        for i, singlestroke in enumerate(re.findall(r'\[(\[\d.+?\])\]',characterstr))
    ]
    if strokeframes:
        # the template stays first so its column order is kept
        characterdata = pd.concat([template] + strokeframes)
    else:
        characterdata = template

    # Check whether the input contains a serial
    serialmatch = re.search(r'&quot;(.+)&quot;:',characterstr)
    if serialmatch:
        characterdata[CHARACTER] = int(serialmatch.group(1))

    return characterdata

##################################################################################################
# receive the string for one stroke and the serial number of the stroke;
# split the stroke into strings of points and use FUNCTION pointprocess to process each point
# return a dataframe for the stroke
def strokeprocess(strokestr, strokenumber):
    """Parse the string of one stroke into a DataFrame of its points.

    Every '[ ... ]' group in ``strokestr`` is treated as one point and
    parsed with ``pointprocess``; points are numbered from 0 in order of
    appearance.

    Parameters
    ----------
    strokestr : str
        Text of one stroke.
    strokenumber : int
        Serial of the stroke, stored in the STROKE column.

    Returns
    -------
    pandas.DataFrame
        Columns 0, 1, 2 and POINT from the points, plus STROKE.
    """
    template = pd.DataFrame(columns = [0,1,2,POINT])

    # Collect all point frames first and concatenate once, instead of
    # re-copying the growing frame for every point.
    pointframes = [
        pointprocess(singlepoint, i)
        for i, singlepoint in enumerate(re.findall(r'\[(.+?)\]',strokestr))
    ]
    if pointframes:
        # the template stays first so its column order is kept
        strokedata = pd.concat([template] + pointframes)
    else:
        strokedata = template

    strokedata[STROKE] = strokenumber
    return strokedata



##################################################################################################
# receive the string for one point and the serial number of the point;
# return a dataframe for the point

def pointprocess(pointstr, pointnumber):
    """Turn the comma-separated text of one point into a one-row DataFrame.

    Parameters
    ----------
    pointstr : str
        Comma-separated numeric values of the point.
    pointnumber : int
        Serial of the point, stored in the POINT column.

    Returns
    -------
    pandas.DataFrame
        One row holding the numeric values followed by the POINT column.
    """
    coordinates = pd.to_numeric(pointstr.split(','))
    # a 1-D input becomes a single column, so transpose it into a single row
    pointdata = pd.DataFrame(coordinates).T
    pointdata[POINT] = pointnumber
    return pointdata



## Test function

Test whether the first number in the returned DataFrame is the same as the first number in the string.

In [None]:
##################################################################################################

def validation(fullstring, dflist):
    """Compare the first number in the text with the first parsed value.

    The first '[<digit> ... ]' group of ``fullstring`` is taken as the first
    point; its first comma-separated value is compared with the top-left cell
    of ``dflist[0]``.

    Parameters
    ----------
    fullstring : str
        Complete text of the sample file.
    dflist : list of pandas.DataFrame
        Per-character frames in the order they were parsed.

    Returns
    -------
    int
        1 when the two values are equal; 0 when they differ or when there is
        nothing to compare (no point in the text, empty list, empty first
        frame, or a first value that is not a number).
    """
    # Only the first match is needed, so stop scanning once it is found.
    firstmatch = re.search(r"""\[(\d.+?)\]""", fullstring)
    if firstmatch is None:
        return 0

    # Nothing was parsed: report failure instead of raising IndexError.
    if not dflist or dflist[0].empty:
        return 0

    firsttoken = re.split(',', firstmatch.group(1))[0]
    try:
        expected = int(firsttoken)
    except ValueError:
        # int() rejects decimal text such as '12.5'; compare it as a float
        try:
            expected = float(firsttoken)
        except ValueError:
            return 0

    if expected == dflist[0].iloc[0,0]:
        return 1
    return 0
        


# Use the functions:

In [None]:
# the first argument is the start file number (inclusive)
# the second argument is the end file number (exclusive, as in range(start, end))
# pendoanalysis takes only these two arguments; there is no third argument

pendoanalysis(109,110)


In [None]:
#test = pd.read_hdf('2.h5', 'df')

# Test

In [None]:
# Reload the pickle written for sample 109, then narrow it down to
# character 13 and, inside that character, stroke 5.
df = pd.read_pickle('109.pkl')
Char13 = df.loc[df['CHARACTER'] == 13]
Stroke5 = Char13.loc[Char13['STROKE'] == 5]
Stroke5