In [13]:
import codecs
import re
import pandas as pd
import os.path
from multiprocessing import Pool

# Define constants

In [14]:
POINT = 'POINT'
STROKE = 'STROKE'
CHARACTER = 'CHARACTER'
SAMPLE = 'SAMPLE'
SIF = 'Serial_in_File'

# Define functions

## Level Zero: Route choosing

* Communicate upwards:
 * Get: start and end filenumber
 * Send: saved file with one file each number

* Communicate downwards: 
 * Object: choose the correct function 
 * Send: full strings and filenumber
 * Get: dataframe (with filenumber inside)

In [15]:
def pendoanalysis(start, end, pools):
    # set the file scope
    for i in range(start,end):
        if os.path.isfile('C:/Users/mengqili/Documents/txt_files/'+str(i)+'.txt'):
            
            # open the file
            file = open('C:/Users/mengqili/Documents/txt_files/'+str(i)+'.txt','r') 
            
            # get the string
            pendodata = file.read() 
            
            # then close the file
            file.close() 
            
            if len(pendodata) < 10:
                print ('file error')
            
            if len(re.findall(r'\[\[\[\[',pendodata)) == 4: # check whether there are FOUR '[[[[' in the string
                dfraw = stringprocessA(pendodata, i, pools) # There are FOUR '[[[[' in the string, go A route
            else:
                dfraw = stringprocessB(pendodata, i, pools)# There aren't FOUR '[[[[' in the string, go B route
            
            if len(dfraw) == 1:
                print ('Validation failed')
            else:
            
                # save the DataFrame
                savedf(dfraw,i)    # Save the dataframe
                print (str(i)+' has been processed')
            
        else:
            print ('Sample number '+str(i) + ' does NOT exist')

In [16]:
# Adjust the DataFrame and save it into a pkl file

def savedf(df,filenumber):
    
    if len(df.columns)  == 8:
        # set new columns' names: with SIF
        df.columns = ['X', 'Y', 'Z', POINT, STROKE, CHARACTER, SIF, SAMPLE]
        #Adjust sequence of columns before save to pickles
        df[[SAMPLE, CHARACTER, STROKE,'X', 'Y', 'Z', SIF]].to_pickle(str(filenumber)+".pkl")
        #to_hdf(str(filenumber)+'.h5', key='df', mode='w')

    elif len(df.columns) == 7 :
        # set new columns' names: without SIF
        df.columns = ['X', 'Y', 'Z', POINT, STROKE, CHARACTER, SAMPLE]
        #Adjust sequence of columns before save to pickles
        df = df[[SAMPLE, CHARACTER, STROKE,'X', 'Y', 'Z']].to_pickle(str(filenumber)+".pkl")

    else:
        print ('df columns are not 7 or 8')

## Level 1 functions

* Communicate upwards:
 * Get: full string
 * Send: dataframe

### Function 1: Character number Not in Text

* Communicate downwards: 
 * Object: the charactorprocessing function of the route
 * Send: 
 * Get: dataframe

In [33]:
# receive the string for one sample and the serial number of the sample;
# split the sample into strings of characters and use FUNCTION characterprocess to process each character
# return a dataframe for the sample
def stringprocessA(fullstring, serial, pools):
    
    print ('Processing '+str(serial)+' in Route A')
        
    
    charlist = re.findall(r'\[(\[\[.+?\]\])\]',
                re.findall(r'\[(\[\[\[.+?\]\]\])\]',fullstring)[0]
                )
    # search for '[[[[ * ]]]]' and return the first matching string without the outer [] 
    # then search for '[[[*]]]' and return a list of strings for each character
    
    
    # build pools
    p = Pool(pools)    
    # sent a list of characters to the function characterprocess and get a list of DataFrames
    #dflist = p.map(characterprocess, charlist)
    dflist = []
    for char in charlist:
        dflist.append(characterprocess(char))
    # Assign the serial of character by the sequence in the list
    counter = 0
    for singledf in dflist:
        singledf[CHARACTER] = counter
        counter+=1

    returndf = pd.concat(dflist)  

    # build a new column to store the sample number
    returndf[SAMPLE] = serial

    # validate the process
    if validation(fullstring, dflist) == 1:
        return returndf
    else:
        return 0






### Function 2 : Character number in Text

In [43]:
# changed 
# Changed RE to get character string instead of sample string
# should have 180 characters, only use the first 90 characters.

# Numberofpools controls how many threads shall be used simultaneously

def stringprocessB(fullstring ,filenumber, numberofpools):
    
    print ('Processing '+str(filenumber)+' in Route B')
    
    pendodata = re.findall(r"""(&quot;[0-9]+&quot;:\[\[\[.+?\]\]\])""",re.split('&quot;virtual&quot',fullstring)[0])
    
    # build pools
    p = Pool(numberofpools)
    
    # sent a list of characters to the function characterprocess and get a list of DataFrames
    #dflist = p.map(characterprocess, pendodata)
    dflist = []
    for da in pendodata:
        dflist.append(characterprocess(da))

    # set serial in the file to record the sequence in the html file
#    for i in range(90):
#        dflist[i][SIF] = i

    # the previous code will produce error when there is less than 90 characters,
    # Therefore, change the way to iterate the dflist
    counter = 0
    for singledf in dflist:
        print(singledf)
        singledf[SIF] = counter
        counter+=1

    # Concatenate all DataFrames and sort by the serial of character instead of the sequence in the html file 
    returndf = pd.concat(dflist).sort_values(by=[CHARACTER])
    
    # build a new column to store the sample number
    returndf[SAMPLE] = filenumber
    
    # validate the process
    if validation(fullstring, dflist) == 1:
        return returndf
    else:
        return 0

# Changed input: 
# no longer need serial input; get serial from re.findall instead



## Low level functions

In [41]:
##################################################################################################
# receive the string for one character;
# split the character into strings of strokes and use FUNCTION strokeprocess to process each stroke
# if there is a serial number in the string record it in the dataframe, if not do nothing;
# return a dataframe for the charactor

def characterprocess(characterstr):
    #normal process
    i = 0
    characterdata = pd.DataFrame(columns = [0,1,2,POINT,STROKE])
    for singlestroke in re.findall(r'\[(\[\d.+?\])\]',characterstr):
        characterdata = pd.concat([characterdata,strokeprocess(singlestroke,i)])
        i+=1
    
    # Check whether the input contains a serial
    
    if re.findall(r'&quot;(.+)&quot;:',characterstr):
        
        # if the search does not returns a NONE, the string contains a serial number of character
        characterdata[CHARACTER] = int(re.findall(r'&quot;(.+)&quot;:',characterstr)[0]) 
        
        #else the search returns a NONE, no process shall be performed,         
        #and the character serial shall be assigned later, in the FUNCTION stringprocess 
    return characterdata

##################################################################################################
# receive the string for one stroke and the serial number of the stroke;
# split the stroke into strings of points and use FUNCTION pointprocess to process each point
# return a dataframe for the stroke
def strokeprocess(strokestr, serial):
    i = 0
    strokedata = pd.DataFrame(columns = [0,1,2,POINT])
    for singlepoint in re.findall(r'\[(.+?)\]',strokestr):
        strokedata = pd.concat([strokedata,pointprocess(singlepoint,i)])
        i+=1
    strokedata[STROKE] = serial
    return strokedata



##################################################################################################
# receive the string for one point and the serial number of the point;
# return a dataframe for the point

def pointprocess(strokestr, serial):
    pointdata = pd.DataFrame(pd.to_numeric(re.split(',',strokestr))).T
    pointdata[POINT] = serial
    return pointdata



## Test function

Test whether the first number in returning df is the same as the first number in the string

In [20]:
##################################################################################################

def validation(fullstring, dflist): 
    #test in case not find the first three numbers.
    firstpoint = re.findall(r"""\[(\d.+?)\]""",fullstring)[0]
    
    if int(re.split(',',firstpoint)[0]) == dflist[0].iloc[0,0]:
        #return the final DataFrame when the first element of the list is correct
        return 1
    else:
        #print error message when the list is incorrect
        return 0 
        


# Use the functions:

In [44]:
# the first argument is start number
# the second argument is end number
# the third argument is how many character shall be processed simultaneously. 

pendoanalysis(109,110,1)


Processing 109 in Route B
       0      1     2 POINT STROKE  CHARACTER
0   4207  11066   854     0      0          0
0   4208  11065  1116     1      0          0
0   4209  11065  1235     2      0          0
0   4210  11064  1357     3      0          0
0   4214  11062  1456     4      0          0
0   4219  11060  1510     5      0          0
0   4225  11059  1549     6      0          0
0   4236  11057  1590     7      0          0
0   4251  11053  1626     8      0          0
0   4270  11047  1659     9      0          0
0   4292  11039  1687    10      0          0
0   4316  11028  1701    11      0          0
0   4340  11016  1710    12      0          0
0   4366  11005  1716    13      0          0
0   4393  10994  1717    14      0          0
0   4419  10985  1711    15      0          0
0   4444  10977  1693    16      0          0
0   4467  10969  1659    17      0          0
0   4490  10960  1604    18      0          0
0   4511  10951  1523    19      0          0
0   4530

[253 rows x 6 columns]
        0      1     2 POINT STROKE  CHARACTER
0   10282  13837   561     0      0         22
0   10282  13831   614     1      0         22
0   10282  13825   679     2      0         22
0   10283  13818   754     3      0         22
0   10284  13812   822     4      0         22
0   10285  13808   872     5      0         22
0   10287  13805   906     6      0         22
0   10288  13804   932     7      0         22
0   10290  13804   952     8      0         22
0   10292  13805   964     9      0         22
0   10294  13805   971    10      0         22
0   10296  13807   974    11      0         22
0   10298  13808   974    12      0         22
0   10299  13810   971    13      0         22
0   10301  13814   965    14      0         22
0   10303  13818   950    15      0         22
0   10305  13823   924    16      0         22
0   10307  13830   882    17      0         22
0   10309  13839   819    18      0         22
0   10311  13850   749    19      0  

       0      1     2 POINT STROKE  CHARACTER
0   5888  18544   422     0      0         46
0   5892  18536   550     1      0         46
0   5899  18524   725     2      0         46
0   5907  18513   932     3      0         46
0   5917  18502  1143     4      0         46
0   5927  18493  1341     5      0         46
0   5937  18485  1501     6      0         46
0   5946  18480  1639     7      0         46
0   5952  18477  1756     8      0         46
0   5957  18476  1845     9      0         46
0   5960  18476  1922    10      0         46
0   5963  18476  1986    11      0         46
0   5967  18476  2041    12      0         46
0   5972  18477  2047    13      0         46
0   5976  18479  2047    14      0         46
0   5980  18482  2047    15      0         46
0   5984  18484  2047    16      0         46
0   5986  18489  2047    17      0         46
0   5989  18493  2047    18      0         46
0   5991  18499  2047    19      0         46
0   5995  18507  2047    20      0

[297 rows x 6 columns]
        0      1     2 POINT STROKE  CHARACTER
0   11993  24396  1176     0      0         86
0   11993  24396  1327     1      0         86
0   11993  24397  1447     2      0         86
0   11994  24399  1559     3      0         86
0   11995  24402  1621     4      0         86
0   11997  24408  1661     5      0         86
0   11999  24419  1693     6      0         86
0   12003  24435  1710     7      0         86
0   12007  24454  1717     8      0         86
0   12012  24476  1713     9      0         86
0   12018  24498  1683    10      0         86
0   12026  24520  1619    11      0         86
0   12035  24540  1496    12      0         86
0   12045  24560  1299    13      0         86
0   12057  24578  1037    14      0         86
0   12069  24593   758    15      0         86
0   12083  24604   285    16      0         86
0   12237  24436   822     0      1         86
0   12234  24436  1061     1      1         86
0   12230  24436  1182     2      1  

109 has been processed


In [22]:
#test = pd.read_hdf('2.h5', 'df')

In [23]:
#test