In [22]:
from subprocess import Popen, PIPE
import json
import os
from glob import glob
import time

In [None]:
def runShellCmdBlocking(cmd):
    """Run ``cmd`` through the shell and block until it has finished.

    Args:
        cmd: Command line handed to ``Popen`` with ``shell=True``, so it is
            interpreted by the shell. Do not build it from untrusted input.

    Returns:
        A tuple ``(exitCode, output, err)`` where ``exitCode`` is the process
        return code and ``output`` / ``err`` are the captured stdout / stderr
        decoded as UTF-8 text.
    """
    process = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)

    # communicate() drains both pipes and waits for the process to exit, so
    # the return code is already available afterwards.
    output, err = process.communicate()
    exitCode = process.returncode

    # The pipes deliver bytes in Python 3. Undecodable bytes are replaced
    # rather than raising, so one stray byte in a tool's output cannot throw
    # away the exit code and the rest of the captured text.
    if output is not None:
        output = output.decode("utf-8", errors="replace")
    if err is not None:
        err = err.decode("utf-8", errors="replace")

    return (exitCode, output, err)

In [3]:
### command example ###
# time ./run.sh import ~/ingester/resource/ingest-pto-cql-es-server.properties 
# /data/uspto/cleaned/nodes ~/ingester/resource/uspto_schema.json 
# ~/ingester/resource/uspto_datamapper.json 2>&1 | tee ~/log/uspto/ingest.log

def generateTemplateAndSubmit(lookupTable, elementKey, dataMapper, templateFilepath,
                            ingesterHome, janusgraphHome, graphConfig, schema, logDir):
    """Write a data-mapper template per CSV file and print its import command.

    For every entry of ``lookupTable`` the ``*.csv`` files of the mapped folder
    are listed. For each file a one-entry template is written to
    ``templateFilepath`` (overwriting the previous one) and the shell command
    line for the import is printed. The command is only printed here; this
    function does not execute it.

    Args:
        lookupTable: Mapping of element name -> folder containing CSV files.
        elementKey: Key into ``dataMapper``; also the top-level template key.
        dataMapper: Mapping whose ``elementKey`` entry maps
            ``"<NAME_FILE_PLACEHOLDER>"`` keys to field mappings.
        templateFilepath: Path of the JSON template file to (re)write.
        ingesterHome: Directory the printed command changes into.
        janusgraphHome: Value exported as ``JANUSGRAPH_HOME`` in the command.
        graphConfig: Graph configuration argument of the import command.
        schema: Schema argument of the import command.
        logDir: Directory used to build the log path in the command.

    Raises:
        KeyError: If ``elementKey`` or a placeholder key is missing.
    """
    print('Processing {}'.format(elementKey))

    mappingsForKey = dataMapper[elementKey]

    # One pass per node/edge type listed in the lookup table.
    for elementName in lookupTable:
        print('Processing {}'.format(elementName))

        sourceDir = lookupTable[elementName]
        print('Search folder {}'.format(sourceDir))

        csvPaths = glob(os.path.join(sourceDir, '*.csv'))
        print('Found {} files'.format(len(csvPaths)))

        placeholder = "<" + elementName.upper() + "_FILE_PLACEHOLDER>"
        print('Using key {}'.format(placeholder))

        columnMapping = mappingsForKey[placeholder]

        for csvPath in csvPaths:
            csvName = os.path.basename(csvPath)

            # The template maps exactly one concrete file name to the mapping
            # that was registered under the placeholder.
            template = {elementKey: {csvName: columnMapping}}

            print('Writing to {}'.format(templateFilepath))
            with open(templateFilepath, 'w') as templateFp:
                json.dump(template, templateFp)
            print('Generated template at {}'.format(templateFilepath))

            stem = os.path.splitext(csvName)[0]
            logFilepath = os.path.join(logDir, elementName + '_' + stem + '.log')

            # Assemble the job command line.
            changeDir = 'cd ' + ingesterHome
            exportHome = 'export JANUSGRAPH_HOME=' + janusgraphHome
            runImport = ('time ./run.sh import ' + graphConfig + ' ' +
                         sourceDir + ' ' + schema + ' ' +
                         templateFilepath + ' 2>&1 | tee ' + logFilepath)

            print(';'.join([changeDir, exportHome, runImport]))
            

def mergeData(inputdir, outputdir):
    """Merge the CSV shards of each sub-folder into one tab-separated file.

    For every direct sub-folder ``X`` of ``inputdir`` the ``*.csv`` files in it
    are processed in sorted name order and concatenated into
    ``outputdir/X.csv``. Each line is stripped of surrounding whitespace and
    its commas are replaced by tabs. The first line of every shard is treated
    as a header; only the first header encountered is written.

    Note: fields are split on every comma, so quoted fields that contain
    commas are not handled.

    Args:
        inputdir: Directory whose sub-folders hold the CSV shards.
        outputdir: Existing directory that receives one merged file per
            sub-folder.
    """
    folders = [x for x in os.listdir(inputdir) if os.path.isdir(os.path.join(inputdir, x))]

    for folder in folders:

        print('Processing folder {}'.format(folder))

        start = time.time()

        files = sorted(glob(os.path.join(inputdir, folder, "*.csv")))
        print('Found {} files'.format(len(files)))

        outfilepath = os.path.join(outputdir, folder + '.csv')

        totalNumRows = 0
        # Tracked explicitly (instead of "first file, first line") so that an
        # empty leading shard does not cause the header to be dropped.
        headerWritten = False

        with open(outfilepath, 'w') as outfile:

            for path in files:

                with open(path, 'r') as infile:

                    for lineOffset, line in enumerate(infile):
                        line = '\t'.join(line.strip().split(','))

                        if lineOffset == 0:
                            # Header line: keep only the first one seen.
                            if headerWritten:
                                continue
                            headerWritten = True
                        else:
                            # Count data rows directly; an empty shard then
                            # contributes 0 rather than -1.
                            totalNumRows += 1

                        outfile.write(line + '\n')

            print('{} rows in outfile {}'.format(totalNumRows, outfilepath))

        print(" ------- {} seconds --------".format(time.time() - start))
        


In [6]:

# Driver cell: merge the CSV shards found in each sub-folder of `inputdir`
# into one tab-separated file per sub-folder under `outputdir`.
# The commented-out assignments are alternative input/output locations.
# inputdir = '/N/project/mag/mag_jg_2021_update/nodes'
# outputdir = '/N/project/rds/CADRE/dataset/mag/2021/combined/nodes'
inputdir = '/N/project/mag/wos_csv/edges'
outputdir = '/N/project/rds/CADRE/dataset/wos/edges'
# Long-running: the recorded output below reports roughly 8658 seconds.
mergeData(inputdir, outputdir)

# inputdir = '/N/project/mag/mag_jg_2021_update/edges'
# outputdir = '/N/project/rds/CADRE/dataset/mag/2021/combined/edges'
# mergeData(inputdir, outputdir)

Processing folder refEdges
Found 100 files
1169338266 rows in outfile /N/project/rds/CADRE/dataset/wos/edges/refEdges.csv
 ------- 8657.893625736237 seconds --------


In [58]:
def sanityCheckNumCols(folder, delimiter, logDir):
    """Check that every record has as many fields as the header line.

    Each regular file directly inside ``folder`` is scanned line by line. The
    first line is taken as the header and fixes the expected column count;
    every later line whose field count differs is reported. Messages go both
    to stdout and to ``logDir/sc-<filename>.log``.

    Args:
        folder: Directory containing the delimited text files to check.
        delimiter: Field separator used to split each line.
        logDir: Existing directory that receives one log file per input file.
    """
    candidates = []
    for entry in os.listdir(folder):
        if os.path.isfile(os.path.join(folder, entry)):
            candidates.append(entry)

    print('Processing folder {}'.format(folder))

    for filename in candidates:

        logPath = os.path.join(logDir, 'sc-' + os.path.basename(filename) + '.log')

        with open(logPath, 'w') as logfp, open(os.path.join(folder, filename), 'r') as fp:
            logfp.write('Processing file {}\n'.format(filename))
            print('Processing file {}'.format(filename))

            start = time.time()

            recordCnt = 0
            numInvalidRecords = 0
            numCols = None

            for lineNo, rawLine in enumerate(fp, start=1):

                # Only the newline is removed: a bare strip() would also eat
                # trailing delimiters such as "\t\t\t" and hide empty
                # trailing fields.
                row = rawLine.strip('\n')

                if lineNo == 1:
                    # Header line: defines the expected number of columns.
                    logfp.write('Header:\n{}\n'.format(row))
                    print('Header:\n{}'.format(row))

                    numCols = len(row.split(delimiter))
                    logfp.write('# of columns: {}\n'.format(numCols))
                    print('# of columns: {}'.format(numCols))
                    continue

                recordCnt += 1

                numFields = len(row.split(delimiter))
                if numFields == numCols:
                    continue

                numInvalidRecords += 1

                # recordCnt + 1 is the 1-based line number in the file,
                # because the header occupies line 1.
                logfp.write('line # {} below, expected # cols {}, actual got {}\n{}\n'.\
                      format(recordCnt + 1, numCols, numFields, row))
                print('line # {} below, expected # cols {}, actual got {}\n{}'.\
                      format(recordCnt + 1, numCols, numFields, row))

            logfp.write('Finished processing file {}\n'.format(filename))
            print('Finished processing file {}, '.format(filename))
            logfp.write('# of invalid records: {}/{}\n'.format(numInvalidRecords, recordCnt))
            print('# of invalid records: {}/{}'.format(numInvalidRecords, recordCnt))
            print(" ------- {} seconds --------".format(time.time() - start))
                    
                 
def sanityCheckColLength(folder, delimiter, colIdx, sizeThreshold, logDir):
    """Report records whose field at ``colIdx`` is too large when UTF-8 encoded.

    Each regular file directly inside ``folder`` is scanned line by line. The
    first line is the header. For every later line the field at ``colIdx`` is
    encoded as UTF-8 and, if its byte length is ``>= sizeThreshold``, the
    record is counted as invalid and written to ``logDir/sc-<filename>.log``.
    The log entry is labelled with the header name of the checked column
    (previously the label was fixed to ``lc_standard_names`` regardless of
    ``colIdx``).

    Args:
        folder: Directory containing the delimited text files to check.
        delimiter: Field separator used to split each line.
        colIdx: Index of the field to measure (negative indices count from
            the end of the record).
        sizeThreshold: Byte length at or above which a field is reported.
        logDir: Existing directory that receives one log file per input file.

    Raises:
        IndexError: If a record has no field at ``colIdx``.
    """
    files = [f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))]

    print('Processing folder {}'.format(folder))

    for filename in files:

        logFile = os.path.join(logDir, 'sc-' + os.path.basename(filename) + '.log')

        with open(logFile, 'w') as logfp, open(os.path.join(folder, filename), 'r') as fp:
            logfp.write('Processing file {}\n'.format(filename))
            print('Processing file {}'.format(filename))

            start = time.time()

            recordCnt = 0
            numInvalidRecords = 0
            headerSet = False
            colName = 'column {}'.format(colIdx)

            for line in fp:

                # Strip only the newline: a bare strip() would also remove
                # trailing delimiters and with them empty trailing fields.
                line = line.strip('\n')

                if not headerSet:
                    logfp.write('Header:\n{}\n'.format(line))
                    print('Header:\n{}'.format(line))

                    headerFields = line.split(delimiter)
                    numCols = len(headerFields)
                    logfp.write('# of columns: {}\n'.format(numCols))
                    print('# of columns: {}'.format(numCols))

                    # Label log entries with the real column name; keep the
                    # generic fallback if the header has no such column.
                    try:
                        colName = headerFields[colIdx]
                    except IndexError:
                        pass

                    headerSet = True

                else:

                    recordCnt += 1

                    targetField = line.split(delimiter)[colIdx]

                    # Size is measured in encoded bytes, not characters.
                    size = len(targetField.encode('utf-8'))
                    if size >= sizeThreshold:
                        numInvalidRecords += 1

                        # recordCnt + 1 is the 1-based line number in the
                        # file (the header is line 1).
                        logfp.write('line # {}, {} {}, size {}\n'.\
                              format(recordCnt + 1, colName, targetField, size))

            logfp.write('Finished processing file {}\n'.format(filename))
            print('Finished processing file {}, '.format(filename))
            logfp.write('# of invalid records: {}/{}\n'.format(numInvalidRecords, recordCnt))
            print('# of invalid records: {}/{}'.format(numInvalidRecords, recordCnt))
            print(" ------- {} seconds --------".format(time.time() - start))
                
                
def filterByLength(folder, delimiter, colIdx, sizeThreshold, outDir):
    """Split each file in ``folder`` into kept and excluded rows by field size.

    For every regular file ``<name>.<ext>`` directly inside ``folder`` two
    files are written to ``outDir``: ``<name>_filtered.tsv`` with the records
    whose field at ``colIdx`` is smaller than ``sizeThreshold`` bytes when
    UTF-8 encoded, and ``<name>_excluded.tsv`` with the remaining records.
    The header line (first line) is copied to both outputs.

    When ``outDir`` is the same directory as ``folder``, files that already
    end in ``_filtered.tsv`` or ``_excluded.tsv`` are skipped, so re-running
    does not treat earlier outputs as new inputs.

    Args:
        folder: Directory containing the delimited text files to split.
        delimiter: Field separator used to split each line.
        colIdx: Index of the field to measure (negative indices count from
            the end of the record).
        sizeThreshold: Byte length at or above which a record is excluded.
        outDir: Existing directory that receives the output files.

    Raises:
        IndexError: If a record has no field at ``colIdx``.
    """
    files = [f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))]

    outputSuffixes = ('_filtered.tsv', '_excluded.tsv')
    writesIntoInputDir = os.path.realpath(folder) == os.path.realpath(outDir)

    print('Processing folder {}'.format(folder))

    for filename in files:

        if writesIntoInputDir and filename.endswith(outputSuffixes):
            # Output of a previous run living next to the inputs.
            print('Skipping previously generated output {}'.format(filename))
            continue

        filenameWithoutExt = os.path.splitext(filename)[0]
        remainFile = os.path.join(outDir, filenameWithoutExt + '_filtered.tsv')
        excludeFile = os.path.join(outDir, filenameWithoutExt + '_excluded.tsv')

        with open(remainFile, 'w') as fpRemain, open(excludeFile, 'w') as fpExclude:
            with open(os.path.join(folder, filename), 'r') as fp:

                print('Processing file {}'.format(filename))

                start = time.time()

                recordCnt = 0
                numInvalidRecords = 0
                headerSet = False

                for line in fp:

                    # Strip only the newline: a bare strip() would also
                    # remove trailing delimiters and with them empty
                    # trailing fields.
                    line = line.strip('\n')

                    if not headerSet:
                        print('Header:\n{}'.format(line))

                        numCols = len(line.split(delimiter))
                        print('# of columns: {}'.format(numCols))

                        headerSet = True

                        # Both outputs carry the header.
                        fpRemain.write(line + '\n')
                        fpExclude.write(line + '\n')
                    else:

                        recordCnt += 1

                        targetField = line.split(delimiter)[colIdx]

                        # Size is measured in encoded bytes, not characters.
                        size = len(targetField.encode('utf-8'))
                        if size >= sizeThreshold:
                            numInvalidRecords += 1
                            fpExclude.write(line + '\n')
                        else:
                            fpRemain.write(line + '\n')

            print('Finished processing file {}, '.format(filename))

            print('# of invalid records: {}/{}'.format(numInvalidRecords, recordCnt))
            print(" ------- {} seconds --------".format(time.time() - start))

In [43]:
# Print the version of the Python interpreter backing this kernel.
from platform import python_version

print(python_version())

3.6.0


In [40]:
# Driver cell: run the column-count check on every folder in folderList.
# The commented-out entries are alternative input folders.
folderList = [
#     '/N/project/rds/CADRE/dataset/mag/2021/combined/nodes',
#     '/N/project/rds/CADRE/dataset/mag/2021/combined/edges'
#     '/N/project/mag/wos_csv'
#    '/N/project/mag/wos_csv/tmp/'
    '/N/project/mag/wos_csv/fromPostgres'
]

delimiter = '\t'
# Relative path; the check opens 'sc-<file>.log' files inside it without
# creating the directory, so it has to exist beforehand.
logDir = 'output'

for folder in folderList:
    sanityCheckNumCols(folder, delimiter, logDir)

Processing folder /N/project/mag/wos_csv/fromPostgres
Processing file sixth_export.tsv
Header:
wosid	isopenaccess	openaccesstype	abstract	fundingtext	citedreferencecount	full_address	reprintaddress	articlenumber	publicationyear	publicationdate	volume	issue	partnumber	supplement	specialissue	earlyaccessdate	startpage	endpage	numberofpages	publishercity	publisheraddress	publisher	keywordplus	conferencedate	conferencesponsor	conferencehost	conferencetitle	documenttype	rids	orcid	standardnames	authors	emailaddress	papertitle	journaltitle	journalabbrev	journaliso	issn	doi	eissn	isbn	pmid	conferencelocation	fundingorgs	lc_standard_names
# of columns: 46
Finished processing file sixth_export.tsv, 
# of invalid records: 0/78395307
 ------- 477.3129072189331 seconds --------


In [53]:
# Driver cell: report records whose selected column is too large in bytes.
# The commented-out entries are alternative input folders.
folderList = [
#     '/N/project/rds/CADRE/dataset/mag/2021/combined/nodes',
#     '/N/project/rds/CADRE/dataset/mag/2021/combined/edges'
#     '/N/project/mag/wos_csv'
#    '/N/project/mag/wos_csv/tmp/'
    '/N/project/mag/wos_csv/fromPostgres'
]

delimiter = '\t'
# Same directory and same 'sc-<file>.log' naming as the column-count check,
# so running this cell overwrites that check's log for the same input file.
logDir = 'output'

# -1 selects the last field of each record.
colIdx = -1
# Fields of this many UTF-8 bytes or more are reported.
# NOTE(review): 32766 presumably mirrors the maximum indexed term length of
# the Lucene/Elasticsearch index backend -- TODO confirm.
sizeThreshold = 32766

for folder in folderList:
    sanityCheckColLength(folder, delimiter, colIdx, sizeThreshold, logDir)

Processing folder /N/project/mag/wos_csv/fromPostgres
Processing file sixth_export.tsv
Header:
wosid	isopenaccess	openaccesstype	abstract	fundingtext	citedreferencecount	full_address	reprintaddress	articlenumber	publicationyear	publicationdate	volume	issue	partnumber	supplement	specialissue	earlyaccessdate	startpage	endpage	numberofpages	publishercity	publisheraddress	publisher	keywordplus	conferencedate	conferencesponsor	conferencehost	conferencetitle	documenttype	rids	orcid	standardnames	authors	emailaddress	papertitle	journaltitle	journalabbrev	journaliso	issn	doi	eissn	isbn	pmid	conferencelocation	fundingorgs	lc_standard_names
# of columns: 46
Finished processing file sixth_export.tsv, 
# of invalid records: 2040/78395307
 ------- 604.5593001842499 seconds --------


In [59]:
%%time

# Driver cell: split each input file into '<name>_filtered.tsv' (kept rows)
# and '<name>_excluded.tsv' (rows whose selected column is too large).
# The commented-out entries are alternative input folders.
folderList = [
#     '/N/project/rds/CADRE/dataset/mag/2021/combined/nodes',
#     '/N/project/rds/CADRE/dataset/mag/2021/combined/edges'
#     '/N/project/mag/wos_csv'
#    '/N/project/mag/wos_csv/tmp/'
    '/N/project/mag/wos_csv/fromPostgres'
]

delimiter = '\t'
# Note: this is the same directory as the input folder above, so the
# generated files end up next to the inputs.
outDir = '/N/project/mag/wos_csv/fromPostgres'

# -1 selects the last field of each record.
colIdx = -1
# Records whose field is this many UTF-8 bytes or more are excluded.
# NOTE(review): 32766 presumably mirrors the maximum indexed term length of
# the Lucene/Elasticsearch index backend -- TODO confirm.
sizeThreshold = 32766

for folder in folderList:
    filterByLength(folder, delimiter, colIdx, sizeThreshold, outDir)

Processing folder /N/project/mag/wos_csv/fromPostgres
Processing file sixth_export.tsv
Header:
wosid	isopenaccess	openaccesstype	abstract	fundingtext	citedreferencecount	full_address	reprintaddress	articlenumber	publicationyear	publicationdate	volume	issue	partnumber	supplement	specialissue	earlyaccessdate	startpage	endpage	numberofpages	publishercity	publisheraddress	publisher	keywordplus	conferencedate	conferencesponsor	conferencehost	conferencetitle	documenttype	rids	orcid	standardnames	authors	emailaddress	papertitle	journaltitle	journalabbrev	journaliso	issn	doi	eissn	isbn	pmid	conferencelocation	fundingorgs	lc_standard_names
# of columns: 46
Finished processing file sixth_export.tsv, 
# of invalid records: 2040/78395307
 ------- 729.5439267158508 seconds --------
CPU times: user 8min 42s, sys: 3min 27s, total: 12min 10s
Wall time: 12min 11s


In [41]:
def sanityCheckQuotationMarks(folder, delimiter, logDir,
                              onlyFiles=('sixth_export.tsv',), maxInvalid=5):
    """Report records containing a field that starts or ends with ``"``.

    Each selected regular file directly inside ``folder`` is scanned line by
    line. The first line is the header. A later line is counted as invalid
    (once) if any of its fields starts or ends with a double quote. Scanning
    of a file stops as soon as ``maxInvalid`` invalid records were found.
    Findings and a summary go to stdout and to ``logDir/sc-<filename>.log``.

    Args:
        folder: Directory containing the delimited text files to check.
        delimiter: Field separator used to split each line.
        logDir: Existing directory that receives one log file per input file.
        onlyFiles: File names to process; all other files are skipped. The
            default keeps the historic behaviour of checking only
            ``sixth_export.tsv``. Pass ``None`` to check every file.
        maxInvalid: Stop scanning a file once this many invalid records were
            seen (default 5, the historic limit). Pass ``None`` to scan the
            whole file.
    """
    files = [f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))]

    print('Processing folder {}'.format(folder))

    for filename in files:

        if onlyFiles is not None and filename not in onlyFiles:
            continue

        logFile = os.path.join(logDir, 'sc-' + os.path.basename(filename) + '.log')
        with open(logFile, 'w') as logfp:

            with open(os.path.join(folder, filename), 'r') as fp:

                logfp.write('Processing file {}\n'.format(filename))
                print('Processing file {}'.format(filename))

                start = time.time()

                recordCnt = 0
                numInvalidRecords = 0
                headerSet = False

                for line in fp:

                    # Strip only the newline so trailing empty fields survive.
                    line = line.strip('\n')

                    if not headerSet:
                        logfp.write('Header:\n{}\n'.format(line))

                        numCols = len(line.split(delimiter))
                        logfp.write('# of columns: {}\n'.format(numCols))

                        headerSet = True

                    else:

                        recordCnt += 1

                        hasQuote = any(
                            field.startswith('"') or field.endswith('"')
                            for field in line.split(delimiter)
                        )
                        if hasQuote:
                            # recordCnt + 1 is the 1-based line number in the
                            # file (the header is line 1).
                            logfp.write('line # {} below, field(s) with unremoved quotation mark\n{}\n'.\
                              format(recordCnt + 1, line))
                            print('line # {} below, field(s) with unremoved quotation mark\n{}'.\
                              format(recordCnt + 1, line))
                            numInvalidRecords += 1

                    # A handful of examples is enough to diagnose the export.
                    if maxInvalid is not None and numInvalidRecords >= maxInvalid:
                        break

            elapsed = time.time() - start

            print('Finished processing file {}'.format(filename))
            print('# of invalid records: {}/{}'.format(numInvalidRecords, recordCnt))
            print(" ------- {} seconds --------".format(elapsed))

            logfp.write('Finished processing file {}\n'.format(filename))
            logfp.write('# of invalid records: {}/{}\n'.format(numInvalidRecords, recordCnt))
            logfp.write(" ------- {} seconds --------\n".format(elapsed))
                    
                    

In [42]:
# Driver cell: look for fields that still start or end with a double quote.
# The commented-out entries are alternative input folders.
folderList = [
#     '/N/project/rds/CADRE/dataset/mag/2021/combined/nodes',
#     '/N/project/rds/CADRE/dataset/mag/2021/combined/edges'
#     '/N/project/mag/wos_csv'
#     '/N/project/mag/wos_csv/tmp/'
    '/N/project/mag/wos_csv/fromPostgres'
]

delimiter = '\t'

# Same directory and same 'sc-<file>.log' naming as the other sanity checks,
# so running this cell overwrites their log for the same input file.
logDir = 'output'

for folder in folderList:
    sanityCheckQuotationMarks(folder, delimiter, logDir)

Processing folder /N/project/mag/wos_csv/fromPostgres
Processing file sixth_export.tsv
Finished processing file sixth_export.tsv
# of invalid records: 0/78395307
 ------- 1833.9017338752747 seconds --------
