-
Notifications
You must be signed in to change notification settings - Fork 0
/
prepareForGoogleScholar.py
105 lines (78 loc) · 3.82 KB
/
prepareForGoogleScholar.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import re
from glob import iglob
import os
import sys
import json
import math
# The GROBID result file is read from this location and then overwritten in
# place with the annotated data.
inputFilePath = ""
inputFileName = "grobid_output.json"

# Matches every character that is NOT an ASCII letter, digit, comma or space.
disallowedTitleChars = re.compile(r"[^0-9A-Za-z, ]")
# Matches a run of one or more commas so it can be reduced to one separator.
commaRun = re.compile(r",+")


def cleanTitle(rawTitle):
    """Convert a raw title into a comma-separated keyword string.

    Every character other than ASCII letters, digits, commas and spaces is
    dropped, each space becomes a comma, and any run of consecutive commas is
    collapsed to a single comma. A leading or trailing comma is kept as is.

    The run collapsing is done with a regular expression because a single
    ``replace(",,", ",")`` pass leaves ",," behind whenever three or more
    separators end up adjacent (for example "A - - B").
    """
    keptChars = disallowedTitleChars.sub("", rawTitle)
    return commaRun.sub(",", keptChars.replace(" ", ","))


def findRawTitle(entry):
    """Return the unprocessed title text of one entry, or None if it has none.

    The title is looked up under
    TEI -> teiHeader -> fileDesc -> sourceDesc -> biblStruct -> analytic.
    An "analytic" value of "" or one without a "title" key counts as "no
    title". When "title" is a list, only its first element is used.

    A title element without a "content" key (or an empty title list) raises
    KeyError/IndexError, exactly as the unstructured version of this script
    did.
    """
    analytic = entry["TEI"]["teiHeader"]["fileDesc"]["sourceDesc"]["biblStruct"]["analytic"]
    if analytic == "" or not analytic.get("title", 0) != 0:
        return None
    title = analytic["title"]
    if isinstance(title, list):
        title = title[0]
    return title["content"]


def annotateTitles(teiData):
    """Store an "extractedTitle" string on every entry of teiData.

    Each element of teiData["TEIAndRelatedData"] is modified in place: entries
    with a title get the cleaned keyword string, entries without one get "".

    Returns the number of entries for which no title was found.
    """
    emptyTitle = 0
    for entry in teiData["TEIAndRelatedData"]:
        rawTitle = findRawTitle(entry)
        if rawTitle is None:
            emptyTitle += 1
            entry["extractedTitle"] = ""
        else:
            entry["extractedTitle"] = cleanTitle(rawTitle)
    return emptyTitle


def main():
    """Annotate the GROBID JSON file with cleaned titles and rewrite it.

    Prints the number of entries without a title.

    NOTE: an author-name based lookup used to be sketched here as
    commented-out code; only titles are extracted.
    """
    path = inputFilePath + inputFileName
    with open(path, 'r') as inputFile:
        teiData = json.loads(inputFile.read())

    emptyTitle = annotateTitles(teiData)
    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print(emptyTitle)

    # Serialise before opening the file for writing so that a serialisation
    # error cannot leave the input file truncated.
    serialised = json.dumps(teiData, sort_keys=True)
    with open(path, 'w') as outputFile:
        outputFile.write(serialised)


if __name__ == "__main__":
    main()