-
Notifications
You must be signed in to change notification settings - Fork 0
/
newcombine.py
67 lines (58 loc) · 2.19 KB
/
newcombine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from datetime import datetime
import ast
import pandas as pd
# CSV files to read, one after another, by the processing loop below.
fileList = [
    'convertedMatchesReviewed_BatchC_2020-03-19.csv',
]
def addType(vocabType):
    """File the current row under the key that matches its vocabulary type.

    This function works on module-level state instead of arguments: it
    updates the global ``subjectDict`` and appends it to the global
    ``subject_list`` or ``error_list``. The caller must bind ``subjectDict``
    to the row being processed before calling.

    ``'fast'`` sets ``newKey`` to ``dc.subject.fast`` and ``'mesh'`` sets it
    to ``dc.subject.mesh``; in both cases the row goes to ``subject_list``.
    Any other value leaves ``newKey`` unset and sends the row to
    ``error_list``.
    """
    # Guard clause: anything that is not a known vocabulary is an error row.
    if vocabType not in ('fast', 'mesh'):
        error_list.append(subjectDict)
        return
    subjectDict['newKey'] = (
        'dc.subject.fast' if vocabType == 'fast' else 'dc.subject.mesh'
    )
    subject_list.append(subjectDict)
# Rows that addType() could not classify (unknown vocabulary type).
error_list = []
# Rows that received a 'newKey'/'newValue' pair and will be written out.
subject_list = []

for filename in fileList:
    df_subjects = pd.read_csv(filename, header=0)
    ori_total = df_subjects.search_subject.size
    print(ori_total)
    for index, row in df_subjects.iterrows():
        # Progress indicator: rows remaining in the current file.
        print(str(ori_total-index)+' left')
        row = dict(row)
        # addType() reads the module-level name 'subjectDict', so it has to
        # be rebound to the current row before addType() is called.
        subjectDict = row.copy()
        subjectDict['oldKey'] = 'dc.subject'
        subjectDict['oldValue'] = row['old_subject']
        search_subject = row['search_subject']
        vocabType = row['type']
        results = row['results']
        if isinstance(results, float) or results == 'none':
            # No match: keep the searched term under the generic key.
            # NOTE(review): the float check presumably catches empty cells,
            # which pandas reads as NaN -- confirm against the input files.
            print(results)
            subjectDict['newKey'] = 'dc.subject'
            subjectDict['newValue'] = search_subject
            subject_list.append(subjectDict)
        else:
            # The original condition here was `'[' or '|' in results`, which
            # is always true because the non-empty string '[' is truthy. That
            # made the single-value branch unreachable. Choosing the parser
            # explicitly covers all three shapes:
            #   "['a', 'b']" -> list literal
            #   "a|b"        -> pipe-delimited
            #   "a"          -> single value (split gives a 1-item list)
            if '[' in results:
                parts = ast.literal_eval(results)
            else:
                parts = results.split('|')
            subjectDict['newValue'] = '|'.join(part.strip() for part in parts)
            addType(vocabType)

print('{} errors found'.format(len(error_list)))

# Timestamp the output names so that repeated runs do not overwrite each other.
dt = datetime.now().strftime('%Y-%m-%d %H.%M.%S')
df = pd.DataFrame.from_dict(subject_list)
df2 = pd.DataFrame.from_dict(error_list)
subjectFile = 'subjectsCombined_'+dt+'.csv'
errorFile = 'errors_Batch_'+dt+'.csv'
df.to_csv(subjectFile, index=False)
df2.to_csv(errorFile, index=False)