# Create a corrections table from the spreadsheet

## Import modules

In [1]:
import pandas as pd

## Read the spreadsheet

In [2]:
# Load the keyword spreadsheet; the path is relative to the notebook's
# working directory.
df = pd.read_csv('zotero_keywords.csv')

# Bare expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,Cleaned_Version (new),Tag (existent),Other variants (= other existent tags),German variants (already existent),Count,German translation,Broader concept,Susanne Cat 1,Susanne Cat 2,Susanne Cat 3,...,new_2,cat_2,new_3,cat_3,new_4,cat_4,new_5,cat_5,new_6,cat_6
0,(semi-)automated generated,(teil-)automatisch generiert,,(teil-)automatisch generiert,4.0,(teil-)automatisch generiert,,,,,...,,,,,,,,,,
1,,#nosource,,,4.0,,,object-type,resource/tool,topic,...,,,,,,,,,,
2,medieval england (1066-1485),1066-1485,,,1.0,1066-1485,,time period,,,...,,,,,,,,,,
3,,1922,,,1.0,1922,,time period,,,...,,,,,,,,,,
4,,2001: a space odyssey (film),,,1.0,,,entity (product),topic,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1438,,x-check,,,58.0,,,,,,...,,,,,,,,,,
1439,extensible markup language (xml),xml,xml schema,,16.0,XML,,format/standard,,,...,,,,,,,,,,
1440,year in dh 2008,yearindh2008,,,1.0,DH-Jahr 2008,,,,,...,,,,,,,,,,
1441,,youth,,,1.0,Jugend,,,,,...,,,,,,,,,,


## Get list of original tags according to spreadsheet

In [3]:
# Start with the main tags: column 1, "Tag (existent)".
original_tags = df.iloc[:, 1].dropna().tolist()

# Column 2 ("Other variants (= other existent tags)") and column 3
# ("German variants (already existent)") may hold several tags per cell,
# separated by "; " -- add each of them individually, column by column.
for variant_column in (2, 3):
    for cell in df.iloc[:, variant_column].dropna():
        original_tags += cell.split('; ')

print(len(original_tags))

1622


## Transform into keys for a dictionary

In [4]:
# One entry per distinct original tag, initially without any correction
# (None); duplicates in original_tags collapse onto a single key.
corrections = dict.fromkeys(original_tags)
print(len(corrections))

1589


## Adding correction steps described in spreadsheet

In [5]:
# Map every tag listed under "German variants (already existent)" (column 3)
# to the main tag of the same row (column 1).
for _, row in df.iterrows():
    # Cells are read by position through .iloc: plain row[3] on a Series
    # with string labels relies on a deprecated positional fallback that
    # newer pandas versions treat as a label lookup instead.
    german_variants = row.iloc[3]
    if pd.isna(german_variants):
        continue
    for key in german_variants.split("; "):
        if corrections[key] is None:
            corrections[key] = row.iloc[1]
        else:
            # The first correction wins; the clash is only reported.
            print(f"Correction already present for tag: {key}")

In [6]:
# Map every tag listed under "Other variants (= other existent tags)"
# (column 2) to the main tag of the same row (column 1). A variant that
# already points somewhere else gets the new target added as a further step.
for _, row in df.iterrows():
    # Positional access via .iloc; plain row[2] on a string-labelled Series
    # relies on a deprecated positional fallback.
    other_variants = row.iloc[2]
    if pd.isna(other_variants):
        continue
    main_tag = row.iloc[1]
    for key in other_variants.split("; "):
        current = corrections[key]
        if current is None:
            corrections[key] = main_tag
        elif isinstance(current, list):
            # The key already has several steps: extend the chain instead of
            # wrapping the existing list in another list.
            if current[-1] != main_tag:
                print(f'Adding step for tag "{key}":')
                current.append(main_tag)
                print("    " + str(current))
        elif current != main_tag:
            print(f'Adding step for tag "{key}":')
            corrections[key] = [current, main_tag]
            print("    " + str(corrections[key]))

Adding step for tag "scholars":
    ['academic', 'academics']
Adding step for tag "the arts":
    ['art / general', 'arts']
Adding step for tag "computers / general":
    ['computer', 'computers']
Adding step for tag "databases":
    ['data bases', 'database']
Adding step for tag "digitized":
    ['digitisation', 'digitize']
Adding step for tag "higher ed":
    ['education, higher', 'higher education']
Adding step for tag "handschriften. epigrafie. paleografie":
    ['manuscripts', 'palaeography']


In [7]:
# For every row that has a cleaned version (column 0), record it as a
# correction step for the row's main tag (column 1).
for _, row in df.iterrows():
    # Positional access via .iloc; plain row[1] on a string-labelled Series
    # relies on a deprecated positional fallback.
    key = row.iloc[1]
    if pd.isna(key):
        print("Value is a NaN!")
        continue

    cleaned = row.iloc[0]
    if pd.isna(cleaned):
        continue

    current = corrections[key]
    if current is None:
        corrections[key] = cleaned
    elif isinstance(current, list):
        # Only extend an existing chain when the cleaned form is not
        # already its final step.
        if current[-1] != cleaned:
            print(f'Adding step to tag "{key}":')
            current.append(cleaned)
            print("    ", str(current))
    else:
        # A single earlier correction becomes a two-step chain.
        corrections[key] = [current, cleaned]

Adding step to tag "handschriften. epigrafie. paleografie":
     ['manuscripts', 'palaeography', 'manuscripts. epigraphy. palaeography']


In [8]:
# Propagate cleaned versions along existing chains: whenever a correction
# currently ends at a row's main tag (column 1) and that row has a cleaned
# version (column 0), add the cleaned version as a further step.
for _, row in df.iterrows():
    # Positional access via .iloc; plain row[0] on a string-labelled Series
    # relies on a deprecated positional fallback.
    cleaned = row.iloc[0]
    if pd.isna(cleaned):
        continue
    tag = row.iloc[1]

    # Only values of existing keys are replaced, so the dict keeps its size
    # while it is being iterated.
    for key in corrections:
        current = corrections[key]
        if isinstance(current, list):
            if current[-1] == tag:
                current.append(cleaned)
                print(current)
        elif current == tag:
            corrections[key] = [current, cleaned]
            print(corrections[key])

['550 geowissenschaften', 'geosciences']
['academic', 'academics', 'academic']
['analyze qualitatively', 'analysing qualitatively']
['annotation', 'annotating']
['annotation', 'annotating']
['art / general', 'art']
['artefacts', 'artifact']
['art / general', 'arts', 'art']
['collaboration', 'collaborating']
['collaboration', 'collaborating']
['collaboration', 'collaborating']
['collocation', 'collocating']
['comics', 'comic']
['communication', 'communicating']
['communication', 'communicating']
['communication', 'communicating']
['computational linguistic', 'computational linguistics']
['computational linguistic', 'computational linguistics']
['computer', 'computers', 'computer']
['conceptualizing', 'conceptualising']
['creation', 'creating']
['cv', 'curriculum vitae (cv)']
['data bases', 'database']
['digital editions', 'digital scholarly edition']
['digital editions', 'digital scholarly edition']
['digital libraries', 'digital library']
['digital media', 'new media']
['digitisation',

In [9]:
# Rows with a value in column 16 get "❌ DELETE" as their final step.
# NOTE(review): column 16 is presumably the spreadsheet's delete flag --
# confirm against the CSV header.
for _, row in df.iterrows():
    # Positional access via .iloc; plain row[16] on a string-labelled Series
    # relies on a deprecated positional fallback.
    if pd.isna(row.iloc[16]):
        continue

    key = row.iloc[1]
    current = corrections[key]
    if current is None:
        corrections[key] = "❌ DELETE"
        continue

    # The tag already has corrections: add the marker to the end of the chain.
    print(f'Adding step for tag {key}:')
    if isinstance(current, list):
        current.append("❌ DELETE")
    else:
        corrections[key] = [current, "❌ DELETE"]
    print("    ", str(corrections[key]))

Adding step for tag (teil-)automatisch generiert:
     ['(teil-)automatisch generiert', '(semi-)automated generated', '❌ DELETE']
Adding step for tag ausgedruckt:
     ['printed', '❌ DELETE']
Adding step for tag ayers, edward l.:
     ['edward l. ayers', '❌ DELETE']
Adding step for tag berlin (hu):
     ['hu berlin', '❌ DELETE']
Adding step for tag berlin (tu):
     ['tu berlin', '❌ DELETE']
Adding step for tag berners-lee, tim:
     ['tim berners-lee', '❌ DELETE']
Adding step for tag brand, stewart:
     ['stewart brand', '❌ DELETE']
Adding step for tag broch, hermann:
     ['hermann broch', '❌ DELETE']
Adding step for tag bush, vannevar:
     ['vannevar bush', '❌ DELETE']
Adding step for tag california (southern):
     ['southern california', '❌ DELETE']
Adding step for tag chaucer, geoffrey:
     ['geoffrey chaucer', '❌ DELETE']
Adding step for tag clinton, bill:
     ['bill clinton', '❌ DELETE']
Adding step for tag crestani, marcus:
     ['marcus crestani', '❌ DELETE']
Adding step 

In [10]:
# Rows with a value in column 17 get "❌ DELETE_TEXTUAL" as their final step.
# NOTE(review): column 17 is presumably the spreadsheet's textual-delete
# flag -- confirm against the CSV header.
for _, row in df.iterrows():
    # Positional access via .iloc; plain row[17] on a string-labelled Series
    # relies on a deprecated positional fallback.
    if pd.isna(row.iloc[17]):
        continue

    key = row.iloc[1]
    current = corrections[key]
    if current is None:
        corrections[key] = "❌ DELETE_TEXTUAL"
        continue

    # The tag already has corrections: add the marker to the end of the chain.
    print(f'Adding step for tag {key}:')
    if isinstance(current, list):
        current.append("❌ DELETE_TEXTUAL")
    else:
        corrections[key] = [current, "❌ DELETE_TEXTUAL"]
    print("    ", str(corrections[key]))

Adding step for tag bellissimo:
     ['beautiful', '❌ DELETE_TEXTUAL']
Adding step for tag curiosité_perso:
     ['personal curiosity', '❌ DELETE_TEXTUAL']
Adding step for tag cv:
     ['curriculum vitae (cv)', '❌ DELETE_TEXTUAL']
Adding step for tag documents:
     ['document', '❌ DELETE_TEXTUAL']
Adding step for tag drafts:
     ['draft', '❌ DELETE_TEXTUAL']
Adding step for tag examples:
     ['example', '❌ DELETE_TEXTUAL']
Adding step for tag generalities:
     ['generality', '❌ DELETE_TEXTUAL']
Adding step for tag hss:
     ['hochschulschrift', '❌ DELETE_TEXTUAL']
Adding step for tag ideas:
     ['idea', '❌ DELETE_TEXTUAL']
Adding step for tag is teams:
     ['is team', '❌ DELETE_TEXTUAL']
Adding step for tag large lexicon fields:
     ['large lexicon field', '❌ DELETE_TEXTUAL']
Adding step for tag logiciel txm:
     ['tmx software', '❌ DELETE_TEXTUAL']
Adding step for tag nhsmukomuc:
     ['#nhsmukomuc', '❌ DELETE_TEXTUAL']
Adding step for tag nouvelles technologies:
     ['new te

In [11]:
# Rows with a value in column 18 are split into several tags: the chain gets
# a "✂️ SPLIT" marker followed by the list of replacement tags.
# NOTE(review): column 18 is presumably the split flag and columns
# 19, 21, ..., 29 the new_1 ... new_6 tags -- confirm against the CSV header.
indices = [19, 21, 23, 25, 27, 29]

for _, row in df.iterrows():
    # Positional access via .iloc; plain row[18] on a string-labelled Series
    # relies on a deprecated positional fallback.
    if pd.isna(row.iloc[18]):
        continue

    key = row.iloc[1]
    current = corrections[key]
    if current is None:
        corrections[key] = ["✂️ SPLIT"]
        print("No problem!")
    else:
        print(f'Adding step for tag {key}:')
        if isinstance(current, list):
            current.append("✂️ SPLIT")
        else:
            corrections[key] = [current, "✂️ SPLIT"]
        print("    ", str(corrections[key]))

    # Collect the replacement tags that are actually filled in for this row.
    split_tags = [row.iloc[i] for i in indices if not pd.isna(row.iloc[i])]
    print("    ", str(split_tags))
    # The chain is a list in every branch above, so the replacement tags are
    # stored as one nested list at its end.
    corrections[key].append(split_tags)

Adding step for tag administration technological innovations:
     ['administration technological innovation', '✂️ SPLIT']
     ['administration', 'technological innovation']
No problem!
     ['algorithm design', 'algorithm analysis']
No problem!
     ['annotation system', 'system design', 'annotation design']
Adding step for tag applications and critiques:
     ['application and critique', '✂️ SPLIT']
     ['application', 'critique']
No problem!
     ['autocad', 'autolisp']
No problem!
     ['automating', 'data collection']
No problem!
     ['automating', 'linguistic analysis']
No problem!
     ['bibliography theory', 'bibliography methodology']
No problem!
     ['book', 'literature']
No problem!
     ['book', 'art']
No problem!
     ['brute force method', 'brute force']
No problem!
     ['business', 'economics', 'commerce']
No problem!
     ['business', 'economics', 'statistics']
No problem!
     ['language change', 'change over time', 'linguistic change']
No problem!
     ['characte

In [12]:
# Flatten the corrections mapping into one row per tag:
# [original tag, step 1, step 2, ...]. Tags without a correction yield a
# one-element row.
df_list = []

for original, steps in corrections.items():
    if steps is None:
        tag_row = [original]
    elif isinstance(steps, list):
        tag_row = [original, *steps]
    else:
        tag_row = [original, steps]
    df_list.append(tag_row)
    print(tag_row)

['(teil-)automatisch generiert', '(teil-)automatisch generiert', '(semi-)automated generated', '❌ DELETE']
['#nosource', '❌ DELETE']
['1066-1485', 'medieval england (1066-1485)']
['1922', '❌ DELETE_TEXTUAL']
['2001: a space odyssey (film)', '❌ DELETE']
['2011', '❌ DELETE_TEXTUAL']
['3d']
['550 geowissenschaften', 'geosciences']
['abe', '❌ DELETE']
['academia']
['academic']
['academics', 'academic', 'academic']
['acceso', 'accessing']
['active learning']
['activism']
['adafruit', '❌ DELETE']
['adam', '❌ DELETE']
['address book']
['administration']
['administration technological innovations', 'administration technological innovation', '✂️ SPLIT', ['administration', 'technological innovation']]
['advocating']
['africa', '❌ DELETE']
['african american']
['age']
['agenda', '❌ DELETE_TEXTUAL']
['agora', '❌ DELETE']
['agriculture']
['ailion', '❌ DELETE']
['aims news', '❌ DELETE_TEXTUAL']
['àlex', '❌ DELETE']
['algorithm design and analysis', '✂️ SPLIT', ['algorithm design', 'algorithm analysi

## Create dataframe with corrections

In [13]:
# Length of the longest row in df_list.
max_length = max(map(len, df_list))
print(max_length)

6


In [14]:
# First column holds the original tag, followed by step_1 ... step_(max_length - 1).
columns = ["original"] + [f"step_{i}" for i in range(1, max_length)]
print(columns)

['original', 'step_1', 'step_2', 'step_3', 'step_4', 'step_5']


In [15]:
# Build the corrections table from the per-tag rows in df_list, using the
# column names prepared in `columns`.
table = pd.DataFrame(df_list, columns=columns)
# Bare expression so the notebook renders the table as the cell output.
table

Unnamed: 0,original,step_1,step_2,step_3,step_4,step_5
0,(teil-)automatisch generiert,(teil-)automatisch generiert,(semi-)automated generated,❌ DELETE,,
1,#nosource,❌ DELETE,,,,
2,1066-1485,medieval england (1066-1485),,,,
3,1922,❌ DELETE_TEXTUAL,,,,
4,2001: a space odyssey (film),❌ DELETE,,,,
...,...,...,...,...,...,...
1584,philosophie,philosophy,,,,
1585,wissenschaftliche gemeinschaft,scientific community,,,,
1586,wissenschaftsgemeinschaft,scientific community,,,,
1587,wissenschaftliche fachgemeinschaft,scientific community,,,,


## Export to CSV

In [16]:
# Write the corrections table as UTF-8 CSV without the row index.
# NOTE(review): rows with a split step carry a Python list in one cell;
# presumably it is written as its string representation -- confirm that
# downstream readers of corrections.csv expect that.
table.to_csv('corrections.csv', encoding='utf-8', index=False)

## Create shorter version

In [17]:
# Collapse every correction chain to its end point: original tag -> final
# value, with False for tags whose chain ends in a delete marker.
final_corrections = {}

for _, steps in table.iterrows():
    # The last non-null cell of the row is where the chain ends; for a tag
    # without corrections that is the original tag itself.
    outcome = steps.loc[steps.last_valid_index()]
    is_deleted = outcome == "❌ DELETE" or outcome == "❌ DELETE_TEXTUAL"
    final_corrections[steps.iloc[0]] = False if is_deleted else outcome

In [18]:
import json

# Serialize the condensed mapping and write it to disk in one go.
with open('final_corrections.json', 'w') as out_handle:
    out_handle.write(json.dumps(final_corrections))