In [25]:
# Switch: when True, the combined publications are written to CSV further down.
STORE_PUB = True

# Location identifiers; each one is substituted into the per-location CSV file names.
locations = [
    'ravensburg',
    'mannheim',
    'heidenheim',
    'karlsruhe',
    'campus-horb',
    'stuttgart',
    'heilbronn',
    'loerrach',
    'mosbach',
    'villingen-schwenningen',
]

In [26]:
import math
import re
from collections import Counter

import datetime
import pandas as pd
import ast

In [27]:
# Year of the local current date; used as the name of the data sub-folder to read from and write to.
current_year = datetime.datetime.now().year

In [28]:
# functions to get similarity of strings using cosine similarity

def get_cosine(vec1, vec2):
    """Return the cosine similarity of two term -> weight mappings.

    The dot product is taken over the keys present in both mappings and is
    divided by the product of the two Euclidean norms. When that product is
    zero (for example, an empty mapping) the result is 0.0.
    """
    shared_terms = set(vec1.keys()) & set(vec2.keys())
    dot_product = sum(vec1[term] * vec2[term] for term in shared_terms)

    norm1 = math.sqrt(sum(weight ** 2 for weight in vec1.values()))
    norm2 = math.sqrt(sum(weight ** 2 for weight in vec2.values()))
    norm_product = norm1 * norm2

    # Guard against division by zero before normalising.
    if norm_product:
        return float(dot_product) / norm_product
    return 0.0


# Tokenizer for text_to_vector: maximal runs of word characters.
# NOTE(review): WORD was used but not defined anywhere in the visible notebook, so
# this cell raised NameError on a fresh kernel. `\w+` is assumed to be the intended
# pattern -- confirm against the original analysis.
WORD = re.compile(r"\w+")


def text_to_vector(text):
    """Convert a text into a bag-of-words count vector.

    Args:
        text: The string to tokenize.

    Returns:
        A ``Counter`` mapping each token matched by ``WORD`` to the number of
        times it occurs in ``text``.
    """
    return Counter(WORD.findall(text))

In [29]:
def is_similar(text1, text2, threshold=0.6):
    """Decide whether two texts are similar enough to count as the same.

    Both values are converted with ``str`` and lower-cased, turned into
    word-count vectors with ``text_to_vector`` and compared with ``get_cosine``.

    Args:
        text1: First text.
        text2: Second text.
        threshold: Cosine similarity that has to be strictly exceeded.
            Defaults to 0.6, the value that was previously hard-coded.

    Returns:
        True when the cosine similarity is greater than ``threshold``,
        otherwise False. A missing value (``None`` or NaN) on either side
        always yields False.
    """
    for value in (text1, text2):
        # str(None) / str(nan) would become the tokens "none" / "nan", which
        # makes two missing values look identical -- never treat them as a match.
        if value is None or (isinstance(value, float) and math.isnan(value)):
            return False

    vector1 = text_to_vector(str(text1).lower())
    vector2 = text_to_vector(str(text2).lower())

    return get_cosine(vector1, vector2) > threshold

In [30]:
def get_not_none_string(string1, string2):
    """Pick the more informative of two optional strings.

    Returns the only non-empty argument when exactly one of them is non-empty,
    the longer one when both are (``string1`` wins a tie), and ``None`` when
    neither is.
    """
    has_first = bool(string1)
    has_second = bool(string2)

    if has_first and has_second:
        # Strict comparison keeps string1 on equal lengths.
        return string2 if len(string2) > len(string1) else string1
    if has_first:
        return string1
    if has_second:
        return string2
    return None

In [31]:
def _is_missing(value):
    """Return True for the missing-value markers None and float NaN."""
    return value is None or (isinstance(value, float) and math.isnan(value))


def _text_or_none(value):
    """Return ``str(value)``, or None when the value is missing."""
    return None if _is_missing(value) else str(value)


def _parse_authors(raw):
    """Turn a stored author list into a list of stripped names ([] when missing)."""
    if _is_missing(raw):
        return []
    # The list is stored as its string representation; an actual list is accepted as well.
    names = ast.literal_eval(raw) if isinstance(raw, str) else raw
    return [name.strip() for name in names]


def combine_publication(row_gs, row_rg):
    """Merge two rows that describe the same publication into one record.

    Args:
        row_gs: Row from the ``_gs`` dataset (presumably Google Scholar -- confirm).
            Read keys: AUTHOR_NAME, PUB_TITLE, PUB_YEAR, PUB_PUBLISHER,
            PUB_AUTHORS, PUB_CITATIONS.
        row_rg: Row from the ``_rg`` dataset (presumably ResearchGate -- confirm).
            Read keys: PUB_TITLE, PUB_YEAR, PUB_PUBLISHER, PUB_AUTHORS,
            PUB_TYPE, PUB_ISBN, PUB_DOI.

    Returns:
        A dict with the combined fields. Title and publisher are chosen with
        ``get_not_none_string``; the year is the larger non-missing year; the
        author lists are concatenated and de-duplicated in first-seen order.
    """
    # Missing cells must reach get_not_none_string as None: str(nan) is the
    # truthy string "nan" and could otherwise win over a real value.
    title = get_not_none_string(_text_or_none(row_gs['PUB_TITLE']),
                                _text_or_none(row_rg['PUB_TITLE']))

    # title sometimes contains information about type
    if title is not None and "[BOOK]" in title:
        pub_type = "book"
    else:
        pub_type = row_rg['PUB_TYPE']

    if title is not None:
        # Drop bracketed upper-case tags such as "[BOOK]" and the whitespace they leave behind.
        title = re.sub(r"\[[A-Z ]+\]", "", title).strip()

    # max() with a NaN operand depends on argument order, so compare only real values.
    known_years = [y for y in (row_gs['PUB_YEAR'], row_rg['PUB_YEAR']) if not _is_missing(y)]
    year = max(known_years) if known_years else row_gs['PUB_YEAR']

    publisher = get_not_none_string(_text_or_none(row_gs['PUB_PUBLISHER']),
                                    _text_or_none(row_rg['PUB_PUBLISHER']))

    # dict.fromkeys removes duplicates while keeping a stable, reproducible order.
    authors = list(dict.fromkeys(_parse_authors(row_gs["PUB_AUTHORS"])
                                 + _parse_authors(row_rg["PUB_AUTHORS"])))

    return {
        'AUTHOR_NAME': row_gs["AUTHOR_NAME"],
        'PUB_TITLE': title,
        'PUB_YEAR': year,
        'PUB_AUTHORS': authors,
        'PUB_TYPE': pub_type,
        'PUB_PUBLISHER': publisher,
        'PUB_CITATIONS': row_gs["PUB_CITATIONS"],
        'PUB_ISBN': row_rg['PUB_ISBN'],
        'PUB_DOI': row_rg['PUB_DOI'],
    }

In [32]:
# Columns placed first in the combined output, in this order.
RESULT_COLUMNS = ['AUTHOR_NAME', 'PUB_TITLE', 'PUB_YEAR',
                  'PUB_AUTHORS', 'PUB_TYPE', 'PUB_PUBLISHER',
                  'PUB_ISBN', 'PUB_DOI', 'PUB_CITATIONS']


def merge_publications(publications_gs, publications_rg):
    """Combine the ``_gs`` and ``_rg`` publication tables of one location.

    For every author, each GS row is compared with that author's RG rows via
    ``is_similar`` on the title. Every matching pair is merged with
    ``combine_publication``; GS rows without a match and RG rows that matched
    nothing are carried over unchanged.

    Args:
        publications_gs: DataFrame with an AUTHOR_NAME and a PUB_TITLE column.
        publications_rg: DataFrame with an AUTHOR_NAME and a PUB_TITLE column.

    Returns:
        The combined DataFrame. If one input is empty, the other input is
        returned as is.
    """
    if publications_gs.empty:
        return publications_rg
    if publications_rg.empty:
        return publications_gs

    # Unique author names in first-seen order, so the output order is reproducible.
    author_names = pd.concat([publications_gs['AUTHOR_NAME'],
                              publications_rg['AUTHOR_NAME']]).dropna().unique()

    # Rows are collected in a list and turned into a frame once at the end;
    # DataFrame.append no longer exists in pandas 2.x and was quadratic in a loop.
    records = []

    for author_name in author_names:
        author_gs = publications_gs[publications_gs['AUTHOR_NAME'] == author_name]
        author_rg = publications_rg[publications_rg['AUTHOR_NAME'] == author_name]

        matched_rg = set()

        for _, row_gs in author_gs.iterrows():
            match_found = False

            for index_rg, row_rg in author_rg.iterrows():
                # Publications are considered identical based on the title only.
                if is_similar(row_rg["PUB_TITLE"], row_gs["PUB_TITLE"]):
                    match_found = True
                    records.append(combine_publication(row_gs, row_rg))
                    matched_rg.add(index_rg)

            if not match_found:
                records.append(row_gs.to_dict())

        # Append the RG rows that matched nothing once all GS rows of this author
        # are processed. Doing it here (instead of comparing an index label with
        # the subset length inside the loop) also covers authors without GS rows.
        leftover_rg = author_rg.loc[~author_rg.index.isin(matched_rg)]
        records.extend(leftover_rg.to_dict("records"))

    # dtype=object keeps the cell values exactly as collected (no int -> float upcast).
    combined = pd.DataFrame(records, dtype=object)
    extra_columns = [col for col in combined.columns if col not in RESULT_COLUMNS]
    return combined.reindex(columns=RESULT_COLUMNS + extra_columns)


for location in locations:

    publications_gs = pd.read_csv(f'../data/{current_year}/publications_{location}_gs.csv')
    publications_rg = pd.read_csv(f'../data/{current_year}/publications_{location}_rg.csv')

    pd_result = merge_publications(publications_gs, publications_rg)

    if STORE_PUB:
        pd_result.to_csv(f'../data/{current_year}/publications_{location}.csv', index=False)
        print(f"{location} {current_year}: Successfully combined publications.")

ravensburg 2022: Successfully combined publications.
mannheim 2022: Successfully combined publications.
heidenheim 2022: Successfully combined publications.
karlsruhe 2022: Successfully combined publications.
campus-horb 2022: Successfully combined publications.
stuttgart 2022: Successfully combined publications.
heilbronn 2022: Successfully combined publications.
loerrach 2022: Successfully combined publications.
mosbach 2022: Successfully combined publications.
villingen-schwenningen 2022: Successfully combined publications.
