In [15]:
# change to store dataset
STORE_PUB = True

# change for different location

# locations: ravensburg, mannheim, heidenheim, karlsruhe, campus-horb, stuttgart
# locations: heilbronn, loerrach, mosbach, villingen-schwenningen

location = "karlsruhe"

In [16]:
import math
import re
from collections import Counter

import datetime
import pandas as pd
import ast

In [17]:
WORD = re.compile(r"\w+")

# get current year to save in applicable folder
current_year = datetime.date.today().year

In [18]:
### data import ###

J_RATINGS_SCIMAGO = pd.read_csv(f'../data/{current_year}/journal_ratings_scimago.csv', sep=";")
J_RATINGS_VHB = pd.read_csv(f'../data/{current_year}/journal_ratings_vhb.csv')

publications_gs = pd.read_csv(f'../data/{current_year}/publications_{location}_gs.csv')
publications_rg = pd.read_csv(f'../data/{current_year}/publications_{location}_rg.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [19]:
# functions to get similarity of strings using cosine similarity

def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    sum1 = sum([vec1[x] ** 2 for x in list(vec1.keys())])
    sum2 = sum([vec2[x] ** 2 for x in list(vec2.keys())])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)


    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator


def text_to_vector(text):
    words = WORD.findall(text)
    return Counter(words)

In [20]:
def is_similar(text1, text2):
    
    vector1 = text_to_vector(str(text1).lower())
    vector2 = text_to_vector(str(text2).lower())

    cosine = get_cosine(vector1, vector2)
    
    if cosine > 0.6:
        similar = True
    else:
        similar = False
        
    return similar

In [21]:
def get_not_none_string(string1, string2):
    
    if string1 and not string2:
        return string1
    
    elif string2 and not string1:
        return string2
    
    elif string1 and string2:
        return max([string1, string2], key=len)
    
    else:
        return None

In [22]:
def get_ratings(publisher):
    
    publisher = str(publisher)

    if publisher == "nan":
        rating_vhb = None
        issn_vhb = None
        rating_sci = None
        issn_sci = None
        
    else:
        
        rating_database = J_RATINGS_VHB
        if publisher in rating_database['JOURNAL']:
            rating_vhb = rating_database.loc[rating_database['JOURNAL'] == publisher, 'JQ3'].to_numpy()[0]
            issn_vhb = rating_database.loc[rating_database['JOURNAL'] == publisher, 'ISSN'].to_numpy()[0]

        elif index_similar := [i for i, x in enumerate(rating_database['JOURNAL']) if is_similar(publisher,x)]:
            rating_vhb = rating_database.loc[index_similar[0], 'JQ3']
            issn_vhb = rating_database.loc[index_similar[0], 'ISSN']

        else:
            rating_vhb = None
            issn_vhb = None


        rating_database = J_RATINGS_SCIMAGO
        if publisher in rating_database['Title']:
            rating_sci = rating_database.loc[rating_database['Title'] == publisher, 'H index'].to_numpy()[0]
            issn_sci = rating_database.loc[rating_database['Title'] == publisher, 'Issn'].to_numpy()[0]

        elif index_similar := [i for i, x in enumerate(rating_database['Title']) if is_similar(publisher,x)]:
            rating_sci = rating_database.loc[index_similar[0], 'H index']
            issn_sci = rating_database.loc[index_similar[0], 'Issn']

        else:
            rating_sci = None
            issn_sci = None

        
    return rating_vhb, rating_sci

In [23]:
def combine_publication(row_gs, row_rg):
    
    title = get_not_none_string(str(row_gs['PUB_TITLE']), str(row_rg['PUB_TITLE']))
    year = max([row_gs['PUB_YEAR'], row_rg['PUB_YEAR']])
    publisher = get_not_none_string(str(row_gs['PUB_PUBLISHER']), str(row_rg['PUB_PUBLISHER']))
    
    authors_gs = [n.strip() for n in ast.literal_eval(row_gs["PUB_AUTHORS"])]
    authors_rg = [n.strip() for n in ast.literal_eval(row_rg["PUB_AUTHORS"])]
    authors = list(set(authors_gs + authors_rg))
    
    data = {
        'AUTHOR_NAME': row_gs["AUTHOR_NAME"],
        'PUB_TITLE': title, 
        'PUB_YEAR': year, 
        'PUB_AUTHORS': authors,  
        'PUB_TYPE': row_rg['PUB_TYPE'],
        'PUB_PUBLISHER': publisher,
        'PUB_CITATIONS': row_gs["PUB_CITATIONS"],
        'PUB_ISBN': row_rg['PUB_ISBN'],
        'PUB_DOI': row_rg['PUB_DOI']
       }

    return data

In [24]:
# get all unique names
all_author_names = publications_gs['AUTHOR_NAME'].tolist() + publications_rg['AUTHOR_NAME'].tolist()
all_author_names = set(all_author_names) # make unique

In [25]:
pd_result = pd.DataFrame(columns=['AUTHOR_NAME', 'PUB_TITLE', 'PUB_YEAR', 
                                                      'PUB_AUTHORS', 'PUB_TYPE', 'PUB_PUBLISHER', 
                                                      'PUB_ISBN', 'PUB_DOI', 'PUB_CITATIONS', 'JOU_RATING_VHB', 'JOU_RATING_SCIMAGO'])

In [26]:
if publications_gs.empty:
    pd_result = publications_rg
    
elif publications_rg.empty:
    pd_result = publications_gs
    
else:

    for author_name in all_author_names:

        subset_test_gs = publications_gs[publications_gs['AUTHOR_NAME'] == author_name]
        subset_test_rg = publications_rg[publications_rg['AUTHOR_NAME'] == author_name]

        leftover_rg = subset_test_rg.copy()

        for index_gs, first_row_gs in subset_test_gs.iterrows():
            match_found = False

            # iterate over RG subset
            for index_rg, row in subset_test_rg.iterrows():

                # determining identical based on title and publisher
                if is_similar(row["PUB_TITLE"], first_row_gs["PUB_TITLE"]) and \
                    is_similar(str(row["PUB_PUBLISHER"]), str(first_row_gs["PUB_PUBLISHER"])): 

                    match_found = True

                    combined_row = combine_publication(first_row_gs, row)
                    pd_result = pd_result.append(combined_row, ignore_index=True, sort=False)

                    # remove the found match from the subsets
                    #subset_test_gs = subset_test_gs.drop(index_gs).reset_index(drop=True)
                    if index_rg in leftover_rg.index:
                        leftover_rg = leftover_rg.drop(index_rg)

            if not match_found:
                pd_result = pd_result.append(first_row_gs, ignore_index=True, sort=False)
                #subset_test_gs = subset_test_gs.drop(index_gs).reset_index(drop=True)

            # if there are RG articles left, append them
            if (index_gs + 1 == len(subset_test_gs)) and (len(leftover_rg) > 0):
                pd_result = pd_result.append(leftover_rg, ignore_index=True, sort=False)


pd_result[['JOU_RATING_VHB', 'JOU_RATING_SCIMAGO']] = pd_result.apply(\
        lambda row: get_ratings(row['PUB_PUBLISHER']), axis=1, result_type='expand')

In [27]:
if STORE_PUB:
    pd_result.to_csv(f'../data/{current_year}/publications_{location}.csv', index=False)

In [28]:
pd_result.head(20)

Unnamed: 0,AUTHOR_NAME,PUB_TITLE,PUB_YEAR,PUB_AUTHORS,PUB_TYPE,PUB_PUBLISHER,PUB_ISBN,PUB_DOI,PUB_CITATIONS,JOU_RATING_VHB,JOU_RATING_SCIMAGO,UPDATED
0,Margitte Müller,Der Gesundheitssektor in der Grenzregion Ostbr...,2006.0,['M Müller '],,,,,,,,2022-04-06
1,Albrecht Nick,Improved Acoustic Behavior of Interior Parts o...,2002.0,"['A Nick', ' U Becker', ' W Thoma']",,Journal of Polymers & the …,,,80.0,C,431.0,2022-04-06
2,Marcus Strand,Using an attributed 2D-grid for next-best-view...,2008.0,"['M Strand', ' R Dillmann']",,International Conference on Information and Au...,,,31.0,,187.0,2022-04-06
3,Marcus Strand,Vision and ToF-based driving assistance for a ...,2009.0,"['T Schamm', ' M Strand', ' T Gumpp', ' R Kohl...",,International Conference on Advanced Robotics,,,25.0,,187.0,2022-04-06
4,Marcus Strand,Autonomous robot navigation in human-centered ...,2007.0,"['P Steinhaus', ' M Strand', ' R Dillmann']",,Eurasip Journal on Advances in Signal Processing,,,19.0,,120.0,2022-04-06
5,Marcus Strand,Range image registration using an octree based...,2007.0,"['M Strand', ' F Erb', ' R Dillmann']",,International Conference on Mechatronics and A...,,,17.0,,187.0,2022-04-06
6,Marcus Strand,Proceedings of the 2011 IEEE International Con...,2011.0,"['Z Xue', ' S Xia', ' M Strand', ' JM Zoellner...",,,,,8.0,,,2022-04-06
7,Marcus Strand,Segmentation and approximation of objects in p...,2009.0,"['M Strand', ' R Dillmann']",,International Conference on Information and Au...,,,8.0,,187.0,2022-04-06
8,Marcus Strand,Using superquadrics for the approximation of o...,2010.0,"['M Strand', ' Z Xue', ' M Zoellner', ' R Dill...",,The IEEE International Conference on Informati...,,,7.0,,187.0,2022-04-06
9,Marcus Strand,Energy Efficient Driving and Operation Strateg...,2012.0,"['D Nienhüser', ' T Bär', ' R Kohlhaas', ' T S...",,Oldenbourg Wissenschaftsverlag GmbH,,,4.0,,,2022-04-06
