In [1]:
import math
import re
from collections import Counter

import datetime
import pandas as pd
import ast

In [2]:
WORD = re.compile(r"\w+")

# get current year to save in applicable folder
current_year = datetime.date.today().year

In [3]:
### data import ###
J_RATINGS_SCIMAGO = pd.read_csv(f'../data/{current_year}/journal_ratings_scimago.csv', sep=";")
J_RATINGS_VHB = pd.read_csv(f'../data/{current_year}/journal_ratings_vhb.csv')

publications_test_gs = pd.read_csv(f'../data/{current_year}/publications_test_gs.csv')
publications_test_rg = pd.read_csv(f'../data/{current_year}/publications_test_rg.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
# functions to get similarity of strings using cosine similarity

def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    sum1 = sum([vec1[x] ** 2 for x in list(vec1.keys())])
    sum2 = sum([vec2[x] ** 2 for x in list(vec2.keys())])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)


    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator


def text_to_vector(text):
    words = WORD.findall(text)
    return Counter(words)

In [5]:
def is_similar(text1, text2):
    
    vector1 = text_to_vector(text1.lower())
    vector2 = text_to_vector(text2.lower())

    cosine = get_cosine(vector1, vector2)
    
    if cosine > 0.6:
        similar = True
    else:
        similar = False
        
    return similar

In [6]:
def get_not_none_string(string1, string2):
    
    if string1 and not string2:
        return string1
    
    elif string2 and not string1:
        return string2
    
    elif string1 and string2:
        return max([string1, string2], key=len)
    
    else:
        return None

In [7]:
def get_ratings(publisher):
    publisher = str(publisher)

    rating_database = J_RATINGS_VHB
    if publisher in rating_database['JOURNAL']:
        rating_vhb = rating_database.loc[rating_database['JOURNAL'] == publisher, 'JQ3'].to_numpy()[0]
        issn_vhb = rating_database.loc[rating_database['JOURNAL'] == publisher, 'ISSN'].to_numpy()[0]

    elif index_similar := [i for i, x in enumerate(rating_database['JOURNAL']) if is_similar(publisher,x)]:
        rating_vhb = rating_database.loc[index_similar[0], 'JQ3']
        issn_vhb = rating_database.loc[index_similar[0], 'ISSN']

    else:
        rating_vhb = None
        issn_vhb = None
        
    
    rating_database = J_RATINGS_SCIMAGO
    if publisher in rating_database['Title']:
        rating_sci = rating_database.loc[rating_database['Title'] == publisher, 'H index'].to_numpy()[0]
        issn_sci = rating_database.loc[rating_database['Title'] == publisher, 'Issn'].to_numpy()[0]

    elif index_similar := [i for i, x in enumerate(rating_database['Title']) if is_similar(publisher,x)]:
        rating_sci = rating_database.loc[index_similar[0], 'H index']
        issn_sci = rating_database.loc[index_similar[0], 'Issn']

    else:
        rating_sci = None
        issn_sci = None
       

#     if issn_vhb and issn_sci:
#         if issn_vhb != issn_sci:
#             print(f"ISSN are not the same for found publisher: {publisher}.")
#         else:
#             print(f"ISSN are the same for found publisher: {publisher}.")

        
    return rating_vhb, rating_sci

In [8]:
def combine_publication(row_gs, row_rg):
    
    title = get_not_none_string(str(row_gs['PUB_TITLE']), str(row_rg['PUB_TITLE']))
    year = max([row_gs['PUB_YEAR'], row_rg['PUB_YEAR']])
    publisher = get_not_none_string(str(row_gs['PUB_PUBLISHER']), str(row_rg['PUB_PUBLISHER']))
    
    authors_gs = [n.strip() for n in ast.literal_eval(row_gs["PUB_AUTHORS"])]
    authors_rg = [n.strip() for n in ast.literal_eval(row_rg["PUB_AUTHORS"])]
    authors = list(set(authors_gs + authors_rg))
    
    data = {
        'AUTHOR_NAME': row_gs["AUTHOR_NAME"],
        'PUB_TITLE': title, 
        'PUB_YEAR': year, 
        'PUB_AUTHORS': authors,  
        'PUB_TYPE': row_rg['PUB_TYPE'],
        'PUB_PUBLISHER': publisher,
        'PUB_CITATIONS': row_gs["PUB_CITATIONS"],
        'PUB_ISBN': row_rg['PUB_ISBN'],
        'PUB_DOI': row_rg['PUB_DOI']
       }

    return data

In [9]:
# get all unique names
all_author_names = publications_test_gs['AUTHOR_NAME'].tolist() + publications_test_rg['AUTHOR_NAME'].tolist()
all_author_names = set(all_author_names) # make unique

In [10]:
pd_result = pd.DataFrame(columns=['AUTHOR_NAME', 'PUB_TITLE', 'PUB_YEAR', 
                                                      'PUB_AUTHORS', 'PUB_TYPE', 'PUB_PUBLISHER', 
                                                      'PUB_ISBN', 'PUB_DOI', 'PUB_CITATIONS', 'JOU_RATING_VHB', 'JOU_RATING_SCIMAGO'])

In [19]:

for author_name in all_author_names:
    
    subset_test_gs = publications_test_gs[publications_test_gs['AUTHOR_NAME'] == author_name]
    subset_test_rg = publications_test_rg[publications_test_rg['AUTHOR_NAME'] == author_name]

    leftover_rg = subset_test_rg.copy()

    for index_gs, first_row_gs in subset_test_gs.iterrows():
        match_found = False

        # iterate over RG subset
        for index_rg, row in subset_test_rg.iterrows():

            # determining identical based on title and publisher
            if is_similar(row["PUB_TITLE"], first_row_gs["PUB_TITLE"]) and \
                is_similar(str(row["PUB_PUBLISHER"]), str(first_row_gs["PUB_PUBLISHER"])): 

                match_found = True

                combined_row = combine_publication(first_row_gs, row)
                pd_result = pd_result.append(combined_row, ignore_index=True, sort=False)

                # remove the found match from the subsets
                #subset_test_gs = subset_test_gs.drop(index_gs).reset_index(drop=True)
                if index_rg in leftover_rg.index:
                    leftover_rg = leftover_rg.drop(index_rg)

        if not match_found:
            pd_result = pd_result.append(first_row_gs, ignore_index=True, sort=False)
            #subset_test_gs = subset_test_gs.drop(index_gs).reset_index(drop=True)

        # if there are RG articles left, append them
        if (index_gs + 1 == len(subset_test_gs)) and (len(leftover_rg) > 0):
            pd_result = pd_result.append(leftover_rg, ignore_index=True, sort=False)


pd_result[['JOU_RATING_VHB', 'JOU_RATING_SCIMAGO']] = pd_result.apply(lambda row: get_ratings(row['PUB_PUBLISHER']), axis=1, result_type='expand')

In [20]:
pd_result.head(20)

Unnamed: 0.1,AUTHOR_NAME,PUB_TITLE,PUB_YEAR,PUB_AUTHORS,PUB_TYPE,PUB_PUBLISHER,PUB_ISBN,PUB_DOI,PUB_CITATIONS,JOU_RATING_VHB,JOU_RATING_SCIMAGO,Unnamed: 0
0,Gerhard Hellstern,Analysis of a hybrid quantum network for class...,2021.0,['G Hellstern'],,IET Quantum Communication,,,,,,
1,Gerhard Hellstern,[BOOK][B] IT-Risiken in Banken: aufsichtliches...,2019.0,"['G Hellstern', ' P Buchmüller ']",,,,,1.0,,8.0,
2,Gerhard Hellstern,"Bashford, Jim University of Adelaide",1999.0,"['L Bland', ' S Brodsky', ' SC Choe', ' W Detm...",,Future Directions In …,,,,,,
3,Michael Bächle,Social software,2006.0,['M Bächle'],,Informatik-Spektrum,,,196.0,D,18.0,
4,Michael Bächle,Ruby on Rails.,2007.0,"[Paul Kirchberg, Michael Baechle, P Kirchberg,...",Article,IEEE software,,,138.0,C,169.0,
5,Michael Bächle,Gabler: Wirtschaftsinformatik Lexikon,2013.0,"['E Stickel', ' HD Groffmann', ' KH Rau']",,Springer-Verlag,,,111.0,,,
6,Michael Bächle,Ökonomische Perspektiven des Web 2.0–Open inno...,2008.0,['M Bächle'],,Wirtschaftsinformatik,,,70.0,,,
7,Michael Bächle,E-business: Grundlagen elektronischer Geschäft...,2010.0,"['M Bächle', ' FR Lehmann']",,Oldenbourg Verlag,,,49.0,,,
8,Michael Bächle,Einführung in die Wirtschaftsinformatik,2021.0,"['MA Bächle', ' S Daurer', ' A Kolb']",,Einführung in die Wirtschaftsinformatik,,,38.0,,,
9,Michael Bächle,Assistive technology for independent living wi...,2018.0,"['M Bächle', ' S Daurer', ' A Judt', ' T Mettl...",,Health policy and technology,,,19.0,B,92.0,
