In [22]:
# change to store dataset
STORE_PUB = True

# change for different location
# locations: "ravensburg", "mannheim", "heidenheim", "karlsruhe", "campus-horb", "stuttgart"

location = "ravensburg"

In [23]:
import math
import re
from collections import Counter

import datetime
import pandas as pd
import ast

In [24]:
WORD = re.compile(r"\w+")

# get current year to save in applicable folder
current_year = datetime.date.today().year

In [25]:
### data import ###
# locations: "ravensburg", "mannheim", "heidenheim", "karlsruhe", "campus-horb", "stuttgart"

J_RATINGS_SCIMAGO = pd.read_csv(f'../data/{current_year}/journal_ratings_scimago.csv', sep=";")
J_RATINGS_VHB = pd.read_csv(f'../data/{current_year}/journal_ratings_vhb.csv')

publications_gs = pd.read_csv(f'../data/{current_year}/publications_{location}_gs.csv')
publications_rg = pd.read_csv(f'../data/{current_year}/publications_{location}_rg.csv')

In [26]:
# functions to get similarity of strings using cosine similarity

def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    sum1 = sum([vec1[x] ** 2 for x in list(vec1.keys())])
    sum2 = sum([vec2[x] ** 2 for x in list(vec2.keys())])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)


    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator


def text_to_vector(text):
    words = WORD.findall(text)
    return Counter(words)

In [27]:
def is_similar(text1, text2):
    
    vector1 = text_to_vector(text1.lower())
    vector2 = text_to_vector(text2.lower())

    cosine = get_cosine(vector1, vector2)
    
    if cosine > 0.6:
        similar = True
    else:
        similar = False
        
    return similar

In [28]:
def get_not_none_string(string1, string2):
    
    if string1 and not string2:
        return string1
    
    elif string2 and not string1:
        return string2
    
    elif string1 and string2:
        return max([string1, string2], key=len)
    
    else:
        return None

In [29]:
def get_ratings(publisher):
    publisher = str(publisher)

    rating_database = J_RATINGS_VHB
    if publisher in rating_database['JOURNAL']:
        rating_vhb = rating_database.loc[rating_database['JOURNAL'] == publisher, 'JQ3'].to_numpy()[0]
        issn_vhb = rating_database.loc[rating_database['JOURNAL'] == publisher, 'ISSN'].to_numpy()[0]

    elif index_similar := [i for i, x in enumerate(rating_database['JOURNAL']) if is_similar(publisher,x)]:
        rating_vhb = rating_database.loc[index_similar[0], 'JQ3']
        issn_vhb = rating_database.loc[index_similar[0], 'ISSN']

    else:
        rating_vhb = None
        issn_vhb = None
        
    
    rating_database = J_RATINGS_SCIMAGO
    if publisher in rating_database['Title']:
        rating_sci = rating_database.loc[rating_database['Title'] == publisher, 'H index'].to_numpy()[0]
        issn_sci = rating_database.loc[rating_database['Title'] == publisher, 'Issn'].to_numpy()[0]

    elif index_similar := [i for i, x in enumerate(rating_database['Title']) if is_similar(publisher,x)]:
        rating_sci = rating_database.loc[index_similar[0], 'H index']
        issn_sci = rating_database.loc[index_similar[0], 'Issn']

    else:
        rating_sci = None
        issn_sci = None
       

#     if issn_vhb and issn_sci:
#         if issn_vhb != issn_sci:
#             print(f"ISSN are not the same for found publisher: {publisher}.")
#         else:
#             print(f"ISSN are the same for found publisher: {publisher}.")

        
    return rating_vhb, rating_sci

In [30]:
def combine_publication(row_gs, row_rg):
    
    title = get_not_none_string(str(row_gs['PUB_TITLE']), str(row_rg['PUB_TITLE']))
    year = max([row_gs['PUB_YEAR'], row_rg['PUB_YEAR']])
    publisher = get_not_none_string(str(row_gs['PUB_PUBLISHER']), str(row_rg['PUB_PUBLISHER']))
    
    authors_gs = [n.strip() for n in ast.literal_eval(row_gs["PUB_AUTHORS"])]
    authors_rg = [n.strip() for n in ast.literal_eval(row_rg["PUB_AUTHORS"])]
    authors = list(set(authors_gs + authors_rg))
    
    data = {
        'AUTHOR_NAME': row_gs["AUTHOR_NAME"],
        'PUB_TITLE': title, 
        'PUB_YEAR': year, 
        'PUB_AUTHORS': authors,  
        'PUB_TYPE': row_rg['PUB_TYPE'],
        'PUB_PUBLISHER': publisher,
        'PUB_CITATIONS': row_gs["PUB_CITATIONS"],
        'PUB_ISBN': row_rg['PUB_ISBN'],
        'PUB_DOI': row_rg['PUB_DOI']
       }

    return data

In [31]:
# get all unique names
all_author_names = publications_gs['AUTHOR_NAME'].tolist() + publications_rg['AUTHOR_NAME'].tolist()
all_author_names = set(all_author_names) # make unique

In [32]:
pd_result = pd.DataFrame(columns=['AUTHOR_NAME', 'PUB_TITLE', 'PUB_YEAR', 
                                                      'PUB_AUTHORS', 'PUB_TYPE', 'PUB_PUBLISHER', 
                                                      'PUB_ISBN', 'PUB_DOI', 'PUB_CITATIONS', 'JOU_RATING_VHB', 'JOU_RATING_SCIMAGO'])

In [33]:

for author_name in all_author_names:
    
    subset_test_gs = publications_gs[publications_gs['AUTHOR_NAME'] == author_name]
    subset_test_rg = publications_rg[publications_rg['AUTHOR_NAME'] == author_name]

    leftover_rg = subset_test_rg.copy()

    for index_gs, first_row_gs in subset_test_gs.iterrows():
        match_found = False

        # iterate over RG subset
        for index_rg, row in subset_test_rg.iterrows():

            # determining identical based on title and publisher
            if is_similar(row["PUB_TITLE"], first_row_gs["PUB_TITLE"]) and \
                is_similar(str(row["PUB_PUBLISHER"]), str(first_row_gs["PUB_PUBLISHER"])): 

                match_found = True

                combined_row = combine_publication(first_row_gs, row)
                pd_result = pd_result.append(combined_row, ignore_index=True, sort=False)

                # remove the found match from the subsets
                #subset_test_gs = subset_test_gs.drop(index_gs).reset_index(drop=True)
                if index_rg in leftover_rg.index:
                    leftover_rg = leftover_rg.drop(index_rg)

        if not match_found:
            pd_result = pd_result.append(first_row_gs, ignore_index=True, sort=False)
            #subset_test_gs = subset_test_gs.drop(index_gs).reset_index(drop=True)

        # if there are RG articles left, append them
        if (index_gs + 1 == len(subset_test_gs)) and (len(leftover_rg) > 0):
            pd_result = pd_result.append(leftover_rg, ignore_index=True, sort=False)


pd_result[['JOU_RATING_VHB', 'JOU_RATING_SCIMAGO']] = pd_result.apply(lambda row: get_ratings(row['PUB_PUBLISHER']), axis=1, result_type='expand')

In [34]:
if STORE_PUB:
    pd_result.to_csv(f'../data/{current_year}/publications_{location}.csv', index=False)

In [35]:
pd_result.head(20)

Unnamed: 0,AUTHOR_NAME,PUB_TITLE,PUB_YEAR,PUB_AUTHORS,PUB_TYPE,PUB_PUBLISHER,PUB_ISBN,PUB_DOI,PUB_CITATIONS,JOU_RATING_VHB,JOU_RATING_SCIMAGO
0,Thomas Asche,Validation of the COMPASS force field for comp...,2017.0,"['TS Asche', ' P Behrens', ' AM Schneider']",,,,,14.0,,8.0
1,Thomas Asche,[BOOK][B] Das Sicherheitsverhalten von Konsume...,1990.0,['T Asche '],,,,,26.0,,8.0
2,Thomas Asche,Die Ergebnisse der empirischen Analyse zum Sic...,1990.0,['T Asche'],,Das Sicherheitsverhalten von Konsumenten,,,,,
3,Thomas Asche,Atomistic simulation of sol-gel-derived hybrid...,2018.0,"['Thomas Sebastian Asche', 'M. Duderstaedt', '...",Chapter,,,10.1007/978-3-319-32101-1_109,,,8.0
4,Thomas Asche,Validation of the COMPASS force field for comp...,2017.0,"['Thomas Sebastian Asche', 'Peter Behrens', 'A...",Article,Journal of Sol-Gel Science and Technology,,10.1007/s10971-016-4185-y,,C,68.0
5,Thomas Asche,Atomistic Simulation of Sol–Gel-Derived Hybrid...,2016.0,"['Thomas Sebastian Asche', 'Mirja Duderstaedt'...",Chapter,,,10.1007/978-3-319-19454-7_109-1,,,8.0
6,Thomas Asche,Two-photon polymerization of inorganic-organic...,2015.0,"['F. Burmeister', 'Sönke Steenhusen', 'Ruth Ho...",Chapter,,,10.1515/9783110354324-016,,,8.0
7,Thomas Asche,Die Datengewinnung zur Analyse des Sicherheits...,1990.0,['T Asche'],,Das Sicherheitsverhalten von Konsumenten,,,,,
8,Atheer Al-Tameemi,Estimation of planetary surface ages using ima...,2018.0,['A Al'],,,,,,,8.0
9,Wolfgang Bihler,[BOOK][B] Weiterbildungserfolg in betriebliche...,2006.0,['W Bihler '],,,,,14.0,,8.0
