In [214]:
import pandas as pd
import os
import re
from collections import Counter
import math
import numpy as np

# progress bar
from ipywidgets import IntProgress
from IPython.display import display
import time

In [187]:
# Column layout of the combined publications table.
columns = [
    'AUTHOR_NAME',
    'PUB_TITLE', 'PUB_YEAR', 'PUB_AUTHORS', 'PUB_NUMBER_AUTHORS', 'PUB_TYPE',
    'PUB_PUBLISHER', 'PUB_ISBN', 'PUB_DOI', 'PUB_CITATIONS',
    'JOU_RATING_VHB', 'JOU_RATING_SCIMAGO',
    'UPDATED', 'YEAR_STORED', 'SITE',
]

# Start from an empty frame with that layout; the per-site files are added to it later.
df_all_sites = pd.DataFrame(columns=columns)

In [188]:
# Names of all entries directly inside the data directory.
files_in_data = os.listdir("../data")

# WORD finds runs of word characters; p recognises names that start with four digits.
WORD = re.compile(r"\w+")
p = re.compile('[0-9]{4}')

# Keep only the entries whose name starts with four digits.
year_list = list(filter(p.match, files_in_data))

In [189]:
# re.sub("\[[A-Z ]+\]", "", title).strip()

In [190]:
def list_value_counts(list_string):
    """Count the comma-separated entries of a list that is stored as a string.

    Parameters
    ----------
    list_string : str
        Text such as "['A', 'B']"; single quotes are ignored.

    Returns
    -------
    int
        Number of comma-separated parts. This is always at least 1, even for
        an empty string.
    """
    without_quotes = list_string.replace("'", "")
    # n commas separate n + 1 parts
    return without_quotes.count(",") + 1

In [207]:
# cleaning the journal names

def clean_journals(journal_name):
    """Reduce a journal name to its core text.

    The steps are applied in this order:

    1. remove text enclosed in round and in square brackets,
    2. keep only the part before the first period,
    3. remove trailing non-word characters and surrounding whitespace.

    Parameters
    ----------
    journal_name : str
        Journal name as found in the data.

    Returns
    -------
    str
        The cleaned name (may be empty).
    """
    # Remove everything within brackets. Raw strings are required here:
    # '\(' or '\W' in a normal literal is an invalid escape sequence and
    # triggers a warning when the code is compiled.
    # Note: `.+` is greedy, so the match runs from the first opening to the
    # last closing bracket in the name.
    journal_name = re.sub(r'\(.+\)', '', journal_name)
    journal_name = re.sub(r'\[.+\]', '', journal_name)

    # remove everything after the first period
    journal_name = journal_name.split(".")[0]

    # remove non-word characters at the end
    journal_name = re.sub(r'\W*$', '', journal_name)

    return journal_name.strip()

def get_alternate_journal_names(journal_string):
    """Split a journal entry into up to three cleaned name variants.

    The marker "früher: " is removed, then ':', '-' and '.' are treated like
    an opening bracket. The text is split at the first two opening brackets
    and every piece is passed through ``clean_journals``.

    Parameters
    ----------
    journal_string : str
        Journal entry, possibly containing alternative names.

    Returns
    -------
    list of str
        One to three cleaned names, longest first.
    """
    normalised = journal_string.replace("früher: ", "")
    for separator in (':', '-', '.'):
        normalised = normalised.replace(separator, '(')

    # at most two splits -> at most three pieces
    pieces = normalised.split("(", 2)

    return sorted((clean_journals(piece) for piece in pieces), key=len, reverse=True)

In [192]:
### Journal ratings
J_RATINGS_SCIMAGO = pd.read_csv('../data/journal_ratings_scimago.csv', sep=";")
J_RATINGS_VHB = pd.read_csv('../data/journal_ratings_vhb.csv')


def _expand_journal_names(ratings, name_column):
    """Split `name_column` into name variants and store them in the table.

    The longest variant replaces `name_column`; the remaining ones go to
    'ALTERNATE1' and 'ALTERNATE2'. The table is modified in place.
    """
    variants = ratings[name_column].apply(get_alternate_journal_names).tolist()
    ratings[[name_column, 'ALTERNATE1', 'ALTERNATE2']] = pd.DataFrame(variants, index=ratings.index)


# create columns in journal databases
_expand_journal_names(J_RATINGS_SCIMAGO, 'Title')
_expand_journal_names(J_RATINGS_VHB, 'JOURNAL')

In [194]:
# functions to get similarity of strings using cosine similarity

def get_cosine(vec1, vec2):
    """Cosine similarity of two sparse vectors.

    Parameters
    ----------
    vec1, vec2 : mapping
        Map a key (here: a word) to a numeric weight (here: its count).

    Returns
    -------
    float
        Dot product divided by the product of the two vector lengths, or
        0.0 when that product is zero (e.g. an empty vector).
    """
    shared_keys = set(vec1.keys()) & set(vec2.keys())
    dot_product = sum(vec1[key] * vec2[key] for key in shared_keys)

    length1 = math.sqrt(sum(weight ** 2 for weight in vec1.values()))
    length2 = math.sqrt(sum(weight ** 2 for weight in vec2.values()))
    denominator = length1 * length2

    # avoid dividing by zero for empty / all-zero vectors
    if denominator:
        return float(dot_product) / denominator
    return 0.0


def text_to_vector(text):
    """Turn a text into a bag-of-words vector.

    Returns a ``Counter`` mapping every match of the module-level ``WORD``
    pattern in `text` to the number of times it occurs.
    """
    return Counter(WORD.findall(text))

def is_similar(text1, text2, threshold=0.6):
    """Decide whether two texts are similar by their word overlap.

    Both inputs are converted with ``str()`` and lowercased, turned into
    word-count vectors and compared with cosine similarity.

    Parameters
    ----------
    text1, text2 : object
        Values to compare; anything that ``str()`` accepts.
    threshold : float, optional
        The texts count as similar when the cosine similarity is strictly
        greater than this value. Defaults to 0.6.

    Returns
    -------
    bool
        True if the similarity exceeds `threshold`, else False.
    """
    vector1 = text_to_vector(str(text1).lower())
    vector2 = text_to_vector(str(text2).lower())

    return get_cosine(vector1, vector2) > threshold

def _find_rating(publisher, rating_database, name_column, rating_column):
    """Look up the rating of one journal in one rating table.

    Parameters
    ----------
    publisher : str
        Lowercased journal / publisher text of a publication.
    rating_database : pandas.DataFrame
        Rating table with the columns `name_column`, 'ALTERNATE1',
        'ALTERNATE2' and `rating_column`.
    name_column : str
        Column holding the primary journal name.
    rating_column : str
        Column holding the rating to return.

    Returns
    -------
    object or None
        Rating of the first matching row, or None when nothing matches.
    """
    name_columns = [name_column, 'ALTERNATE1', 'ALTERNATE2']
    # Compare case-insensitively on lowercased copies; the shared rating
    # table itself is not modified.
    lowered = {column: rating_database[column].str.lower() for column in name_columns}

    # Spellings to try: the full text first, then its cleaned name variants.
    candidates = [publisher]
    for variant in get_alternate_journal_names(publisher):
        if variant and variant not in candidates:
            candidates.append(variant)

    # 1) Exact matches always win over fuzzy ones.
    for candidate in candidates:
        for column in name_columns:
            mask = (lowered[column] == candidate).to_numpy()
            hits = rating_database.loc[mask, rating_column]
            if len(hits) > 0:
                return hits.to_numpy()[0]

    # 2) Fuzzy fallback against the primary names. `position` is a row
    #    position, not an index label, hence iloc.
    for position, known_name in enumerate(lowered[name_column]):
        if is_similar(publisher, known_name):
            return rating_database[rating_column].iloc[position]

    return None


def get_ratings(publisher):
    """Return the VHB and Scimago ratings for a publication's journal.

    Parameters
    ----------
    publisher : object
        Journal / publisher text of one publication. None, NaN and the text
        "nan" (any case) are treated as missing.

    Returns
    -------
    tuple
        ``(rating_vhb, rating_sci)``: the 'JQ3' value from J_RATINGS_VHB and
        the 'H index' value from J_RATINGS_SCIMAGO. Either is None when no
        match is found or the publisher is missing.

    Notes
    -----
    Increments the global progress bar ``f`` once per call if it exists.
    """
    name = str(publisher).lower()

    if publisher is None or name == "nan":
        rating_vhb = None
        rating_sci = None
    else:
        rating_vhb = _find_rating(name, J_RATINGS_VHB, 'JOURNAL', 'JQ3')
        rating_sci = _find_rating(name, J_RATINGS_SCIMAGO, 'Title', 'H index')

    try:
        f.value += 1  # signal to increment the progress bar
    except NameError:
        # No progress bar has been created; the lookup itself is unaffected.
        pass

    return rating_vhb, rating_sci

# 'JOU_RATING_VHB', 'JOU_RATING_SCIMAGO'

In [195]:
# Publication files are named "publications_<site>.csv". The pattern is compiled
# once; the dot is escaped and the whole file name has to match.
q = re.compile(r'publications_[a-z-]+\.csv')

# Collect the per-site frames and concatenate once at the end: DataFrame.append
# is deprecated/removed in newer pandas and copies the whole frame on each call.
site_frames = []

for year in year_list:

    files_in_year = os.listdir(f"../data/{year}")
    publication_file_list = [s for s in files_in_year if q.fullmatch(s)]

    for publication_file_name in publication_file_list:
        site = publication_file_name.replace('publications_', '').replace('.csv', '')

        site_publications = pd.read_csv(f'../data/{year}/{publication_file_name}')

        # remember where each row came from
        site_publications["YEAR_STORED"] = year
        site_publications["SITE"] = site

        # count number of authors
        site_publications["PUB_NUMBER_AUTHORS"] = site_publications["PUB_AUTHORS"].apply(list_value_counts)

        site_frames.append(site_publications)

# Same result as appending one frame after the other: existing rows first,
# original index labels kept, columns in order of first appearance.
df_all_sites = pd.concat([df_all_sites] + site_frames, sort=False)
        

In [196]:
### progress bar
# get_ratings() increments a global named `f` on every call, so the widget
# has to be bound to exactly this name.
max_count = len(df_all_sites)
f = IntProgress(min=0, max=max_count)  # instantiate the bar
display(f)  # display the bar
### progress bar end


def _ratings_for_row(row):
    """Return the (VHB, Scimago) ratings for the publisher of one row."""
    return get_ratings(row['PUB_PUBLISHER'])


# insert journal ratings: one lookup per row, expanded into the two rating columns
rating_columns = ['JOU_RATING_VHB', 'JOU_RATING_SCIMAGO']
df_all_sites[rating_columns] = df_all_sites.apply(_ratings_for_row, axis=1, result_type='expand')

IntProgress(value=0, max=16729)

In [222]:
# Clean the publisher names; missing values become None.
# Series.map walks over this one column only, instead of building a Series for
# every row as DataFrame.apply(axis=1) does, and it also handles an empty frame.
df_all_sites['PUB_PUBLISHER'] = df_all_sites['PUB_PUBLISHER'].map(
    lambda name: clean_journals(name) if np.all(pd.notnull(name)) else None)

In [224]:
# Write the combined table to disk; the index is not written.
df_all_sites.to_csv('../data/publications_all_sites_all_years.csv', index=False)

In [221]:
#df_all_sites = pd.read_csv(f'../data/publications_all_sites_all_years.csv')