In [72]:
#num_search_queries = len(results)
#size_of_query = -np.log(num_search_queries/total_entries_in_yr) # perform log transformation to work with larger numbers
#size_of_query

5.775749585371992

In [None]:
# future work: use publishDate instead 

In [55]:
%reset -f

import pandas as pd
import numpy as np

import json
from collections import Counter

from finna_client import FinnaClient as fc
from finna_client import FinnaSearchType as fst 

#################################

#### USER INPUT / PARAMETERS ####
# Search term; the helpers below pass it to the client as `lookfor`.
search_query = "EEG"
# Year used to build the `main_date_str:<year>` filter.
# NOTE(review): "SLIDER" presumably marks this as a value meant to be driven by a UI slider — confirm.
year = 2018              # SLIDER
#################################

# Rebind `fc` from the imported FinnaClient class alias to a client instance.
# Every helper below calls `fc.search(...)` on this instance; after this line
# the class itself is no longer reachable under the name `fc`.
fc = fc() 

def get_total_entries_by_yr(year):
    """Return the number of entries the client reports for ``year``.

    Runs a subject search with an empty query, restricted by a
    ``main_date_str:<year>`` filter, through the module-level ``fc`` client
    and reads ``resultCount`` from the response.

    input: int year
    output: int total entries by year
    """
    year_filter = "main_date_str:" + str(year)
    response = fc.search(
        lookfor="",
        search_type=fst.Subject,
        fields=["year"],
        filters=[year_filter],
        page=1,
        limit=100,
    )
    return response['resultCount']


def get_response(pg, search_query, year=None):
    """Fetch one page (up to 100 records) of subject-search results.

    Parameters
    ----------
    pg : int
        Page number, passed to the client as ``page``.
    search_query : str
        Search text, passed to the client as ``lookfor``.
    year : int or str, optional
        Year used in the ``main_date_str:<year>`` filter. When omitted, the
        notebook-level ``year`` variable is used, which is what this function
        always read before the parameter existed.

    Returns
    -------
    The response of ``fc.search``; callers in this file read its
    'records' and 'resultCount' entries.
    """
    if year is None:
        # Backward-compatible fallback for callers that still rely on the
        # global parameter cell.
        year = globals()['year']
    return fc.search(lookfor=search_query,
                     search_type=fst.Subject,
                     fields=["title", "buildings", "subjects"],
                     filters=["main_date_str:" + str(year)],
                     facets=["author"],
                     page=pg,
                     limit=100)


def get_entries(year, search_query):
    """Collect every record matching ``search_query`` for ``year``.

    Pages through ``get_response`` until the number of collected records
    reaches the ``resultCount`` reported by the service, then returns the
    records as a DataFrame with one row per record.

    Collection is best-effort: if a request fails, or the service stops
    returning records early, the collected/expected counts are printed and
    whatever was gathered so far is returned.

    Parameters
    ----------
    year : int or str
        Year to filter on; forwarded to ``get_response`` so the filter follows
        this argument rather than the global ``year``.
    search_query : str
        Search text.

    Returns
    -------
    pandas.DataFrame
        One row per record; empty when nothing was collected.
    """
    results = []
    expected = None  # resultCount of the latest successful response
    # Result pages are 1-based (get_total_entries_by_yr also requests page=1).
    # Starting at 0 risks fetching the first page twice and missing the tail.
    page = 1

    while True:
        try:
            response = get_response(page, search_query, year)
            expected = response['resultCount']
            # A response without records (e.g. zero hits) is treated as empty.
            records = response.get('records') or []
        except Exception as err:
            print("num collected entries: ", len(results))
            print("num expected entries: ",
                  expected if expected is not None else "unknown")
            print("error: ", repr(err))
            break

        results += records
        page += 1

        # stopping condition
        if len(results) >= expected:
            break
        if not records:
            # The service ran out of records before reaching resultCount;
            # report the shortfall instead of looping forever.
            print("num collected entries: ", len(results))
            print("num expected entries: ", expected)
            break

    # Build the frame directly from the record dicts; no JSON round trip.
    return pd.DataFrame(results)

def get_num_entry_by_org_size_scores(df, year):
    """Compute a display-size score per holding institution.

    For each record the first element of its 'buildings' list is taken and
    its 'translated' value is used as the institution name. The score of an
    institution is ``sqrt(count / len(df)) * 100``, where ``count`` is the
    number of records attributed to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Search results with a 'buildings' column, e.g. from ``get_entries``.
        Any index is accepted; rows are processed by position.
    year : int
        Unused; kept so existing calls keep working.

    Returns
    -------
    dict
        ``{"institutions": [...], "size_values": [...]}`` with parallel lists
        ordered by first appearance of each institution. Both lists are empty
        when ``df`` has no rows or no 'buildings' column.
    """
    num_search_queries = len(df)
    if num_search_queries == 0 or 'buildings' not in df.columns:
        return {"institutions": [], "size_values": []}

    # Iterate by position (tolist) rather than by label so filtered or
    # re-indexed frames work. Records without building data (NaN / empty
    # list) are skipped but still count in the denominator.
    buildings = [
        entry[0]['translated']
        for entry in df['buildings'].tolist()
        if isinstance(entry, (list, tuple)) and len(entry) > 0
    ]

    # Counter preserves first-seen order, which fixes the output order.
    counts = Counter(buildings)

    json_data = {
        "institutions": list(counts.keys()),
        "size_values": [np.sqrt(n / num_search_queries) * 100
                        for n in counts.values()],
    }

    return json_data


# co-occurrence matrix, work from 2000 - 2018
def get_cooccurrence_matrix(df, top_values=10):
    """Build a co-occurrence matrix for the most frequent subjects in ``df``.

    Each value of ``df['subjects']`` is expected to be a list of subject
    headings, each heading itself a list whose first element is the term.
    Terms are lower-cased and have every '.' removed before counting.

    Parameters
    ----------
    df : pandas.DataFrame
        Search results with a 'subjects' column, e.g. from ``get_entries``.
        Any index is accepted; rows are processed by position.
    top_values : int, optional
        How many of the most frequent subjects to keep. Defaults to 10, the
        value that used to be hard-coded. If fewer distinct subjects exist,
        all of them are used.

    Returns
    -------
    dict
        ``{"data": <list of lists of int>, "label": <list of str>}``.
        ``label`` holds the kept subjects, most frequent first (ties keep
        first-seen order). ``data`` is the square matrix ``M.T @ M`` with a
        zeroed diagonal, where ``M[r][k]`` counts how often subject ``k``
        appears in record ``r``. Both are empty when there are no subjects.
    """
    if 'subjects' not in df.columns:
        return {"data": [], "label": []}

    def _normalise(heading):
        # to lowercase and strip ( . )
        return heading[0].lower().replace('.', '')

    # One list of normalised terms per record. Records without subject data
    # (NaN / None) contribute an empty list; empty headings are ignored.
    subjects_by_record = [
        [_normalise(heading) for heading in record if heading]
        if isinstance(record, (list, tuple)) else []
        for record in df['subjects'].tolist()
    ]

    # TODO: more data cleaning -> plural cases

    # get the top commonly seen keywords occurring with the search query
    frequency = Counter(term for terms in subjects_by_record for term in terms)
    # Slicing (instead of indexing 0..top_values-1) tolerates short lists.
    key_subjects = [term for term, _ in frequency.most_common()[:top_values]]
    if not key_subjects:
        return {"data": [], "label": []}

    column_of = {term: col for col, term in enumerate(key_subjects)}

    # mat[record, subject] = number of occurrences of subject in that record
    mat = np.zeros((len(subjects_by_record), len(key_subjects)))
    for row, terms in enumerate(subjects_by_record):
        for term in terms:
            col = column_of.get(term)
            if col is not None:
                mat[row, col] += 1

    # Work on the NumPy array directly: writing through DataFrame.values is
    # not reliable because that array can be read-only.
    co_mat = mat.T.dot(mat).astype(int)
    np.fill_diagonal(co_mat, 0)  # fill diagonal with 0

    comat_json = {
        "data": co_mat.tolist(),
        "label": key_subjects
    }

    return comat_json



In [58]:
#### USER INPUT / PARAMETERS ####
# Re-declares the parameters from the setup cell so this cell can be re-run
# on its own with different values.
search_query = "EEG"
year = 2018             # SLIDER
#################################

# Fetch the matching records into a DataFrame (one row per record); the
# cells below reuse `df`.
df = get_entries(year, search_query)


In [59]:
# Show only the size scores (the "size_values" list) for the fetched entries.
get_num_entry_by_org_size_scores(df, year)['size_values']

[80.0, 56.568542494923804, 20.0]

In [52]:
# Same call as the previous cell. NOTE(review): the output recorded below has
# an earlier execution count (In [52]) than the cell that builds `df`
# (In [58]), so it presumably reflects a different `df` — re-run to confirm.
get_num_entry_by_org_size_scores(df, year)['size_values']

[52.3450093132096,
 12.649110640673516,
 17.88854381999832,
 14.142135623730951,
 19.493588689617926,
 56.39148871948674,
 17.320508075688775,
 10.0,
 31.937438845342626,
 8.94427190999916,
 12.649110640673516,
 6.324555320336758,
 25.690465157330262,
 6.324555320336758,
 4.47213595499958,
 7.745966692414834,
 4.47213595499958,
 11.832159566199232,
 14.832396974191326,
 4.47213595499958,
 4.47213595499958,
 4.47213595499958,
 4.47213595499958,
 4.47213595499958,
 4.47213595499958,
 4.47213595499958,
 4.47213595499958]

In [46]:
# get_num_entry_by_org_size_scores(df, year)
# Build the co-occurrence result for the current `df`.
comat = get_cooccurrence_matrix(df)
# NOTE: get_cooccurrence_matrix returns a plain dict with "data" and "label"
# keys, so the DataFrame-style calls below would not work on `comat` as written.
#comat.to_csv('comat.csv', sep='\t', encoding='utf-8')
#comat.values

{'data': array([[  0, 115,  78,  58,  40,  40,  35,  33,  32,  26],
        [115,   0,  31,  50,  18,  22,   8,   4,  16,  17],
        [ 78,  31,   0,  36,  19,   9,   6,  11,  11,  10],
        [ 58,  50,  36,   0,  17,   4,   0,   8,  12,  11],
        [ 40,  18,  19,  17,   0,   2,   2,   4,   8,  10],
        [ 40,  22,   9,   4,   2,   0,   5,   0,   1,   3],
        [ 35,   8,   6,   0,   2,   5,   0,   4,   2,   2],
        [ 33,   4,  11,   8,   4,   0,   4,   0,   8,   1],
        [ 32,  16,  11,  12,   8,   1,   2,   8,   0,   5],
        [ 26,  17,  10,  11,  10,   3,   2,   1,   5,   0]]),
 'label': ['eeg',
  'electroencephalography',
  'aivot',
  'brain',
  'meg',
  'epilepsia',
  'lapset',
  'neuropsykologia',
  'aivotutkimus',
  'aivokuori']}

In [26]:
# Display `comat`. NOTE(review): the table recorded below comes from an earlier
# execution (In [26]) and its tabular layout does not match the dict that
# get_cooccurrence_matrix returns in the code shown — re-run to refresh.
comat

Unnamed: 0,eeg,electroencephalography,aivot,brain,meg,epilepsia,lapset,neuropsykologia,aivotutkimus,aivokuori
0,0,115,78,58,40,40,35,33,32,26
1,115,0,31,50,18,22,8,4,16,17
2,78,31,0,36,19,9,6,11,11,10
3,58,50,36,0,17,4,0,8,12,11
4,40,18,19,17,0,2,2,4,8,10
5,40,22,9,4,2,0,5,0,1,3
6,35,8,6,0,2,5,0,4,2,2
7,33,4,11,8,4,0,4,0,8,1
8,32,16,11,12,8,1,2,8,0,5
9,26,17,10,11,10,3,2,1,5,0


In [47]:
# Show the full result: institution names alongside their size scores.
get_num_entry_by_org_size_scores(df, year)

{'institutions': ['Jyväskylän yliopisto',
  'Turun ammattikorkeakoulu',
  'Kansalliskirjasto',
  'Turun yliopisto',
  'Tampereen ammattikorkeakoulu',
  'Helka-kirjastot',
  'Oulun yliopisto',
  'Savonia-ammattikorkeakoulu',
  'Itä-Suomen yliopisto',
  'Laurea-kirjasto',
  'Oulun ammattikorkeakoulun kirjasto',
  'Taideyliopisto',
  'Aalto-yliopisto',
  'Svenska handelshögskolan',
  'Helsingin yliopistomuseo',
  'Tampereen teknillinen yliopisto',
  'Tritonia',
  'Metropolian kirjaston kokoelmat',
  'Åbo Akademis bibliotek',
  'Arcadan kirjasto',
  'Tampereen yliopisto',
  'Lappeenrannan tiedekirjasto',
  'Karelia-ammattikorkeakoulu',
  'Varastokirjasto',
  'Blanka-kirjastot',
  'Centria-kirjasto',
  'Eepos-kirjastot'],
 'size_values': [52.3450093132096,
  12.649110640673516,
  17.88854381999832,
  14.142135623730951,
  19.493588689617926,
  56.39148871948674,
  17.320508075688775,
  10.0,
  31.937438845342626,
  8.94427190999916,
  12.649110640673516,
  6.324555320336758,
  25.6904651573