# Load bibtex files and transform into pickle file for further processing

- read scraped bibtext files into dataframe/pickle table
- some basic preprocessing

## 1. Import/Export to pickle

In [29]:
import bibtexparser
import os
import pandas as pd
from tqdm import tqdm

# get list of file names in directory where bibtex files are saved
files = os.listdir("scrape_data/")

In [30]:
import re

# define function to extract keyword from file names
def extract_kw(q):
    m = re.search("[a-z]+([_&]?[a-z]+)+",q)
    return m.group(0)


# extract keywords
kw = [extract_kw(f) for f in files]

In [31]:
files

['bucharest_01.bib',
 'bucharest_02.bib',
 'bucharest_11.bib',
 'budapest_01.bib',
 'budapest_02.bib',
 'budapest_11.bib',
 'budapest_21.bib',
 'czech_01.bib',
 'czech_02.bib',
 'czech_101.bib',
 'czech_11.bib',
 'czech_111.bib',
 'czech_12.bib',
 'czech_121.bib',
 'czech_131.bib',
 'czech_141.bib',
 'czech_151.bib',
 'czech_161.bib',
 'czech_21.bib',
 'czech_22.bib',
 'czech_31.bib',
 'czech_32.bib',
 'czech_41.bib',
 'czech_42.bib',
 'czech_51.bib',
 'czech_52.bib',
 'czech_61.bib',
 'czech_71.bib',
 'czech_81.bib',
 'czech_91.bib',
 'hungary_01.bib',
 'hungary_02.bib',
 'hungary_101.bib',
 'hungary_11.bib',
 'hungary_111.bib',
 'hungary_12.bib',
 'hungary_121.bib',
 'hungary_131.bib',
 'hungary_141.bib',
 'hungary_151.bib',
 'hungary_161.bib',
 'hungary_171.bib',
 'hungary_181.bib',
 'hungary_21.bib',
 'hungary_22.bib',
 'hungary_31.bib',
 'hungary_32.bib',
 'hungary_41.bib',
 'hungary_42.bib',
 'hungary_51.bib',
 'hungary_52.bib',
 'hungary_61.bib',
 'hungary_71.bib',
 'hungary_81.

In [32]:
kw[:5]

['bucharest', 'bucharest', 'bucharest', 'budapest', 'budapest']

In [33]:
# load bibtext files as pandas dataframes into list

dfs = []
for f in tqdm(files): #loop through files
    
    # open file and read file into cache
    with open(f'scrape_data/{f}', encoding="utf8") as bibtex_file:
        bibtex_str = bibtex_file.read()

    # parse bibtex file
    bib_database = bibtexparser.loads(bibtex_str)
    
    # convert to pandas dataframe and append to fds list
    as_df = pd.DataFrame(bib_database.entries)
    dfs.append(as_df)

100%|████████████████████████████████████████████████████████████████████████████████| 291/291 [31:39<00:00,  6.53s/it]


In [34]:
# add keyword row
for i,j in enumerate(kw):
    dfs[i]["kw_cat"] = j

# concat all dataframes to one
df = pd.concat(dfs)

# all cells contain curly brackets, erase these
df = df.apply(lambda x: x.str.strip("{}") if x.dtype == "object" else x)


In [35]:
df.head()

Unnamed: 0,da,unique-id,web-of-science-index,doc-delivery-number,journal-iso,usage-count-since-2013,usage-count-last-180-days,times-cited,number-of-cited-references,orcid-numbers,...,esi-highly-cited-paper,kw_cat,earlyaccessdate,series,booktitle,note,organization,isbn,editor,book-group-author
0,2022-07-25,WOS:000456043800016,Science Citation Index Expanded (SCI-EXPANDED),HH9GD,Environ. Eng. Manag. J.,22,2,1,29,"ionita, daniela/0000-0002-1882-4871\nProdana, ...",...,,bucharest,,,,,,,,
1,2022-07-25,WOS:000431040600034,Science Citation Index Expanded (SCI-EXPANDED),GE2JD,J. Radioanal. Nucl. Chem.,13,0,1,16,"Luculescu, Catalin R/0000-0001-6852-401X",...,,bucharest,,,,,,,,
2,2022-07-25,WOS:000440382200008,Emerging Sources Citation Index (ESCI),GO8WY,Philol. Jassyensia,0,0,0,25,,...,,bucharest,,,,,,,,
3,2022-07-25,WOS:000416769400011,Science Citation Index Expanded (SCI-EXPANDED),FO3XC,Gut,5,0,34,32,"ensari, arzu/0000-0001-7036-4457\nRostami-Neja...",...,,bucharest,,,,,,,,
4,2022-07-25,WOS:000416545200131,Science Citation Index Expanded (SCI-EXPANDED)...,FO1SQ,Int. J. Environ. Res. Public Health,9,1,11,27,"Paraschiv, Lizica Simona/0000-0002-9109-9151\n...",...,,bucharest,,,,,,,,


In [36]:
from collections import Counter

Counter(df["kw_cat"])

Counter({'bucharest': 2336,
         'budapest': 3154,
         'czech': 22256,
         'hungary': 23308,
         'italy': 130676,
         'lisbon': 5268,
         'portugal': 34599,
         'prague': 5490,
         'romania': 27818,
         'rome': 22350,
         'vatican': 133})

## 2. Aggregate Duplicates (papers returned by multiple keywords)


#### find duplicates and keep kw info for all of them

In [37]:
from collections import Counter

# generate dummyvariables for keywords and add to dataframe
dummy_kw = pd.get_dummies(df["kw_cat"])
df_new = pd.concat([df, dummy_kw], axis=1)

# count duplicates based on unique-id
Counter(df_new.duplicated(["unique-id"],keep=False))

Counter({True: 37670, False: 239718})

In [38]:
# erase nan rows in unique id
# print number nans
print(df_new['unique-id'].isnull().values.sum())
df_new.dropna(subset=["unique-id"], inplace=True)
print(df_new['unique-id'].isnull().values.sum())

2
0


In [39]:
# get a dataframe containg the duplicates and one containing no duplicates

df_dupli = df_new[df_new.duplicated(["unique-id"],keep=False)]

df_no_dupli = df_new[~df_new.duplicated(["unique-id"],keep=False)]

In [40]:
print(len(df_no_dupli),len(df_dupli))

239718 37668


In [41]:
print(len(df),len(df_dupli),len(df_no_dupli))

277388 37668 239718


In [42]:
# 1. get set of unique-ids of duplicates

dupli = set(df_dupli["unique-id"][df_dupli.duplicated(["unique-id"])])

# 2. make data set for duplicates with summed up dummies

df_agg = pd.DataFrame(columns=df_dupli.columns) #empty dataset with correct columns

# loop through unique ids of duplicates and um up there dummy vectors
# add then to df_agg that aggregates the dummys
for ID in tqdm(dupli):
    
    agg_dummies = df_dupli[df_dupli["unique-id"]==ID][dummy_kw.columns].sum(axis=0)
    dummy_df = pd.DataFrame([agg_dummies])
    rest_df = df_dupli[df_dupli["unique-id"]==ID]
    rest_df.reset_index(inplace=True, drop=True)
    rest_df = rest_df.iloc[0][~df_dupli.columns.isin(dummy_kw.columns)]
    rest_df = pd.DataFrame([rest_df])

    new_row = pd.concat([rest_df,dummy_df],axis=1)

    df_agg = pd.concat([df_agg,new_row])

100%|████████████████████████████████████████████████████████████████████████████| 17994/17994 [07:14<00:00, 41.39it/s]


In [47]:
#### all rows contain at least one 1

dummies = df_agg.iloc[:,50:]
c = 0
for index, row in dummies.iterrows():
    c+=1
    if sum(row) == 0:
        print(row)
        

In [48]:
#### sum duplicates had indicators at the same keyword
#### appeared multiple times in same search
# make sure that values higher than 1 are replaced by 1
#  to avoid issues later on

dummies = df_agg.iloc[:,50:]


In [49]:
# switch all positive entries to 1
dummies[dummies >= 1] = 1

for index, row in dummies.iterrows():
    if sum(row) == 0:
        print(row)


In [50]:

# check id each col has only zeroes and ones
for col in dummies.columns:
    print(dummies[col].unique())
    



[0 1]
[0 1]
[0 1]
[0 1]
[1 0]
[0 1]
[0 1]
[0 1]
[0 1]
[1 0]
[0 1]


In [51]:
df_agg.iloc[:,50:] = dummies

In [52]:
df_agg.columns

Index(['da', 'unique-id', 'web-of-science-index', 'doc-delivery-number',
       'journal-iso', 'usage-count-since-2013', 'usage-count-last-180-days',
       'times-cited', 'number-of-cited-references', 'orcid-numbers',
       'researcherid-numbers', 'author-email', 'web-of-science-categories',
       'research-areas', 'keywords-plus', 'keywords', 'eissn', 'issn',
       'affiliation', 'language', 'type', 'address', 'publisher', 'abstract',
       'month', 'pages', 'number', 'volume', 'year', 'journal', 'title',
       'author', 'ENTRYTYPE', 'ID', 'funding-text', 'funding-acknowledgement',
       'doi', 'oa', 'article-number', 'esi-hot-paper',
       'esi-highly-cited-paper', 'kw_cat', 'earlyaccessdate', 'series',
       'booktitle', 'note', 'organization', 'isbn', 'editor',
       'book-group-author', 'bucharest', 'budapest', 'czech', 'hungary',
       'italy', 'lisbon', 'portugal', 'prague', 'romania', 'rome', 'vatican'],
      dtype='object')

In [53]:
# put together the aggregated duplicates and the no duplicates dataframe to receive the final df

df_final = pd.concat([df_agg,df_no_dupli])

In [54]:
df_final

Unnamed: 0,da,unique-id,web-of-science-index,doc-delivery-number,journal-iso,usage-count-since-2013,usage-count-last-180-days,times-cited,number-of-cited-references,orcid-numbers,...,budapest,czech,hungary,italy,lisbon,portugal,prague,romania,rome,vatican
0,2022-07-25,WOS:A1997YF94400004,Science Citation Index Expanded (SCI-EXPANDED),YF944,Mineral. Mag.,3,0,34,41,,...,0,0,0,1,0,0,0,0,1,0
0,2022-07-25,WOS:000278865400001,Arts &amp; Humanities Citation Index (A&amp;HCI),611ZW,Hist. Polit. Thought,1,0,14,53,,...,0,0,0,1,0,0,0,0,1,0
0,2022-07-25,WOS:000265439800025,Science Citation Index Expanded (SCI-EXPANDED),436TU,Aquat. Ecol.,31,0,25,45,"Fanelli, Giuliano/0000-0002-3143-1212",...,0,0,0,1,0,0,0,0,1,0
0,2022-07-25,WOS:000245709800019,Science Citation Index Expanded (SCI-EXPANDED),157HP,Vet. Parasitol.,29,3,110,41,"Vadlejch, Jaroslav/0000-0002-5958-7606\nPekar,...",...,0,1,0,0,0,0,1,0,0,0
0,2022-07-25,WOS:000600652300243,Emerging Sources Citation Index (ESCI),PH8JS,Data Brief,4,1,0,6,"de Almeida, Ana Maria/0000-0001-9519-4634\nNun...",...,0,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37,2022-07-25,WOS:000727322500001,Science Citation Index Expanded (SCI-EXPANDED),1T5XY,Otolaryngol. Head Neck Surg.,1,1,1,42,"Ralli, Massimo/0000-0001-8776-0421",...,0,0,0,0,0,0,0,0,0,1
38,2022-07-25,WOS:000478585400040,Arts &amp; Humanities Citation Index (A&amp;HCI),IN3OE,Religions,0,0,0,33,,...,0,0,0,0,0,0,0,0,0,1
42,2022-07-25,WOS:000712542100002,Science Citation Index Expanded (SCI-EXPANDED),WO6DE,Eur. Rev. Med. Pharmacol. Sci.,0,0,0,38,,...,0,0,0,0,0,0,0,0,0,1
43,2022-07-25,WOS:000477789200002,Arts &amp; Humanities Citation Index (A&amp;HCI),IM1ZA,Milli Folklor,1,0,1,8,"Ekici, Metin/0000-0002-9400-8462",...,0,0,0,0,0,0,0,0,0,1


In [55]:
## save as pickle!!!

import pickle

with open('dataframe_papers.pickle', 'wb') as f:
    pickle.dump(df_final, f)