In [2]:
import re
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import stopwords

In [64]:
def openfile(path):
    """Read a text file and split its contents on blank lines.

    Parameters
    ----------
    path : str
        Path of the file to read (opened in text mode with the default encoding).

    Returns
    -------
    list of str
        The pieces of the file's text separated by two consecutive newlines.
    """
    # The context manager closes the handle even if read() raises.
    with open(path) as f:
        text = f.read()
    return text.split("\n\n")


def get_times(all_articles):
    """Group records by the year found in their ``PY`` tag.

    A record is kept only when a whitespace-preceded ``PY <digits>`` tag occurs
    exactly once in it; records with zero or several matches are skipped.
    Prints the number of kept records.

    Parameters
    ----------
    all_articles : iterable of str
        Record texts to scan.

    Returns
    -------
    publish_date : dict
        Year string -> number of kept records for that year.
    split_doc_by_date : dict
        Year string -> list of the kept record texts for that year.
    """
    split_doc_by_date = {}
    for record in all_articles:
        matches = re.findall(r'\sPY [0-9]+', record)
        if len(matches) != 1:
            continue
        # Keep only the digits that follow the "PY" tag.
        year = matches[0].split("PY")[1].strip()
        split_doc_by_date.setdefault(year, []).append(record)

    # Per-year counts follow directly from the grouped lists (same key order).
    publish_date = {year: len(records) for year, records in split_doc_by_date.items()}
    print("actual count:", sum(publish_date.values()))
    return publish_date, split_doc_by_date


def get_citations(split_doc_by_date):
    """Sum the ``NR`` field per topic keyword, separately for every year.

    For each record whose text contains exactly one whitespace-preceded
    ``NR <digits>`` tag, the number is added to the running total of every
    topic keyword that occurs (case-insensitively, as a substring) in the
    record. Records with zero or several ``NR`` matches are skipped.

    NOTE(review): in Web of Science exports the NR tag appears to be the
    cited-reference count (references the article makes), while TC is the
    times-cited count -- confirm NR is the intended metric.

    Parameters
    ----------
    split_doc_by_date : dict
        Year -> list of record texts.

    Returns
    -------
    citation_by_year : dict
        Year -> {topic: summed NR} for the topics seen in that year.
    citation_by_topic : dict
        The per-topic dict of the last year iterated, or an empty dict when
        ``split_doc_by_date`` is empty.
    """
    topics = ['bone','polymer','chitosan','tissue engineering','hydrogel','scaffold','nanoparticles',\
             'adhesion','regeneration','microstructure']
    citation_by_year = {}
    # Bound up front so the second return value exists even when there are no
    # years to iterate (previously this raised UnboundLocalError).
    citation_by_topic = {}

    for year, docs in split_doc_by_date.items():
        citation_by_topic = {}
        for article in docs:
            NRs = re.findall(r'\sNR [0-9]+', article)
            if len(NRs) != 1:
                continue
            # Keep only the digits that follow the "NR" tag.
            nr_value = int(NRs[0].split("NR")[1].strip())
            lower_article = article.lower()
            for topic in topics:
                if topic in lower_article:
                    citation_by_topic[topic] = citation_by_topic.get(topic, 0) + nr_value
        citation_by_year[year] = citation_by_topic
    return citation_by_year, citation_by_topic
        

if __name__ == "__main__":
    # (folder under ../data, number of savedrecs<N>.txt files in it).
    # Files are numbered starting at 1.
    record_sources = [
        ("biomaterial", 41),
        ("biomedical_material", 47),
        ("oa_biomaterial", 9),
        ("oa_biomedical_material", 9),
    ]

    all_articles = []
    for folder, n_files in record_sources:
        for i in range(1, n_files + 1):
            path = "../data/%s/savedrecs%d.txt" % (folder, i)
            articles = openfile(path)
            # extend() grows the list in place; `all_articles + articles`
            # re-copied the whole accumulated list for every file.
            all_articles.extend(articles)
    print("len total:",len(all_articles))

    # Group records by year, then total the NR field per topic within each year.
    publish_date, split_doc_by_date  = get_times(all_articles)
    print("publish date:",publish_date)
    citation_by_year,citation_by_topic = get_citations(split_doc_by_date)

len total: 52125
actual count: 52014
publish date: {'2018': 4875, '2017': 5905, '2016': 5041, '2015': 5096, '2014': 4767, '2013': 3853, '2012': 3335, '2011': 2842, '2010': 2696, '2009': 2190, '2008': 1874, '2007': 1639, '2006': 1206, '2005': 1060, '2004': 933, '2003': 691, '2002': 619, '2001': 521, '2000': 450, '1999': 436, '1998': 396, '1997': 337, '1996': 242, '1995': 180, '1994': 185, '1993': 191, '1992': 151, '1991': 122, '1990': 25, '1989': 14, '1988': 17, '1987': 18, '1986': 9, '1985': 13, '1984': 17, '1983': 7, '1982': 11, '1981': 4, '1980': 8, '1979': 2, '1978': 7, '1977': 3, '1976': 9, '1975': 5, '1974': 4, '1973': 3, '1972': 3, '2019': 2}


In [65]:
# Display the nested {year: {topic: summed NR}} dict returned by get_citations.
citation_by_year

{'2018': {'polymer': 158907,
  'chitosan': 26203,
  'bone': 94862,
  'tissue engineering': 74267,
  'hydrogel': 62601,
  'adhesion': 37829,
  'regeneration': 56000,
  'scaffold': 76519,
  'nanoparticles': 86712,
  'microstructure': 19531},
 '2017': {'polymer': 177353,
  'adhesion': 41666,
  'bone': 97802,
  'tissue engineering': 73907,
  'scaffold': 79037,
  'nanoparticles': 100221,
  'regeneration': 53572,
  'hydrogel': 61021,
  'chitosan': 24216,
  'microstructure': 16870},
 '2016': {'polymer': 146019,
  'hydrogel': 44573,
  'scaffold': 67391,
  'nanoparticles': 86023,
  'adhesion': 32944,
  'bone': 87298,
  'microstructure': 14671,
  'regeneration': 47490,
  'chitosan': 21793,
  'tissue engineering': 64866},
 '2015': {'polymer': 139340,
  'nanoparticles': 68945,
  'bone': 80644,
  'adhesion': 42061,
  'tissue engineering': 60593,
  'scaffold': 67601,
  'regeneration': 46433,
  'hydrogel': 47315,
  'chitosan': 16948,
  'microstructure': 16971},
 '2014': {'polymer': 124439,
  'bone': 

In [100]:
# One single-column frame per year 2000-2018: index = topic, column = year label.
# Built with a comprehension instead of 19 pasted lines; df0..df18 stay bound
# because later cells refer to them by name.
years = [str(y) for y in range(2000, 2019)]
year_frames = [
    pd.DataFrame.from_dict(citation_by_year[year], orient='index', columns=[year])
    for year in years
]
(df0, df1, df2, df3, df4, df5, df6, df7, df8, df9,
 df10, df11, df12, df13, df14, df15, df16, df17, df18) = year_frames

In [105]:
# Preview the single-column frame built for the year 2000.
df0

Unnamed: 0,2000
adhesion,3133
bone,4416
polymer,6497
hydrogel,1669
tissue engineering,1666
regeneration,1107
scaffold,735
chitosan,486
microstructure,655
nanoparticles,34


In [113]:
# Join the per-year frames column-wise on their shared topic index.
# pd.merge defaults to an inner join, so a topic missing from any single year
# is dropped from the combined table.
year_frames = [df0, df1, df2, df3, df4, df5, df6, df7, df8, df9,
               df10, df11, df12, df13, df14, df15, df16, df17, df18]
df = year_frames[0]
for frame in year_frames[1:]:
    df = pd.merge(df, frame, left_index=True, right_index=True)
df

Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
adhesion,3133,5277,5010,4692,6756,7757,9781,10688,14476,15699,21498,23187,25499,29650,37813,42061,32944,41666,37829
bone,4416,8554,6482,7124,10984,11158,15545,21491,24954,29724,37471,42561,49985,60763,72326,80644,87298,97802,94862
polymer,6497,6240,8780,10643,14336,16421,23968,31701,42784,48136,62706,67499,80473,98028,124439,139340,146019,177353,158907
hydrogel,1669,1011,1633,1224,2753,2849,5013,5485,7347,10935,15004,19023,24215,30964,41810,47315,44573,61021,62601
tissue engineering,1666,2033,2415,3527,5626,6214,10066,17393,17040,21635,31789,37069,43379,49548,59680,60593,64866,73907,74267
regeneration,1107,1752,1609,1748,3073,3271,6473,8384,11207,12576,17325,20496,26672,31611,38889,46433,47490,53572,56000
scaffold,735,1299,1713,2817,4744,5885,9713,14268,18594,20533,30731,34105,39600,48708,66434,67601,67391,79037,76519
chitosan,486,391,289,1011,915,1519,2472,2293,5406,6403,6070,8215,11375,16535,16392,16948,21793,24216,26203
microstructure,655,606,796,818,1360,1846,2569,3415,2741,4364,6015,7123,9154,9257,14662,16971,14671,16870,19531
nanoparticles,34,712,106,709,1024,2264,3962,6829,14692,17346,21950,26854,33341,46459,59428,68945,86023,100221,86712


In [114]:
# Write the merged topic-by-year table to CSV; index=True keeps the row index as the first column.
df.to_csv("../data/citation.csv",index=True)