In [13]:
import re
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords

In [16]:
def openfile(path):
    f=open(path)
    text=f.read()
    f.close() 
    all_articles=text.split("\n\n")
    return all_articles


def get_times(all_articles):
    split_doc_by_date = {}
    publish_date = {}
    count = 0
    for art in all_articles:
        years = re.findall(r'\sPY [0-9]+', art)
        if len(years)==1:
            count += 1
            year = years[0]
            year = year.split("PY")[1].strip()
            if year not in publish_date:
                publish_date[year] = 1
            else:
                publish_date[year] += 1
            if year not in split_doc_by_date:
                split_doc_by_date[year] = [art]
            else:
                split_doc_by_date[year].append(art)              
    print("actual count:",count)
    return publish_date, split_doc_by_date


def get_fundations(split_doc_by_date):
    topics = ['bone','polymer','chitosan','tissue engineering','hydrogel','scaffold','nanoparticles',\
             'adhesion','regeneration','microstructure']
    fundation_by_year = {}

    for year,docs in split_doc_by_date.items():
        fundation_by_topic = {}
        for article in docs:
            FUs = re.findall(r"\sFU [\w\s\W]*\nFX", article)
            if len(FUs)==1:
                FU = FUs[0].split("FX")[0].strip()
                FU = FU.split("FU")[1].strip()
                num_fu = len(FU.split(";"))
                lower_article = article.lower()
                for topic in topics:
                    if topic in lower_article:
                        if topic not in fundation_by_topic:
                            fundation_by_topic[topic] = num_fu
                        else:
                            fundation_by_topic[topic] += num_fu
        fundation_by_year[year] = fundation_by_topic
    return fundation_by_year,fundation_by_topic  
        

if __name__ == "__main__":
    all_articles = []
    
    for i in range(1,42):
        path = "../data/biomaterial/savedrecs%d.txt"%i
        articles = openfile(path)
        all_articles = all_articles + articles
    
    for i in range(1,48):
        path = "../data/biomedical_material/savedrecs%d.txt"%i
        articles = openfile(path)
        all_articles = all_articles + articles
    
    for i in range(1,10):
        path = "../data/oa_biomaterial/savedrecs%d.txt"%i
        articles = openfile(path)
        all_articles = all_articles + articles
        
    for i in range(1,10):
        path = "../data/oa_biomedical_material/savedrecs%d.txt"%i
        articles = openfile(path)
        all_articles = all_articles + articles
    print("len total:",len(all_articles))
    
    publish_date, split_doc_by_date  = get_times(all_articles)
    print("publish date:",publish_date)
    fundation_by_year,fundation_by_topic = get_fundations(split_doc_by_date)

len total: 52125
actual count: 52014
publish date: {'2018': 4875, '2017': 5905, '2016': 5041, '2015': 5096, '2014': 4767, '2013': 3853, '2012': 3335, '2011': 2842, '2010': 2696, '2009': 2190, '2008': 1874, '2007': 1639, '2006': 1206, '2005': 1060, '2004': 933, '2003': 691, '2002': 619, '2001': 521, '2000': 450, '1999': 436, '1998': 396, '1997': 337, '1996': 242, '1995': 180, '1994': 185, '1993': 191, '1992': 151, '1991': 122, '1990': 25, '1989': 14, '1988': 17, '1987': 18, '1986': 9, '1985': 13, '1984': 17, '1983': 7, '1982': 11, '1981': 4, '1980': 8, '1979': 2, '1978': 7, '1977': 3, '1976': 9, '1975': 5, '1974': 4, '1973': 3, '1972': 3, '2019': 2}


In [17]:
fundation_by_year

{'2018': {'polymer': 4812,
  'chitosan': 841,
  'bone': 2651,
  'tissue engineering': 2242,
  'hydrogel': 2227,
  'adhesion': 1442,
  'regeneration': 1823,
  'nanoparticles': 2778,
  'microstructure': 790,
  'scaffold': 2546},
 '2017': {'bone': 3107,
  'tissue engineering': 2417,
  'polymer': 5930,
  'scaffold': 2951,
  'nanoparticles': 3305,
  'regeneration': 1831,
  'adhesion': 1790,
  'hydrogel': 2513,
  'chitosan': 790,
  'microstructure': 757},
 '2016': {'polymer': 4350,
  'hydrogel': 1560,
  'scaffold': 2238,
  'nanoparticles': 2770,
  'adhesion': 1266,
  'bone': 2521,
  'microstructure': 639,
  'regeneration': 1543,
  'chitosan': 664,
  'tissue engineering': 1817},
 '2015': {'polymer': 4615,
  'nanoparticles': 2199,
  'bone': 2617,
  'adhesion': 1615,
  'scaffold': 2462,
  'regeneration': 1506,
  'hydrogel': 1678,
  'chitosan': 671,
  'microstructure': 669,
  'tissue engineering': 1877},
 '2014': {'polymer': 4410,
  'bone': 2450,
  'tissue engineering': 1941,
  'hydrogel': 1411,

In [20]:
df0 = pd.DataFrame.from_dict(fundation_by_year['2000'],orient='index',columns=['2000'])
df1 = pd.DataFrame.from_dict(fundation_by_year['2001'],orient='index',columns=['2001'])
df2 = pd.DataFrame.from_dict(fundation_by_year['2002'],orient='index',columns=['2002'])
df3 = pd.DataFrame.from_dict(fundation_by_year['2003'],orient='index',columns=['2003'])
df4 = pd.DataFrame.from_dict(fundation_by_year['2004'],orient='index',columns=['2004'])
df5 = pd.DataFrame.from_dict(fundation_by_year['2005'],orient='index',columns=['2005'])
df6 = pd.DataFrame.from_dict(fundation_by_year['2006'],orient='index',columns=['2006'])
df7 = pd.DataFrame.from_dict(fundation_by_year['2007'],orient='index',columns=['2007'])
df8 = pd.DataFrame.from_dict(fundation_by_year['2008'],orient='index',columns=['2008'])
df9 = pd.DataFrame.from_dict(fundation_by_year['2009'],orient='index',columns=['2009'])
df10 = pd.DataFrame.from_dict(fundation_by_year['2010'],orient='index',columns=['2010'])
df11 = pd.DataFrame.from_dict(fundation_by_year['2011'],orient='index',columns=['2011'])
df12 = pd.DataFrame.from_dict(fundation_by_year['2012'],orient='index',columns=['2012'])
df13 = pd.DataFrame.from_dict(fundation_by_year['2013'],orient='index',columns=['2013'])
df14 = pd.DataFrame.from_dict(fundation_by_year['2014'],orient='index',columns=['2014'])
df15 = pd.DataFrame.from_dict(fundation_by_year['2015'],orient='index',columns=['2015'])
df16 = pd.DataFrame.from_dict(fundation_by_year['2016'],orient='index',columns=['2016'])
df17 = pd.DataFrame.from_dict(fundation_by_year['2017'],orient='index',columns=['2017'])
df18 = pd.DataFrame.from_dict(fundation_by_year['2018'],orient='index',columns=['2018'])

In [22]:
df1

Unnamed: 0,2001
bone,8
tissue engineering,8
scaffold,8
adhesion,2
microstructure,9


In [30]:
df = pd.merge(df0, df1, how='outer', left_index=True, right_index=True)
df = pd.merge(df, df2, how='outer', left_index=True, right_index=True)
df = pd.merge(df, df3, how='outer', left_index=True, right_index=True)
df = pd.merge(df, df4, how='outer', left_index=True, right_index=True)
df = pd.merge(df, df5, how='outer', left_index=True, right_index=True)
df = pd.merge(df, df6, how='outer', left_index=True, right_index=True)
df = pd.merge(df, df7, how='outer', left_index=True, right_index=True)
df = pd.merge(df, df8, how='outer', left_index=True, right_index=True)
df = pd.merge(df, df9, how='outer', left_index=True, right_index=True)
df = pd.merge(df, df10, how='outer', left_index=True, right_index=True)
df = pd.merge(df, df11, how='outer', left_index=True, right_index=True)
df = pd.merge(df, df12, how='outer', left_index=True, right_index=True)
df = pd.merge(df, df13, how='outer', left_index=True, right_index=True)
df = pd.merge(df, df14, how='outer', left_index=True, right_index=True)
df = pd.merge(df, df15, how='outer', left_index=True, right_index=True)
df = pd.merge(df, df16, how='outer', left_index=True, right_index=True)
df = pd.merge(df, df17, how='outer', left_index=True, right_index=True)
df = pd.merge(df, df18, how='outer', left_index=True, right_index=True)
df = df.fillna(0)
df

Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
adhesion,0,2.0,0.0,0.0,0,4.0,23.0,25,200,492,793,830,942,1087,1607,1615,1266,1790,1442
bone,0,8.0,1.0,0.0,0,0.0,7.0,44,284,719,1151,1252,1496,1858,2450,2617,2521,3107,2651
chitosan,0,0.0,0.0,0.0,0,0.0,0.0,5,42,217,242,245,386,522,556,671,664,790,841
hydrogel,0,0.0,0.0,0.0,0,6.0,2.0,1,97,371,561,658,897,1099,1411,1678,1560,2513,2227
microstructure,0,9.0,0.0,0.0,0,0.0,12.0,11,64,179,215,323,444,501,622,669,639,757,790
nanoparticles,0,0.0,0.0,0.0,0,0.0,7.0,20,135,452,707,803,1198,1543,2046,2199,2770,3305,2778
polymer,0,0.0,0.0,10.0,0,10.0,17.0,48,474,1281,2072,2050,2718,3359,4410,4615,4350,5930,4812
regeneration,0,0.0,0.0,0.0,0,0.0,0.0,13,106,297,574,665,787,1031,1260,1506,1543,1831,1823
scaffold,0,8.0,0.0,0.0,0,0.0,14.0,20,279,588,993,1152,1349,1525,2468,2462,2238,2951,2546
tissue engineering,0,8.0,0.0,0.0,0,0.0,3.0,16,225,605,1012,1077,1425,1552,1941,1877,1817,2417,2242


In [31]:
df.to_csv("../data/fundations.csv",index=True)