In [1]:
import pandas as pd 
import numpy as np 
from bertopic import BERTopic 
import os
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
import joblib
from sklearn.manifold import TSNE
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the abstracts CSV into `df`. engine='python' selects pandas'
# Python parsing engine instead of the default C engine.
df = pd.read_csv('D:/abstract/wos_0531/bib_360446/dropna_abstract_334765.csv', engine='python')

In [3]:
# Copy the 'Publication Year' column into a plain Python list, one entry per row of `df`.
year = list(df['Publication Year'])

In [4]:
import math

# Normalise the publication years: a missing value (NaN) is replaced by 2021,
# every other value is converted to a plain int.
year = [2021 if math.isnan(value) else int(value) for value in year]

In [5]:
import collections
# Tally how many entries of `year` fall on each distinct value; shown as the cell output.
collections.Counter(year)

Counter({2021: 21967,
         2020: 26163,
         2019: 25738,
         2018: 23687,
         2017: 20195,
         2016: 18589,
         2015: 17626,
         2014: 15842,
         2012: 11718,
         2011: 10900,
         2010: 10806,
         2009: 10947,
         2007: 9030,
         2005: 8074,
         2003: 7691,
         2002: 6836,
         2001: 6571,
         1999: 5694,
         1995: 3759,
         2022: 1067,
         2013: 14120,
         2008: 9933,
         2006: 7710,
         2004: 7772,
         2000: 6863,
         1997: 5425,
         1996: 4353,
         1994: 3660,
         1993: 3316,
         1992: 3219,
         1998: 5494})

In [6]:
def _decade_flags(years, start, end):
    """Return a 0/1 list marking which entries of ``years`` lie in ``[start, end]``.

    Parameters
    ----------
    years : sequence of numbers
        One publication year per document.
    start, end : int
        Inclusive bounds of the period.

    Returns
    -------
    list of int
        Same length as ``years``; 1 where ``start <= year <= end``, else 0.
    """
    return [1 if start <= y <= end else 0 for y in years]


# One indicator list per publication period, aligned position-by-position with `year`.
first_decade = _decade_flags(year, 1992, 2001)
second_decade = _decade_flags(year, 2002, 2011)
# Note: this last window covers 11 calendar years (2012 through 2022 inclusive).
third_decade = _decade_flags(year, 2012, 2022)

In [12]:
# Load the labelled-abstracts CSV into `df_label`, using pandas' Python parsing engine.
df_label = pd.read_csv('D:/abstract/wos_0531/bib_360446/unique_multilabel_withlabel.csv', engine='python')

In [13]:
# Show `df_label` as the cell output.
df_label

Unnamed: 0,Abstract,topic,Research Areas,label,index
0,This paper presents an updated checklist of ma...,13,Marine & Freshwater Biology; Oceanography,Marine & Freshwater Biology,3
1,Small time behavior of gravity driven free sur...,16,Engineering; Oceanography,Engineering,2
2,This Collective Article includes records of 29...,13,Fisheries; Geology; Marine & Freshwater Biolog...,Marine & Freshwater Biology,3
3,This Collective Article presents information a...,13,Fisheries; Geology; Marine & Freshwater Biolog...,Marine & Freshwater Biology,3
4,The use of pesticides in agriculture has alway...,73,Engineering; Oceanography,Engineering,2
...,...,...,...,...,...
334760,The simplicity of the hydrochemical stationari...,40,Engineering; Geology; Water Resources,Water Resources,5
334761,In many smallholder farms in sub-Saharan Afric...,40,Geology; Meteorology & Atmospheric Sciences; W...,Water Resources,5
334762,In this study we tested whether we can use a t...,0,Environmental Sciences & Ecology; Remote Sensing,Geology,4
334763,A novel approach of using variability as a maj...,69,Meteorology & Atmospheric Sciences,Meteorology & Atmospheric Sciences,1


In [51]:
# Copy the 'topic' column of `df_label` into a plain Python list, one entry per row.
topic = list(df_label['topic'])

In [84]:
# Count documents per (period, topic). Row 0/1/2 corresponds to
# first_decade / second_decade / third_decade; the column is selected by the
# document's value in `topic`.
# NOTE(review): entry k of `topic` is paired with entry k of each flag list —
# presumably both come from tables in the same row order; confirm.
topic_growth = np.zeros([3, 100])
period_flags = (first_decade, second_decade, third_decade)
for doc_pos, topic_index in enumerate(topic):
    for period, flags in enumerate(period_flags):
        if flags[doc_pos] == 1:
            topic_growth[period, topic_index] += 1

In [85]:
# Show the `topic_growth` array as the cell output.
topic_growth

array([[ 368., 1324.,  723., 1291., 1035., 1478., 1561.,  274.,  996.,
        1640., 1206.,   96., 1661.,  912.,  555., 1551.,  475.,  657.,
         815.,  806.,  288., 1173.,  170.,  257.,  812., 1121.,  204.,
         810.,  979.,   94.,  594.,  468.,  642.,  320.,  607.,  643.,
         109.,   96.,  529.,  465.,  136.,  729.,  125.,  654.,  883.,
         591.,  206.,  227.,  489.,  792.,  264.,  497.,  208.,  217.,
         207.,  491.,  428.,  419.,  449.,  663.,  472.,  388.,  500.,
         369.,  273.,  587.,  438.,   69.,  322.,  226.,  592.,  364.,
          65.,  248.,  540.,  466.,  389.,  273.,  119.,  331.,  133.,
         240.,   44.,  305.,  296.,   80.,  211.,  123.,  269.,  239.,
          92.,  136.,  172.,  182.,  184.,   39.,   38.,   53.,    7.,
           0.],
       [1290., 2421., 1650., 2144., 1676., 2135., 2022.,  902., 1878.,
        2038., 1983.,  489., 1618., 1454., 1471., 1554.,  934., 1196.,
        1629., 1587., 1218., 1431.,  620.,  751., 1306., 1540

In [86]:
# Wrap the `topic_growth` array in a DataFrame, requesting an integer dtype for the values.
df_topic_growth = pd.DataFrame(topic_growth, dtype=int)

In [87]:
# Show `df_topic_growth` as the cell output.
df_topic_growth

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,368,1324,723,1291,1035,1478,1561,274,996,1640,...,92,136,172,182,184,39,38,53,7,0
1,1290,2421,1650,2144,1676,2135,2022,902,1878,2038,...,182,289,361,421,315,161,129,123,20,8
2,5460,3118,4438,2934,3595,2618,2609,4879,3111,2306,...,1142,979,833,699,632,652,501,384,461,7


In [66]:
# Write `df_topic_growth` to CSV; index=False leaves the row index out of the file.
df_topic_growth.to_csv('D:/abstract/wos_0531/bib_360446/topic_growth_number.csv', index=False)

In [88]:
# Relative growth of each topic from one period to the next.
#
# Start from a copy: a plain `topic_growth_rate = topic_growth` only creates a
# second name for the same array, so writing the rates would overwrite the raw
# counts in `topic_growth` as well.
topic_growth_rate = topic_growth.copy()

# Rows 1 and 2 become (current - previous) / previous, always computed from
# the untouched counts in `topic_growth`. Row 0 has no previous period and
# keeps its original values.
# A zero count in the previous period yields inf (nan for 0/0), exactly as the
# element-wise loop did; the floating-point warning is silenced on purpose.
with np.errstate(divide='ignore', invalid='ignore'):
    topic_growth_rate[1:] = (topic_growth[1:] - topic_growth[:-1]) / topic_growth[:-1]

  topic_growth_rate[i,j] = (topic_growth[i,j]-topic_growth[i-1,j])/topic_growth[i-1,j]


In [90]:
# Wrap the `topic_growth_rate` array in a DataFrame (default integer row and column labels).
df_topic_growth_rate = pd.DataFrame(topic_growth_rate)

In [91]:
# Keep two decimal places in every cell of the growth-rate table.
df_topic_growth_rate = df_topic_growth_rate.round(2)

In [92]:
# Show `df_topic_growth_rate` as the cell output.
df_topic_growth_rate

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,368.0,1324.0,723.0,1291.0,1035.0,1478.0,1561.0,274.0,996.0,1640.0,...,92.0,136.0,172.0,182.0,184.0,39.0,38.0,53.0,7.0,0.0
1,2.51,0.83,1.28,0.66,0.62,0.44,0.3,2.29,0.89,0.24,...,0.98,1.12,1.1,1.31,0.71,3.13,2.39,1.32,1.86,inf
2,3.23,0.29,1.69,0.37,1.14,0.23,0.29,4.41,0.66,0.13,...,5.27,2.39,1.31,0.66,1.01,3.05,2.88,2.12,22.05,-0.12


In [94]:
# Select the first row (position 0) of `df_topic_growth_rate` across all columns.
df_topic_growth_rate.iloc[0,:]

0      368.0
1     1324.0
2      723.0
3     1291.0
4     1035.0
       ...  
95      39.0
96      38.0
97      53.0
98       7.0
99       0.0
Name: 0, Length: 100, dtype: float64

In [118]:
topic_name = pd.read_csv('D:/abstract/wos_0531/bib_360446/topic_name.csv')

# For each of the three rows of `df_topic_growth_rate`, flag the 10 columns
# with the largest values (1 = selected) and print the matching entry from the
# first column of `topic_name`, largest value first.
#
# Columns are ranked by position with a stable argsort instead of sorting the
# values and looking each one up again with list.index(). list.index() always
# returns the FIRST position holding a value, so whenever two topics shared
# the same (rounded) value the same column was selected twice: fewer than 10
# topics were flagged and a name was printed repeatedly. With the stable sort,
# tied topics are taken in ascending column order and NaN values rank last.
top_k = 10
topic_growth_top10 = np.zeros([3,100])
for i in range(3):
    row_values = df_topic_growth_rate.iloc[i,:].to_numpy(dtype=float)
    # Negate so that an ascending sort yields the largest values first.
    top_columns = np.argsort(-row_values, kind='stable')[:top_k]
    for col in top_columns:
        topic_growth_top10[i,col]=1
        print(topic_name.iloc[col,0])

acoustic
zooplankton
phytoplankton
vorticity
fish
species
spawning
chlorophyll
bacterial
atlantic
mufflers
landslide
neural
groundwater
soil
water
monsoon
uncertainty
flood
energy
plastic
water
flood
neural
uncertainty
energy
pile
climate
drought
coastal


In [102]:
# Show the `topic_growth_top10` indicator array as the cell output.
topic_growth_top10

array([[0., 1., 0., 1., 0., 1., 1., 0., 0., 1., 1., 0., 1., 0., 0., 1.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0.

In [105]:
# Wrap the 0/1 `topic_growth_top10` array in a DataFrame, requesting an integer dtype.
df_topic_growth_top10 = pd.DataFrame(topic_growth_top10,dtype=int)

In [113]:
# Show `df_topic_growth_top10` as the cell output.
df_topic_growth_top10

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0,1,0,1,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0


In [107]:
# Write `df_topic_growth_top10` to CSV; index=False leaves the row index out of the file.
df_topic_growth_top10.to_csv('D:/abstract/wos_0531/bib_360446/topic_growth_top10.csv',index=False)

In [114]:
# Select the first row (position 0) of `df_topic_growth_top10` across all columns.
df_topic_growth_top10.iloc[0,:]

InvalidIndexError: (0, slice(None, None, None))