In [107]:
import pandas as pd
import numpy as np
from wordcloud import STOPWORDS
from collections import Counter
import re


In [250]:
def get_data(data):
    """Collapse the WoS categories of all records into one string per year.

    Rows whose ``"WoS Categories"`` value is missing are dropped first. The
    remaining category strings are grouped by ``"Publication Year"`` and
    joined with a single space.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``"Publication Year"`` and
        ``"WoS Categories"``; any other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``"Publication Year"`` with a single ``"WoS Categories"``
        column holding the space-joined text for that year.
    """
    def _join_with_spaces(categories):
        # Concatenate every category string of one year into a single text.
        return categories.str.cat(sep=' ')

    has_category = data['WoS Categories'].notna()
    labelled = data[has_category].reset_index(drop=True)
    year_and_category = labelled[["Publication Year", "WoS Categories"]]
    per_year = year_and_category.groupby("Publication Year")[
        "WoS Categories"].apply(_join_with_spaces)
    return pd.DataFrame(per_year)


In [251]:
# Load the six per-network Web of Science exports and keep only the records
# published in 2000 or later.
Alex, Google, Le, Res, Se, VGG = (
    records[records["Publication Year"] >= 2000]
    for records in map(pd.read_csv, (
        "AlexNet.csv",
        "GoogleNet.csv",
        "LeNet.csv",
        "ResNet.csv",
        "SeNet.csv",
        "VGG.csv",
    ))
)


In [252]:
# One yearly category table per network ...
Alex_data, Google_data, Le_data, Res_data, Se_data, VGG_data = map(
    get_data, (Alex, Google, Le, Res, Se, VGG))
# ... and one table pooling the records of all six networks.
total_data = get_data(
    pd.concat([Alex, Google, Le, Res, Se, VGG]).reset_index())
total_data.head()

Unnamed: 0_level_0,WoS Categories
Publication Year,Unnamed: 1_level_1
2000.0,"Dentistry, Oral Surgery & Medicine Computer Sc..."
2001.0,"Anthropology; Archaeology Chemistry, Multidisc..."
2002.0,Biology Biodiversity Conservation; Ecology Phy...
2003.0,"Materials Science, Multidisciplinary; Physics,..."
2004.0,"Chemistry, Medicinal; Chemistry, Multidiscipli..."


In [253]:
#strip stopwords
def del_stop(text, stopwords=None):
    """Filter and normalise a list of ``(token, count)`` pairs.

    A token is discarded when it is a stopword or contains a digit. Each
    surviving token is cut at its first ``,`` or ``;`` so that trailing
    punctuation disappears (``"Engineering,"`` -> ``"Engineering"``).
    Tokens that normalise to the same word have their counts summed, so the
    same word is never reported twice.

    Parameters
    ----------
    text : iterable of (str, int)
        Token/count pairs, e.g. the result of ``Counter.most_common()``.
    stopwords : iterable of str, optional
        Tokens to discard. Defaults to wordcloud's ``STOPWORDS`` plus a few
        extra entries used by this notebook.

    Returns
    -------
    list of [str, int]
        ``[word, count]`` lists ordered by descending count; ties keep the
        order in which the words were first seen in ``text``.
    """
    if stopwords is None:
        stopwords = set(STOPWORDS)
        stopwords.update(
            ["using", "Using", "A", "THE", "An", "B", "&", "Science,"])
    else:
        # A set gives O(1) membership tests and accepts any iterable.
        stopwords = set(stopwords)

    totals = {}
    for token, count in text:
        if token in stopwords or any(ch.isdigit() for ch in token):
            continue
        word = re.split(",|;", token)[0]
        # Drop tokens that were pure punctuation (empty after the cut) and
        # stopwords that were only hidden by trailing punctuation ("the,").
        if not word or word in stopwords:
            continue
        totals[word] = totals.get(word, 0) + count

    # sorted() is stable, so equal counts keep their first-seen order.
    ranked = sorted(totals.items(), key=lambda item: item[1], reverse=True)
    return [[word, count] for word, count in ranked]

#get frequent words each year
def get_freq_year(year,data):
    """Return the cleaned word frequencies for one year.

    Reads the ``"WoS Categories"`` text stored at index ``year`` of ``data``,
    splits it on whitespace, counts the tokens and passes the counts (most
    frequent first) through ``del_stop``.
    """
    tokens = data.loc[year, "WoS Categories"].split()
    ranked_counts = Counter(tokens).most_common()
    return del_stop(ranked_counts)
def get_words(data):
    """Return one cleaned word-frequency list per index entry of ``data``.

    The result follows the order of ``data.index``; each element is whatever
    ``get_freq_year`` returns for that index label.
    """
    return [get_freq_year(year, data) for year in list(data.index)]


In [287]:
from tkinter.tix import TCL_DONT_WAIT
from pyecharts import options as opts
from pyecharts.charts import WordCloud
from pyecharts.globals import SymbolType
from pyecharts.charts import Bar, Timeline
def get_cloud(input_data):
    """Build a pyecharts ``Timeline`` holding one word cloud per year.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Table accepted by ``get_words``; its index labels are used as the
        time points (converted with ``int``).

    Returns
    -------
    Timeline
        One ``WordCloud`` frame per index entry that has at least one word.
        Entries whose word list is empty are skipped, because no colour
        scale can be derived for them.
    """
    # Colours of the five count bands, from the lowest band to the highest.
    band_colors = ["#e3c9ff", "#c293ff", "#9a5cff", "#663fa2", "#4d3077"]

    tl = Timeline()
    # The schema is identical for every frame, so it is configured once.
    tl.add_schema(label_opts=opts.series_options.LabelOpts(color='white'))

    for year, words in zip(input_data.index, get_words(input_data)):
        if not words:
            # max() of an empty sequence would raise ValueError.
            continue

        top = max(count for _, count in words)
        # Band edges at 0 %, 20 %, 40 %, 60 %, 80 % and 100 % of the maximum.
        edges = [0, 0.2 * top, 0.4 * top, 0.6 * top, 0.8 * top, top]
        pieces = [
            {"max": upper, "min": lower,
             "label": '%d - %d' % (lower, upper), "color": color}
            for lower, upper, color in zip(edges, edges[1:], band_colors)
        ]
        pieces.reverse()  # list the highest band first in the legend

        cloud = (
            # Sizes are CSS lengths, so they need an explicit unit.
            WordCloud(init_opts=opts.InitOpts(width="900px", height="800px"))
            .add("", data_pair=words,
                 word_size_range=[20, 100],
                 word_gap=10,
                 rotate_step=30)
            .set_global_opts(
                title_opts=opts.TitleOpts(
                    title="Keywords Transition",
                    title_textstyle_opts=opts.TextStyleOpts(color='white')),
                visualmap_opts=opts.VisualMapOpts(
                    textstyle_opts=opts.TextStyleOpts(color='white'),
                    is_piecewise=True,
                    pieces=pieces),
            )
        )
        tl.add(cloud, "{}".format(int(year)))
    return tl

In [288]:
from pyecharts.charts import Tab
# Build one timeline of word clouds per data set (pooled data first).
(total_cloud, Alex_cloud, Google_cloud, Le_cloud,
 Res_cloud, Se_cloud, VGG_cloud) = map(get_cloud, (
    total_data, Alex_data, Google_data, Le_data,
    Res_data, Se_data, VGG_data))

# Put every timeline on its own tab, in the same order as above.
tab = Tab()
for tab_title, timeline in zip(
        ("All", "AlexNet", "GoogleNet", "LeNet", "ResNet", "SeNet", "VGG"),
        (total_cloud, Alex_cloud, Google_cloud, Le_cloud,
         Res_cloud, Se_cloud, VGG_cloud)):
    tab.add(timeline, tab_title)

# Write a standalone HTML page, then show the tabs inline in the notebook.
tab.render("word_cloud.html")
tab.render_notebook()