In [1]:
import json
import pandas as pd

# Input/output locations, given relative to the notebook's working directory.
# CSV with one row per paper; loaded into `df_paper` by the read cell below.
paper_filename = '../data/paper.csv'
# NOTE(review): not referenced by any cell visible in this notebook — confirm whether it is still needed.
scholar_filename = '../data/scholar.csv'
# Destination of the aggregated statistics, written as JSON by the last cell.
output_statistic_file = '../data/statistic.json'

In [2]:
# Load the paper list from CSV and order it by year, then venue, then title
# (all three keys descending, so the most recent year comes first).
df_paper = (
    pd.read_csv(paper_filename, sep=',', header=0)
    .sort_values(by=['year', 'booktitle', 'title'], ascending=False)
)
display(df_paper)

Unnamed: 0,year,type,author,title,field,tag,booktitle,abbr,vol,no,pages,doi
0,2025,article,"Eduard Baranov, Axel Legay",Baital: Sampling configurable systems with hig...,Generation,,Science of Computer Programming,,240,,103209,10.1016/J.SCICO.2024.103209
1,2025,article,"Andrea Bombarda, Angelo Gargantini",On the Completion of Partial Combinatorial Tes...,Generation,,SN Computer Science,SNCS,6,4,383,10.1007/S42979-025-03937-Y
2,2025,article,"Pierre Martou, Benoît Duhoux, Kim Mens, Axel L...",Combinatorial transition testing in dynamicall...,Application,,Journal of Systems and Software,JSS,221,,112260,10.1016/J.JSS.2024.112260
3,2025,article,"Kambiz Nezami Balouchi, Julien Mercier, Robert...",An exploratory empirical eye-tracker study of ...,Application,,Journal of Systems and Software,JSS,220,,112261,10.1016/J.JSS.2024.112261
4,2025,article,"Yan Wang, Xintao Niu, Huayao Wu, Changhai Nie,...",Top-down: A better strategy for incremental co...,Generation,,Information and Software Technology,IST,178,,107601,10.1016/J.INFSOF.2024.107601
...,...,...,...,...,...,...,...,...,...,...,...,...
1090,1989,article,"Charles J. Colbourn, Paul C. van Oorschot",Applications of combinatorial designs in compu...,Application,Survey,ACM Computing Surveys,CSUR,21,2,223-250,10.1145/66443.66446
1091,1988,article,"Gadiel Seroussi, Nader H. Bshouty",Vector Sets for Exhaustive Testing of Logic Ci...,Application,,IEEE Transaction on Information Theory,,34,3,513-522,10.1109/18.6031
1092,1987,inproceedings,Keizo Tatsumi,Test Case Design Support System,Application,Constraint,International Conference on Quality Control,,,,615-620,
1093,1987,inproceedings,"Keizo Tatsumi, S. Watanabe, Y. Takeuchi, H. Sh...",Conceptual support for test case design,Application,Constraint,"International Computers, Software & Applicatio...",COMPSAC,,,285-290,


In [3]:
# Publications per year ('number') together with their running total ('cumulative').
# Only years that actually appear in df_paper get a row.
publication_data = (
    df_paper.groupby('year')
    .size()
    .to_frame('number')
    .assign(cumulative=lambda per_year: per_year['number'].cumsum())
    .reset_index()
)
display(publication_data)

Unnamed: 0,year,number,cumulative
0,1985,1,1
1,1987,2,3
2,1988,1,4
3,1989,1,5
4,1992,1,6
5,1994,3,9
6,1995,1,10
7,1996,3,13
8,1997,2,15
9,1998,6,21


In [4]:
# Topic distribution: number of papers in each field, largest field first.
distribution_data = (
    df_paper.groupby('field')
    .size()
    .to_frame('count')
    .reset_index()
    .sort_values('count', ascending=False)
)
distribution_data

Unnamed: 0,field,count
3,Generation,465
0,Application,344
2,Evaluation,88
5,Optimization,70
4,Model,52
1,Diagnosis,46
6,Other,28


In [5]:
# Annual number of publications for each topic.
# The axes come from the earlier summaries: the years of publication_data and
# the fields of distribution_data (the latter ordered by descending count).
year = publication_data['year'].tolist()
field = distribution_data['field'].tolist()

# Full <year, field> grid, so that combinations without any paper still get a row.
full_index = pd.MultiIndex.from_product([year, field], names=['year', 'field'])
full_df = full_index.to_frame(index=False)

# Number of papers actually recorded for each <year, field> combination.
temp = df_paper.groupby(['year', 'field']).size().reset_index(name='number')

# Left-join the counts onto the grid; combinations with no paper become 0,
# after which the counts are cast back to integers.
topic_data = (
    pd.merge(full_df, temp, how='left', on=['year', 'field'])
    .fillna(0)
    .astype({'number': int})
)

display(topic_data)

Unnamed: 0,year,field,number
0,1985,Generation,0
1,1985,Application,1
2,1985,Evaluation,0
3,1985,Optimization,0
4,1985,Model,0
...,...,...,...
254,2025,Evaluation,1
255,2025,Optimization,1
256,2025,Model,1
257,2025,Diagnosis,0


### Final Data

Index page:
- `cumulative-2000`: cumulative number of publications, from 2000 to present (bar chart)
- `cumulative`: cumulative number of publications (bar chart)
- `distribution`: distribution of topics (pie chart)

Statistics chart page:
- `annual`: annual number of publications (line chart)
- `topic`: annual number of publications per topic (bar chart, stacked)

In [6]:
# Assemble the payload that is dumped to the statistics JSON file.
# Each top-level key holds the series for one chart.
data = {
    'cumulative-2000': {},   # {year = [], value = []}
    'cumulative': {},        # {year = [], value = []}
    'annual': {},            # {year = [], value = []}
    'distribution': {},      # {fields = [], count = []}
    'topic': {}              # {field_1 = [], field_2 = [], ...}
}

year = publication_data['year'].values.tolist()
number = publication_data['number'].values.tolist()
cumulative = publication_data['cumulative'].values.tolist()

# cumulative data, from 2000 to now
# publication_data is built with groupby('year'), so `year` is in ascending order
# and only contains years that have at least one paper. Slice from the first
# year >= 2000 instead of calling year.index(2000), which raises ValueError when
# no paper was published in exactly 2000. If every year is earlier than 2000,
# both lists are simply empty.
start_year = 2000
index = next((i for i, y in enumerate(year) if y >= start_year), len(year))
data['cumulative-2000']['year'] = year[index:]
data['cumulative-2000']['value'] = cumulative[index:]

# cumulative data
data['cumulative']['year'] = year
data['cumulative']['value'] = cumulative

# annual data
data['annual']['year'] = year
data['annual']['value'] = number

# distribution data
fields = distribution_data['field'].values.tolist()
data['distribution']['fields'] = fields
data['distribution']['count'] = distribution_data['count'].values.tolist()

# topic data: one list of annual counts per field
# The loop variable is deliberately not called `field`, so the `field` list
# created by the topic cell is not overwritten with a plain string.
for field_name in fields:
    subset = topic_data[topic_data['field'] == field_name]
    data['topic'][field_name] = subset['number'].tolist()
    # The 'topic' entry stores no years of its own, so every series must follow
    # exactly the same year axis as the other charts.
    assert subset['year'].tolist() == year, (
        f"topic series for {field_name!r} is not aligned with the year axis"
    )

In [7]:
# Persist the collected statistics as indented JSON, keeping non-ASCII characters as-is.
with open(output_statistic_file, 'w', encoding='utf-8') as out_file:
    json.dump(
        obj=data,
        fp=out_file,
        ensure_ascii=False,
        indent=4,
    )