# Build the dataset

## Define filepaths

In [1]:
import os

In [2]:
# Path fragments; they are combined with os.path.join below.
root_dir = '../..'
data_dir = 'data/corpus'
dataset_dir = 'papers'
src_dir = 'src'

# Both CSV files sit in the same dataset folder, so their full paths are
# built through one shared expression.
input_filename, output_filename = 'data_input.csv', 'data_output.csv'
input_filepath, output_filepath = (
    os.path.join(root_dir, data_dir, dataset_dir, csv_name)
    for csv_name in (input_filename, output_filename)
)

## Load the data

In [3]:
import pandas as pd

In [4]:
# Load the abstracts; the `id` column is dropped so only the text column is kept.
input_df = pd.read_csv(input_filepath).drop(columns='id')
# Lowercase the texts with the vectorised string accessor. Unlike calling
# `x.lower()` on every element, it propagates missing values instead of
# raising AttributeError on them.
input_df['abstract'] = input_df['abstract'].str.lower()
input_df.head()

Unnamed: 0,abstract
0,"turing machines and g\""odel numbers are import..."
1,rna-sequencing has revolutionized biomedical r...
2,queuing models provide insight into the tempor...
3,"in a multiple-object auction, every bidder tri..."
4,in arxiv:1109.6438v1 [math.ag] we introduced a...


In [5]:
# (rows, columns) of the abstracts frame after `id` was dropped.
input_df.shape

(86209, 1)

In [6]:
# Load the category labels and discard the `id` column in one chained step.
output_df = pd.read_csv(output_filepath).drop(columns='id')
output_df.head()

Unnamed: 0,category
0,cs
1,stat
2,physics
3,cs
4,math


In [7]:
# The row count should equal that of `input_df`: the two frames are joined
# row by row in the next step.
output_df.shape

(86209, 1)

Merge both DataFrames side by side (row *i* of the abstracts is paired with row *i* of the categories).

In [8]:
# The two frames are paired by position (both carry the default integer index
# produced by read_csv), so they must have the same number of rows; otherwise
# `concat` would silently pad the shorter one with NaN.
assert len(input_df) == len(output_df), (
    f'row count mismatch: {len(input_df)} abstracts vs {len(output_df)} categories'
)
# NOTE(review): this also assumes both CSV files list the papers in the same
# order. The `id` columns were dropped at load time, so that is not verified here.
df = pd.concat([input_df, output_df], axis=1)
df.head()

Unnamed: 0,abstract,category
0,"turing machines and g\""odel numbers are import...",cs
1,rna-sequencing has revolutionized biomedical r...,stat
2,queuing models provide insight into the tempor...,physics
3,"in a multiple-object auction, every bidder tri...",cs
4,in arxiv:1109.6438v1 [math.ag] we introduced a...,math


## Sample data for each category

In [9]:
# Seed passed to the per-category sampling below so the drawn subset is reproducible.
random_state = 3

In [10]:
# Stratified draw: every category contributes the same number of abstracts
# (500), selected with the fixed seed defined above.
grouped_df = df.groupby(by='category')
grouped_sampled_df = grouped_df.sample(
    n=500,
    random_state=random_state,
)

In [11]:
# Discard the original row labels so the sampled rows are renumbered from 0;
# these new positions are used as the document ids further down.
sampled_df = grouped_sampled_df.reset_index(drop=True)
sampled_df.head()

Unnamed: 0,abstract,category
0,"in this work, having in mind the construction ...",cs
1,optimal selection of interdependent it project...,cs
2,sorting is one of the classic problems of comp...,cs
3,dependency analysis is a technique to identify...,cs
4,the module theorem by janhunen et al. demonstr...,cs


## Convert the data to the `TrainingCorpus` format

In [12]:
# Container for the corpus fields; it is filled in below and then dumped to JSON.
data_dict = dict()

### Define the `docs` field

In [13]:
# Document ids are simply the row positions of the sampled frame.
data_dict['docs'] = list(sampled_df.index)

### Define the `texts` field

In [14]:
# One lowercased abstract per document, in the same order as `docs`.
data_dict['texts'] = list(sampled_df['abstract'])

### Define the `tokens` field

In [15]:
import sys

# Make the project's source directory importable from this notebook.
# The membership test keeps the cell idempotent: re-running it does not
# add duplicate entries to sys.path.
src_path = os.path.join(root_dir, src_dir)
if src_path not in sys.path:
    sys.path.append(src_path)

In [16]:
from training import TrainingCorpus

In [17]:
# Tokenise every abstract with the project's tokenizer; the result is one
# token list per document, aligned with `texts`.
data_dict['tokens'] = [
    TrainingCorpus.tokenize(abstract_text)
    for abstract_text in sampled_df['abstract']
]

### Define the `labels` field

In [18]:
# Alphabetically ordered list of the distinct categories present in the sample.
data_dict['labels'] = sorted(set(sampled_df['category']))

### Define the `target` field

In [19]:
# Map each document id to a one-element list holding its category.
# NOTE(review): the integer keys become strings once written with json.dump;
# confirm that TrainingCorpus expects that.
data_dict['target'] = {
    doc_id: [category]
    for doc_id, category in sampled_df['category'].items()
}

---

## Save to JSON file

In [20]:
import json

In [21]:
# The corpus file is written directly under `data_dir`, i.e. one level above
# the `papers` folder that holds the source CSV files.
dataset_filename = 'abstracts_corpus.json'
dataset_filepath = os.path.join(root_dir, data_dir, dataset_filename)

In [22]:
# Serialise the assembled corpus dictionary to disk as a single JSON document.
with open(dataset_filepath, mode='w') as json_file:
    json.dump(data_dict, json_file)

---

## Open the dataset as an instance of the `TrainingCorpus` class

In [23]:
# Round-trip check: read the JSON file written above back through the
# project's own loader.
corpus = TrainingCorpus()
corpus.load(dataset_filepath)

In [24]:
# Stored text of the first document.
corpus.get_text(0)

'in this work, having in mind the construction of concurrent systems from components, we discuss the difference between actions and events. for this discussion, we propose an(other) architecture description language in which actions and events are made explicit in the description of a component and a system. our work builds from the ideas set forth by the categorical approach to the construction of software based systems from components advocated by goguen and burstall, in the context of institutions, and by fiadeiro and maibaum, in the context of temporal logic. in this context, we formalize a notion of a component as an element of an indexed category and we elicit a notion of a morphism between components as morphisms of this category. moreover, we elaborate on how this formalization captures, in a convenient manner, the underlying structure of a component and the basic interaction mechanisms for putting components together. further, we advance some ideas on how certain matters relat

In [25]:
# Token list stored for the same document.
corpus.get_tokens(0)

['work',
 'mind',
 'construction',
 'concurrent',
 'systems',
 'components',
 'discuss',
 'difference',
 'actions',
 'events',
 'discussion',
 'propose',
 'architecture',
 'description',
 'language',
 'actions',
 'events',
 'made',
 'explicit',
 'description',
 'component',
 'system',
 'work',
 'builds',
 'ideas',
 'set',
 'forth',
 'categorical',
 'approach',
 'construction',
 'software',
 'based',
 'systems',
 'components',
 'advocated',
 'goguen',
 'burstall',
 'context',
 'institutions',
 'fiadeiro',
 'maibaum',
 'context',
 'temporal',
 'logic',
 'context',
 'formalize',
 'notion',
 'component',
 'element',
 'indexed',
 'category',
 'elicit',
 'notion',
 'morphism',
 'components',
 'morphisms',
 'category',
 'moreover',
 'elaborate',
 'formalization',
 'captures',
 'convenient',
 'manner',
 'underlying',
 'structure',
 'component',
 'basic',
 'interaction',
 'mechanisms',
 'putting',
 'components',
 'together',
 'advance',
 'ideas',
 'certain',
 'matters',
 'related',
 'openness',

---

## Compute noun chunks

In [26]:
# The noun chunks are stored in `data_dir`, next to the corpus JSON file.
chunks_filename = 'abstracts_chunks.json'
chunks_filepath = os.path.join(root_dir, data_dir, chunks_filename)

In [27]:
# Display the resolved path as a quick sanity check.
chunks_filepath

'../../data/corpus/abstracts_chunks.json'

In [28]:
# Run chunk detection over the whole corpus. In the recorded run this took
# about 48 s for the 2000 sampled abstracts.
corpus.detect_chunks()

100%|██████████| 2000/2000 [00:48<00:00, 40.94it/s]


In [29]:
# Persist the detected chunks to `chunks_filepath`.
corpus.save_chunks(chunks_filepath)

---

## Load chunks

In [30]:
# Read the chunks back from the file written above.
corpus.load_chunks(chunks_filepath)

In [31]:
# Peek at the first three (chunk, value) pairs.
# NOTE(review): the integer values look like occurrence counts; confirm in TrainingCorpus.
list(corpus.noun_chunks.items())[:3]

[('work', 113), ('mind', 3), ('construction', 47)]

In [32]:
# First document with multi-word chunks joined by underscores (see the output below).
# NOTE(review): `threshold=0` presumably disables any frequency filtering;
# confirm against TrainingCorpus.get_chunk_document.
corpus.get_chunk_document(0, threshold=0)

['work',
 'mind',
 'construction',
 'concurrent_systems',
 'components',
 'discuss',
 'difference',
 'actions',
 'events',
 'discussion',
 'propose',
 'architecture_description_language',
 'actions',
 'events',
 'made',
 'explicit',
 'description',
 'component',
 'system',
 'work',
 'builds',
 'ideas',
 'set',
 'forth',
 'categorical_approach',
 'construction',
 'software_based_systems',
 'components',
 'advocated',
 'goguen',
 'burstall',
 'context',
 'institutions',
 'fiadeiro',
 'maibaum',
 'context',
 'temporal_logic',
 'context',
 'formalize',
 'notion',
 'component',
 'element',
 'indexed_category',
 'elicit',
 'notion',
 'morphism',
 'components',
 'morphisms',
 'category',
 'moreover',
 'elaborate',
 'formalization',
 'captures',
 'convenient_manner',
 'underlying_structure',
 'component',
 'basic_interaction_mechanisms',
 'putting',
 'components',
 'together',
 'advance',
 'ideas',
 'certain_matters',
 'related',
 'openness',
 'compositionality',
 'component/system',
 'may',
 

In [33]:
# Show the text of the first document again, for comparison with the
# chunked version above.
corpus.get_text(0)

'in this work, having in mind the construction of concurrent systems from components, we discuss the difference between actions and events. for this discussion, we propose an(other) architecture description language in which actions and events are made explicit in the description of a component and a system. our work builds from the ideas set forth by the categorical approach to the construction of software based systems from components advocated by goguen and burstall, in the context of institutions, and by fiadeiro and maibaum, in the context of temporal logic. in this context, we formalize a notion of a component as an element of an indexed category and we elicit a notion of a morphism between components as morphisms of this category. moreover, we elaborate on how this formalization captures, in a convenient manner, the underlying structure of a component and the basic interaction mechanisms for putting components together. further, we advance some ideas on how certain matters relat

In [34]:
# Plain tokens of the same document, for comparison with the chunked version above.
corpus.get_tokens(0)

['work',
 'mind',
 'construction',
 'concurrent',
 'systems',
 'components',
 'discuss',
 'difference',
 'actions',
 'events',
 'discussion',
 'propose',
 'architecture',
 'description',
 'language',
 'actions',
 'events',
 'made',
 'explicit',
 'description',
 'component',
 'system',
 'work',
 'builds',
 'ideas',
 'set',
 'forth',
 'categorical',
 'approach',
 'construction',
 'software',
 'based',
 'systems',
 'components',
 'advocated',
 'goguen',
 'burstall',
 'context',
 'institutions',
 'fiadeiro',
 'maibaum',
 'context',
 'temporal',
 'logic',
 'context',
 'formalize',
 'notion',
 'component',
 'element',
 'indexed',
 'category',
 'elicit',
 'notion',
 'morphism',
 'components',
 'morphisms',
 'category',
 'moreover',
 'elaborate',
 'formalization',
 'captures',
 'convenient',
 'manner',
 'underlying',
 'structure',
 'component',
 'basic',
 'interaction',
 'mechanisms',
 'putting',
 'components',
 'together',
 'advance',
 'ideas',
 'certain',
 'matters',
 'related',
 'openness',

---