# Build the dataset

## Define filepaths

In [1]:
import os

In [2]:
# Directory layout as relative paths; they are only joined here and get
# resolved against the kernel's working directory when a file is opened.
root_dir, data_dir, src_dir = '../..', 'data/corpus', 'src'

# Input CSV file located inside the corpus directory.
data_filename = 'stack-overflow-data.csv'
data_filepath = os.path.join(root_dir, data_dir, data_filename)

## Load the data

In [3]:
import pandas as pd

In [4]:
# Read the raw posts and discard every row whose 'tags' value is missing;
# the tag column is what the notebook later groups and samples on.
df = pd.read_csv(data_filepath).dropna(subset=['tags'])
df.head()

Unnamed: 0,post,tags
0,what is causing this behavior in our c# datet...,c#
1,have dynamic html load as if it was in an ifra...,asp.net
2,how to convert a float value in to min:sec i ...,objective-c
3,.net framework 4 redistributable just wonderi...,.net
4,trying to calculate and print the mean and its...,python


In [5]:
df.shape

(6013, 2)

In [6]:
df.tags.unique()

array(['c#', 'asp.net', 'objective-c', '.net', 'python', 'angularjs',
       'iphone', 'ruby-on-rails', 'ios', 'c', 'sql', 'java', 'jquery',
       'css', 'c++', 'php', 'android', 'mysql', 'javascript', 'html'],
      dtype=object)

In [7]:
df.tags.unique().shape

(20,)

## Preprocess data

In [8]:
import re
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

In [9]:
# code from https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568

# Raw string literals keep the backslash escapes (\[, \], \|) as regex syntax.
# In a plain string literal they are invalid Python escape sequences and emit
# a DeprecationWarning/SyntaxWarning on recent interpreters.

# Punctuation treated as a word separator: each match is replaced by a space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Everything except digits, lowercase ASCII letters, space, '#', '+' and '_';
# each match is deleted.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stop words from NLTK, held in a set for fast membership checks.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        Normalise one raw post into a space-separated string of kept words.

        Steps, in order: strip HTML markup with BeautifulSoup's lxml parser,
        lowercase, turn the characters matched by REPLACE_BY_SPACE_RE into
        spaces, delete the characters matched by BAD_SYMBOLS_RE, then drop
        every word found in STOPWORDS.

        text: a string, possibly containing HTML

        return: the cleaned string, with words joined by single spaces
    """
    lowered = BeautifulSoup(text, "lxml").text.lower()
    separated = REPLACE_BY_SPACE_RE.sub(' ', lowered)
    filtered = BAD_SYMBOLS_RE.sub('', separated)
    # NOTE(review): stop words are matched after symbol removal, so a stop word
    # containing a deleted character (e.g. an apostrophe) would no longer match
    # in its stripped form -- confirm this is intended.
    kept_words = [word for word in filtered.split() if word not in STOPWORDS]
    return ' '.join(kept_words)
    
# Overwrite the 'post' column with its cleaned form; the raw text is no longer
# available in df once this cell has run.
df['post'] = df['post'].apply(clean_text)

In [10]:
df.head()

Unnamed: 0,post,tags
0,causing behavior c# datetime type test public ...,c#
1,dynamic html load iframe aspnet 40 site users ...,asp.net
2,convert float value minsec trying convert seco...,objective-c
3,net framework 4 redistributable wondering get ...,.net
4,trying calculate print mean returning rather n...,python


## Sample data for each tag

In [11]:
# Seed handed to the per-tag sampling step so the drawn subset is repeatable.
random_state = 3

In [12]:
# Number of posts drawn from each tag. sample() is called without replacement
# (its default), so it raises a ValueError if any tag has fewer posts than this.
n_per_tag = 150

# Draw the same number of posts from every tag so each class is equally sized.
grouped_df = df.groupby('tags')
grouped_sampled_df = grouped_df.sample(n=n_per_tag, random_state=random_state)

In [13]:
# Discard the original row labels so the sampled rows are indexed 0..n-1;
# this fresh index is what later becomes the document ids.
sampled_df = grouped_sampled_df.reset_index(drop=True)
sampled_df.head()

Unnamed: 0,post,tags
0,truncate string net given string projects mult...,.net
1,net windows application automatically call pro...,.net
2,c# convert string unique id let assume string ...,.net
3,consolewriteline output webservice go consolew...,.net
4,net applications immune classic pointer errors...,.net


In [14]:
sampled_df.shape

(3000, 2)

## Convert the data to the `TrainingCorpus` format

In [15]:
# Container for the corpus fields that are filled in below and then dumped
# to a JSON file.
data_dict = {}

### Define the `docs` field

In [16]:
# Document ids are the integer row labels of the sampled frame.
data_dict['docs'] = sampled_df.index.tolist()

### Define the `texts` field

In [17]:
# Cleaned post text, one entry per row and in row order.
data_dict['texts'] = sampled_df['post'].tolist()

### Define the `tokens` field

In [18]:
import sys

# Make the project's source directory importable. The membership check keeps
# sys.path free of duplicate entries when this cell is run more than once.
src_path = os.path.join(root_dir, src_dir)
if src_path not in sys.path:
    sys.path.append(src_path)

In [19]:
from training import TrainingCorpus

In [20]:
# Tokenise every cleaned post with TrainingCorpus.tokenize, keeping row order.
data_dict['tokens'] = [
    TrainingCorpus.tokenize(post) for post in sampled_df['post']
]

### Define the `labels` field

In [21]:
# Distinct tags present in the sample, in sorted order.
data_dict['labels'] = sorted(sampled_df['tags'].unique().tolist())

### Define the `target` field

In [22]:
# Map each row label to a one-element list holding that row's tag.
# NOTE(review): the keys here are ints, and json.dump writes dict keys as
# strings, so after a save/load round trip they will not compare equal to the
# int ids stored under 'docs' -- confirm TrainingCorpus.load accounts for this.
data_dict['target'] = sampled_df['tags'].map(lambda x: [x]).to_dict()

---

## Save to JSON file

In [23]:
import json

In [24]:
# Destination of the serialised corpus, in the same directory as the input CSV.
dataset_filename = 'stackoverflow_corpus.json'
dataset_filepath = os.path.join(root_dir, data_dir, dataset_filename)

In [25]:
# Serialise the assembled corpus fields to disk as a single JSON document.
with open(dataset_filepath, mode='w') as fd:
    json.dump(obj=data_dict, fp=fd)

---

## Open the dataset as an instance of the `TrainingCorpus` class

In [26]:
# Open the JSON file written above as a TrainingCorpus instance.
corpus = TrainingCorpus()
corpus.load(dataset_filepath)

In [27]:
corpus.get_text(0)

'truncate string net given string projects multiply_amada multiplyweb shared homeaspx want remove trailing characters third result projects multiply_amada would like without using split charindex'

In [28]:
corpus.get_tokens(0)

['truncate',
 'string',
 'net',
 'given',
 'string',
 'projects',
 'multiply_amada',
 'multiplyweb',
 'shared',
 'homeaspx',
 'want',
 'remove',
 'trailing',
 'characters',
 'third',
 'result',
 'projects',
 'multiply_amada',
 'would',
 'like',
 'without',
 'using',
 'split',
 'charindex']

---

## Compute noun chunks

In [29]:
# File for the noun chunks, placed in the same corpus directory as the dataset.
chunks_filename = 'stackoverflow_chunks.json'
chunks_filepath = os.path.join(root_dir, data_dir, chunks_filename)

In [30]:
chunks_filepath

'../../data/corpus/stackoverflow_chunks.json'

In [31]:
# Slow step: the recorded run took about 56 s for the 3000 documents.
corpus.detect_chunks()

100%|██████████| 3000/3000 [00:56<00:00, 52.68it/s]


In [32]:
# Presumably writes the detected chunks to chunks_filepath -- behaviour is
# defined in TrainingCorpus, which is not visible here.
corpus.save_chunks(chunks_filepath)

---

## Load chunks

In [33]:
# Reads the chunks back from the same path that was passed to save_chunks.
# NOTE(review): presumably this lets later runs skip detect_chunks -- confirm.
corpus.load_chunks(chunks_filepath)

In [34]:
list(corpus.noun_chunks.items())[:3]

[('string_projects_multiply_amada_multiplyweb', 1),
 ('characters', 10),
 ('third_result_projects', 1)]

In [35]:
corpus.get_chunk_document(0, threshold=0)

['truncate',
 'string',
 'net',
 'given',
 'string',
 'projects',
 'multiply_amada',
 'multiplyweb',
 'shared',
 'homeaspx',
 'want',
 'remove',
 'trailing',
 'characters',
 'third_result_projects',
 'multiply_amada',
 'would',
 'like',
 'without',
 'using',
 'split_charindex']

In [36]:
corpus.get_text(0)

'truncate string net given string projects multiply_amada multiplyweb shared homeaspx want remove trailing characters third result projects multiply_amada would like without using split charindex'

In [37]:
corpus.get_tokens(0)

['truncate',
 'string',
 'net',
 'given',
 'string',
 'projects',
 'multiply_amada',
 'multiplyweb',
 'shared',
 'homeaspx',
 'want',
 'remove',
 'trailing',
 'characters',
 'third',
 'result',
 'projects',
 'multiply_amada',
 'would',
 'like',
 'without',
 'using',
 'split',
 'charindex']

---