In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from ktext.preprocess import processor
import dill as dpickle
import numpy as np
import h5py
import json

In [7]:
df = pd.concat([pd.read_csv(f'https://storage.googleapis.com/codenet/issue_labels/00000000000{i}.csv.gz')
                for i in range(1)])

#split data into train/test
traindf, testdf = train_test_split(df, test_size=.15, random_state=0)

In [11]:
# Descriptive Statistics
print(f'Train: {traindf.shape[0]:,} rows {traindf.shape[1]:,} columns')
print(f'Test: {testdf.shape[0]:,} rows {testdf.shape[1]:,} columns')
traindf.groupby('class_int').size()
print(f' Avg # of issues per repo: {len(traindf) / traindf.repo.nunique():.1f}')
print(f" Avg # of issues per org: {len(traindf) / traindf.repo.apply(lambda x: x.split('/')[-1]).nunique():.1f}")
pareto_df = pd.DataFrame({'pcnt': df.groupby('repo').size() / len(df), 'count': df.groupby('repo').size()})
print(pareto_df.sort_values('pcnt', ascending=False).head(20))

Train: 270,624 rows 10 columns
Test: 47,758 rows 10 columns
 Avg # of issues per repo: 2.6
 Avg # of issues per org: 2.8
                                pcnt  count
repo                                       
Microsoft/vscode            0.005145   1638
rancher/rancher             0.002349    748
MicrosoftDocs/azure-docs    0.002060    656
godotengine/godot           0.001894    603
ansible/ansible             0.001866    594
hashicorp/terraform         0.001624    517
kubernetes/kubernetes       0.001504    479
lionheart/openradar-mirror  0.001432    456
dart-lang/sdk               0.001159    369
elastic/kibana              0.001156    368
eclipse/che                 0.001150    366
dotnet/corefx               0.001146    365
magento/magento2            0.001040    331
brave/browser-laptop        0.001027    327
kbower/tickettest1          0.000974    310
Kademi/kademi-dev           0.000832    265
eslint/eslint               0.000801    255
owncloud/core               0.000782    249

In [16]:
# Clean, tokenize, and apply padding / truncating such that each document length = 75th percentile for the dataset.
#  also, retain only the top keep_n words in the vocabulary and set the remaining words
#  to 1 which will become common index for rare words 

train_body_raw = traindf.body.tolist()
train_title_raw = traindf.title.tolist()

# process the issue body data
body_pp = processor(.75, keep_n=8000)
train_body_vecs = body_pp.fit_transform(train_body_raw)

# process the title data
title_pp = processor(.75, keep_n=4500)
train_title_vecs = title_pp.fit_transform(train_title_raw)

 See full histogram by insepecting the `document_length_stats` attribute.
 See full histogram by insepecting the `document_length_stats` attribute.


In [29]:
# apply transformations to test data
test_body_raw = testdf.body.tolist()
test_title_raw = testdf.title.tolist()

test_body_vecs = body_pp.transform_parallel(test_body_raw)
test_title_vecs = title_pp.transform_parallel(test_title_raw)


# extract labels
train_labels = np.expand_dims(traindf.class_int.values, -1)
test_labels = np.expand_dims(testdf.class_int.values, -1)
num_classes = len(set(train_labels[:, 0]))

# Check shapes
# the number of rows in data for the body, title and labels should be the same for both train and test partitions
assert train_body_vecs.shape[0] == train_title_vecs.shape[0] == train_labels.shape[0]
assert test_body_vecs.shape[0] == test_title_vecs.shape[0] == test_labels.shape[0]
assert num_classes == 3



In [38]:
f = h5py.File('data/dataset.hdf5', 'w')
f.create_dataset('/titles', data=train_title_vecs)
f.create_dataset('/bodies', data=train_body_vecs)
f.create_dataset('/targets', data=train_labels)

f.create_dataset('/test_titles', data=test_title_vecs)
f.create_dataset('/test_bodies', data=test_body_vecs)
f.create_dataset('/test_targets', data=test_labels)
f.close()


with open("data/metadata.json", "w") as f:
    meta = {
        'body_vocab_size': body_pp.n_tokens,
        'title_vocab_size': title_pp.n_tokens,
        'issue_body_doc_length': train_body_vecs.shape[1],
        'issue_title_doc_length': train_title_vecs.shape[1],
        'num_classes': num_classes,
    }
    f.write(json.dumps(meta))
    
# Save the preprocessor
with open('data/body_pp.dpkl', 'wb') as f:
    dpickle.dump(body_pp, f)

with open('data/title_pp.dpkl', 'wb') as f:
    dpickle.dump(title_pp, f)

In [42]:
!ls -lah data/

total 360960
drwxr-xr-x   6 khulnasoft  staff   192B Oct  9 11:59 [1m[36m.[m[m
drwxr-xr-x  11 khulnasoft  staff   352B Oct  9 12:02 [1m[36m..[m[m
-rw-r--r--   1 khulnasoft  staff    35M Oct  9 12:02 body_pp.dpkl
-rw-r--r--   1 khulnasoft  staff   124M Oct  9 12:02 dataset.hdf5
-rw-r--r--   1 khulnasoft  staff   128B Oct  9 12:02 metadata.json
-rw-r--r--   1 khulnasoft  staff   4.1M Oct  9 12:02 title_pp.dpkl
