# Preprocess Data
This notebook contains code to parse raw Python files into function and docstring pairs, tokenize both the function and the docstring, and split these pairs into train, valid and test sets.

In [None]:
import ast
import glob
import re

import astor
from nltk.tokenize import RegexpTokenizer
import pandas as pd
from sklearn.model_selection import train_test_split
import spacy
from tqdm import tqdm

# Load the spaCy English pipeline once at start-up; tokenize_docstring only
# uses its tokenizer (EN.tokenizer).
EN = spacy.load('en')

## Download raw python files

## Read raw python files

In [None]:
# Load every gzipped JSON shard matching the pattern into its own frame
# (with a progress bar), then concatenate the shards into one table.
files = glob.glob('python_files_*.json.gz')
dfs = list(map(pd.read_json, tqdm(files)))
df = pd.concat(dfs)

# Rename the three columns; `nwo` is later used as the repository part of the
# GitHub URL, `path` as the file path and `content` as the source to parse.
df.columns = ['nwo', 'path', 'content']
df.head()

In [None]:
# Sanity check: (rows, columns) of the concatenated raw-file table.
df.shape

## Functions to parse and generate pairs and tokenize

In [None]:
def tokenize_docstring(text):
    """Tokenize `text` with the spaCy tokenizer held in `EN`.

    Returns the lower-cased text of every token, skipping tokens that spaCy
    marks as whitespace (`token.is_space`).
    """
    lowered = []
    for tok in EN.tokenizer(text):
        if tok.is_space:
            continue
        lowered.append(tok.text.lower())
    return lowered


# Compiled once: tokenize_code runs for every extracted function, so the
# pattern should not be rebuilt on each call.
_CODE_TOKEN_RE = re.compile(r'\w+')


def tokenize_code(text):
    """Split source code into word tokens.

    A token is a maximal run of word characters (letters, digits and
    underscore); punctuation, operators and whitespace are discarded.

    Args:
        text: Source code as a string.

    Returns:
        List of tokens in order of appearance (empty list for empty input).
    """
    return _CODE_TOKEN_RE.findall(text)


def get_function_docstring_pairs(blob):
    """Extract function/docstring records from the source of one Python file.

    Only functions defined directly at module level and methods defined
    directly in module-level classes are collected; anything nested deeper
    is not visited.

    Args:
        blob: Source code of a Python file.

    Returns:
        A list of 5-tuples, one per function:
        ``(name, lineno, source, function_tokens, docstring_tokens)`` where
        ``source`` is the function regenerated by ``astor.to_source``,
        ``function_tokens`` is the space-joined code tokens of that source
        with the docstring text removed, and ``docstring_tokens`` is the
        space-joined tokens of the docstring's first paragraph ('' when the
        function has no docstring). If the file cannot be processed, the
        pairs collected before the failure are returned (usually ``[]``).
    """
    pairs = []
    try:
        module = ast.parse(blob)
        classes = [node for node in module.body if isinstance(node, ast.ClassDef)]
        functions = [node for node in module.body if isinstance(node, ast.FunctionDef)]
        for _class in classes:
            functions.extend([node for node in _class.body if isinstance(node, ast.FunctionDef)])

        for f in functions:
            source = astor.to_source(f)
            docstring = ast.get_docstring(f) or ''
            if docstring:
                # Textual removal of the raw (uncleaned) docstring from the
                # regenerated source. str.replace strips every verbatim
                # occurrence and is a no-op if the text does not appear.
                function = source.replace(ast.get_docstring(f, clean=False), '')
            else:
                function = source

            pairs.append((f.name,
                          f.lineno,
                          source,
                          ' '.join(tokenize_code(function)),
                          # Keep only the first paragraph of the docstring.
                          ' '.join(tokenize_docstring(docstring.split('\n\n')[0]))
                         ))
    except (AssertionError, MemoryError, RecursionError, SyntaxError, ValueError):
        # Best effort: skip files that cannot be parsed or regenerated instead
        # of aborting the whole run. ValueError covers sources containing null
        # bytes as well as UnicodeEncodeError (a ValueError subclass);
        # RecursionError covers pathologically nested code.
        pass
    return pairs

In [None]:
# Parse every file's content: each row gets the list of per-function tuples
# returned by get_function_docstring_pairs (an empty list when nothing could
# be extracted).
df['pairs'] = df['content'].apply(get_function_docstring_pairs)

## Process dataframe to get lineage of urls and pairs

In [None]:
# Flatten pairs: go from one row per file (holding a list of pairs) to one
# row per pair, keeping the (nwo, path) lineage of each pair.
df = (
    df.set_index(['nwo', 'path'])['pairs']
    .apply(pd.Series)   # spread each list over numbered columns
    .stack()            # fold those columns back into a single long Series
    .reset_index()
)
# '_' is the pair's position within its file; it is not used afterwards.
df.columns = ['nwo', 'path', '_', 'pair']

In [None]:
# Unpack each 5-tuple in `pair` into its own named column; the order matches
# the tuples built by get_function_docstring_pairs.
pair_fields = ['function_name', 'lineno', 'original_function', 'function_tokens', 'docstring_tokens']
for position, column in enumerate(pair_fields):
    df[column] = df['pair'].apply(lambda p, i=position: p[i])

# Link to the function's line on GitHub. Row fields are read by label:
# positional integer lookups (x[0], x[1], ...) on a label-indexed row are
# deprecated in recent pandas releases.
df['url'] = df[['nwo', 'path', 'lineno']].apply(
    lambda row: 'https://github.com/{}/blob/master/{}#L{}'.format(row['nwo'], row['path'], row['lineno']),
    axis=1)

# Keep only the final columns (drops '_' and 'pair'). The url is added before
# this selection so no column is assigned into a sliced frame.
df = df[['nwo', 'path'] + pair_fields + ['url']]
df.head()

## Dedupe

In [None]:
# Dedupe exactly identical functions: rows are duplicates when both the
# regenerated source and the code-token string match another row.
df = df.drop_duplicates(['original_function', 'function_tokens'])

## Separate functions w/o docstrings

In [None]:
# Partition the functions by whether any docstring tokens were extracted
# (an empty token string means the function has no usable docstring).
has_docstring = df['docstring_tokens'] != ''
with_docstrings = df[has_docstring]
without_docstrings = df[~has_docstring]

## Make sure that code from each repository only gets into one set
Rough assumption that each repository has its own style.

In [None]:
# Group documented functions by repository (`nwo`) so that the split below
# assigns whole repositories, never individual functions.
grouped = with_docstrings.groupby('nwo')

In [None]:
# train, valid, test splits
# list(grouped) is a list of (nwo, DataFrame) pairs, so the split is made over
# repositories and all functions of a repository land in the same set.
# First 99% of the repositories are kept for train+valid and the rest become
# test; then 95% of those are kept for train and the rest become valid.
# NOTE(review): no random_state is passed, so the split differs between runs.
train, test = train_test_split(list(grouped), train_size=0.99, shuffle=True)
train, valid = train_test_split(train, train_size=0.95)

In [None]:
# Each split is currently a list of (nwo, DataFrame) pairs; drop the group
# keys, glue the per-repository frames together and renumber the rows.
train, valid, test = (
    pd.concat([frame for _, frame in split]).reset_index(drop=True)
    for split in (train, valid, test)
)
train.head()

## Output each set to train/valid/test `.function`, `.docstring` and `.lineage` files
Original functions are also written to compressed JSON files. (Raw functions contain `,`, `\t`, `\n`, etc., so the JSON format is less error-prone for them.)

In [None]:
def write_to(df, filename):
    """Write one data set to its output files, one sample per row.

    Files written (all prefixed with `filename`):
      * ``<filename>.function``  - code tokens
      * ``<filename>_original_function.json.gz`` - original function sources
        as a gzip-compressed JSON array
      * ``<filename>.docstring`` - docstring tokens (skipped when `filename`
        is exactly 'without_docstrings')
      * ``<filename>.lineage``   - source URL of each function

    Args:
        df: DataFrame with `function_tokens`, `original_function`,
            `docstring_tokens` and `url` columns.
        filename: Prefix of the output files.
    """
    # header=False is passed explicitly: without it, newer pandas versions
    # write the column name as the first line, which would show up as a bogus
    # extra sample in every output file.
    df.function_tokens.to_csv('{}.function'.format(filename), index=False, header=False)
    df.original_function.to_json('{}_original_function.json.gz'.format(filename), orient='values', compression='gzip')
    if filename != 'without_docstrings':
        df.docstring_tokens.to_csv('{}.docstring'.format(filename), index=False, header=False)
    df.url.to_csv('{}.lineage'.format(filename), index=False, header=False)

In [None]:
# Write every data set to its output files; the name doubles as the file
# prefix passed to write_to.
for name, frame in [('train', train),
                    ('valid', valid),
                    ('test', test),
                    ('without_docstrings', without_docstrings)]:
    write_to(frame, name)