### Import everything
reference: https://towardsdatascience.com/a-complete-exploratory-data-analysis-and-visualization-for-text-data-29fb1b96fb6a

In [1]:
import pandas as pd
import numpy as np
import sentencepiece as spm
import os
import collections
from itertools import islice
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import cufflinks
from plotly.offline import iplot
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')

### Load the pickled dataframes
Change the `LANGUAGE` variable based on the language for the exploratory analysis

In [2]:
LANGUAGE_PATH='../../data/dataframes'
LANGUAGE = 'cpp'
path = os.path.join(LANGUAGE_PATH, LANGUAGE)
train_path = os.path.join(path, str(LANGUAGE + '_' +'train.pkl'))
test_path = os.path.join(path, str(LANGUAGE + '_' +'test.pkl'))
validation_path = os.path.join(path, str(LANGUAGE + '_' +'validate.pkl'))

train_df = pd.read_pickle(train_path, compression='gzip')
test_df = pd.read_pickle(test_path, compression='gzip')
validate_df = pd.read_pickle(validation_path, compression='gzip')

FileNotFoundError: [Errno 2] No such file or directory: '../../data/dataframes/cpp/cpp_train.pkl'

### Add new features
Add the length of each file in characters and in number of words. Number of words is split on whitespace characters. #TODO is this a valid assumption? 

In [24]:
train_df['code_len'] = train_df['file_contents'].astype(str).apply(len)
test_df['code_len'] = test_df['file_contents'].astype(str).apply(len)
validate_df['code_len'] = validate_df['file_contents'].astype(str).apply(len)

In [25]:
train_df['word_count'] = train_df['file_contents'].apply(lambda x: len(str(x).split()))
test_df['word_count'] = test_df['file_contents'].apply(lambda x: len(str(x).split()))
validate_df['word_count'] = validate_df['file_contents'].apply(lambda x: len(str(x).split()))

### Plot distribution of lengths in words and characters

In [26]:
train_df['code_len'].iplot(
    kind='box',
    xTitle='Characters',
    linecolor='black',
    yTitle='count',
    title='Character Count Distribution')

In [27]:
test_df['code_len'].iplot(
    kind='box',
    xTitle='Characters',
    linecolor='black',
    yTitle='count',
    title='Character Count Distribution')

In [28]:
validate_df['code_len'].iplot(
    kind='box',
    xTitle='Characters',
    linecolor='black',
    yTitle='count',
    title='Character Count Distribution')

In [29]:
train_df['word_count'].iplot(
    kind='box',
    xTitle='Words',
    linecolor='black',
    yTitle='count',
    title='Word Count Distribution')

In [30]:
test_df['word_count'].iplot(
    kind='box',
    xTitle='Words',
    linecolor='black',
    yTitle='count',
    title='Word Count Distribution')

In [31]:
validate_df['word_count'].iplot(
    kind='box',
    xTitle='Words',
    linecolor='black',
    yTitle='count',
    title='Word Count Distribution')

### Get top most common tokens
These tokens are split on spaces, which may not be a reasonable assumption; #TODO Ask David

In [34]:
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [35]:
top_tokens = get_top_n_words(train_df['file_contents'], 20)
for token, freq in top_tokens:
    print(token, freq)

if 497765
the 372218
return 323472
double 262832
const 224779
int 221002
this 170313
of 142401
void 139128
for 123549
to 121337
is 105787
static 104813
else 101919
in 96963
include 95466
0x0 86624
and 79726
false 76155
case 71411


In [36]:
df1 = pd.DataFrame(top_tokens, columns = ['file_contents' , 'count'])
df1.groupby('file_contents').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 words in review before removing stop words')

### Bigrams and Trigrams
I don't know if we care but here it is #TODO: Do we care?

In [24]:
def get_top_n_bigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(2, 2)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]


def get_top_n_trigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(3, 3)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [25]:
top_bigrams = get_top_n_bigram(train_df['file_contents'], 20)
for token, freq in top_bigrams:
    print(token, freq)

top_trigrams = get_top_n_trigram(train_df['file_contents'], 20)
for token, freq in top_trigrams:
    print(token, freq) 

double double 79189
static double 74592
double return 74396
of the 54364
0x0 0x0 50225
else if 35861
0x0000 0x0000 34470
break case 29765
in the 28585
this file 25999
true_ true_ 25491
const char 24870
return false 23568
to the 18830
for int 18465
general public 17291
return true 16745
const qstring 16661
of type 15455
const return 15417
double double return 74251
0x0 0x0 0x0 32400
0x0000 0x0000 0x0000 27226
true_ true_ true_ 25487
gnu general public 15413
general public license 14678
the gnu general 12796
argument of type 11515
of the gnu 10921
0x00 0x00 0x00 9947
this file is 9850
return static double 9667
double return static 9662
the terms of 9368
under the terms 9314
oo oo oo 8751
free software foundation 8554
the free software 8537
fitness for particular 7794
for particular purpose 7794


### BPE with SentencePiece
Step one is constructing one large training corpus from the training dataset

In [9]:
def construct_massive_file(path, df):
    fileAll = "fileAll" + LANGUAGE + ".txt"
    path = os.path.join(path, fileAll)
    with open(fileAll, "w+", encoding='UTF-8') as fp:
        for file in df.file_contents:
            fp.write(file+"\n")

In [12]:
COMBINED_PATH='../../data/combined_files'
construct_massive_file(COMBINED_PATH, train_df)

/Users/megretson/Projects/SE/Artificial-Code-Gen/main/src/preparation


#### Train the encoder

In [None]:
spm.SentencePieceTrainer.Train(f'--input=../../data/combined_files/fileAllcpp.txt \
                               --model_prefix={LANGUAGE} --vocab_size=1000 --model_type=bpe')

In [3]:
sp = spm.SentencePieceProcessor()
sp.Load("m.model")
freq = {}

#### Get the top tokens

In [8]:
random_subset = train_df.sample(n=1000)

for file in random_subset.file_contents:
    encoding = sp.encode_as_pieces(file)
    for piece in encoding:
        freq.setdefault(piece, 0)
        freq[piece] += 1

sorted_freq = sorted(freq.items() ,  key=lambda x: x[1])

In [12]:
sorted_freq_de = sorted(freq.items(), key=lambda x: x[1], reverse=True)
sorted_dict_in = collections.OrderedDict(sorted_freq)
sorted_dict_de = collections.OrderedDict(sorted_freq_de)

In [15]:
def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))

most_freq_20 = take(20, sorted_dict_de.items())
least_freq_20 = take(20, sorted_dict_in.items())

In [21]:
for piece in most_freq_20:
    print(sp.decode_pieces([piece[0]]))

_
(
,
s
p
.
=
)
;
{
}
(
->
c
x
);
f
m
C



In [22]:
for piece in least_freq_20:
    print(sp.decode_pieces([piece[0]]))

ベクトル
行列
を表示
テスト用のクラス
コンストラクタ
デストラクタ
プロトタイプ宣言
インスタンスを作ってもらう
から戻った
からメンバをアクセス
工場
を呼び出した
インスタンスを生成
生成したインスタンスを返す
Ȥ
Υ
饹
Υƥ
̾
֤


In [38]:
df1 = pd.DataFrame(most_freq_20, columns = ['token' , 'count'])
df1.groupby('token').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 tokens after BPE encoding')

In [39]:
df2 = pd.DataFrame(least_freq_20, columns = ['token' , 'count'])
df2.groupby('token').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 tokens after BPE encoding')