# Exploration of the Data Provided by Zhang et al. (2019)

## Source code classification (Open Judge)

In [97]:
## Import libraries

import pandas as pd
import numpy as np

In [6]:
# Load the programs pickle into a DataFrame.
# NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source.
mydf = pd.read_pickle('astnn/data/programs.pkl')

In [3]:
mydf.head()

Unnamed: 0,0,1,2
0,0,"int main()\n{\n\tint a;\n\tint bai,wushi,ershi...",97
1,1,"int main()\n{\n int m,x100,x50,x20,x10,x5,x...",97
2,2,"int main()\n{\n int n,i,shuzu[111],count1=0...",97
3,3,"int main()\n{\n\tint n,a1=0,a2=0,a3=0,a4=0,a5=...",97
4,4,"int main()\n{\n\tint n,a,b,c,d,e,f;\n\ta=0;b=0...",97


In [4]:
type(mydf.iloc[:,1])

pandas.core.series.Series

In [5]:
mydf.iloc[:,0]

0            0
1            1
2            2
3            3
4            4
         ...  
51996    51996
51997    51997
51998    51998
51999    51999
52000    52000
Name: 0, Length: 52001, dtype: int64

Check the structure of a single sample (the code stored in one row)

In [6]:
mydf.iloc[0,1]

'int main()\n{\n\tint a;\n\tint bai,wushi,ershi,shi,wu,yi;\n\tcin>>a;\n\tbai=a/100;\n\ta=a%100;\n\twushi=a/50;\n\ta=a%50;\n\tershi=a/20;\n\ta=a%20;\n\tshi=a/10;\n\ta=a%10;\n\twu=a/5;\n\ta=a%5;\n\tyi=a;\n\tcout<<bai<<endl;\n\tcout<<wushi<<endl;\n\tcout<<ershi<<endl;\n\tcout<<shi<<endl;\n\tcout<<wu<<endl;\n\tcout<<yi<<endl;\n\treturn 0;\n}'

So the data provided is one function per row.

How many labels? 

In [7]:
# Number of distinct labels in the third column.
mydf.iloc[:, 2].nunique()

104

Distribution of data across labels:

In [8]:
mydf.iloc[:,2].value_counts()

1     501
95    500
10    500
39    500
71    500
     ... 
53    500
85    500
22    500
54    500
32    500
Name: 2, Length: 104, dtype: int64

In [9]:
mydf.iloc[:,2].value_counts().values

array([501, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500,
       500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500,
       500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500,
       500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500,
       500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500,
       500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500,
       500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500,
       500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500])

The data is evenly distributed: every label has 500 observations, except for one label that has 501.

### Question: Can the software vulnerability detection dataset from Russell et al. (2018) fit into the structure of this dataset?

In [10]:
val = pd.read_pickle('data/newval.pickle')

In [11]:
val.head()

Unnamed: 0,functionSource,CWE-119,CWE-120,CWE-469,CWE-476,CWE-other,combine
0,gwy_resource_class_mkdir(GwyResourceClass *kla...,False,False,False,False,False,False
1,SetIgnoredFields( const char **papszFields )\n...,False,False,False,False,False,False
2,timeoutProtoDisplays(void)\n{\n struct prot...,False,False,False,False,False,False
3,"SelectViewsOf (GraphicComp* comp, Editor* ed) ...",False,False,False,False,False,False
4,multi_get_bool(bool &value)\n{\n\tif (!Multi_s...,False,False,False,False,False,False


In [12]:
val.iloc[0,0]

'gwy_resource_class_mkdir(GwyResourceClass *klass)\n{\n    gchar *path;\n    gint ok;\n\n    g_return_val_if_fail(GWY_IS_RESOURCE_CLASS(klass), FALSE);\n\n    path = g_build_filename(gwy_get_user_dir(), klass->name, NULL);\n    if (g_file_test(path, G_FILE_TEST_IS_DIR)) {\n        g_free(path);\n        return TRUE;\n    }\n\n    ok = !g_mkdir(path, 0700);\n    g_free(path);\n\n    return ok;\n}'

* The two datasets differ in language.
* The vulnerability dataset is in C and C++.
* The source code classification dataset is in C++; however, it is clean code from OJ Bench.
* But since they use `pycparser` as their parser, it will only work easily on clean code.
* To implement this, we have to dive in and check each structure transformation that happens to create the ASTNN.

#### Convert into the same structure as the ASTNN dataset (not important for now)

In [92]:
# Keep only the function source (column 0) and the combined label (column 6).
# .copy() makes newdf an independent frame, so the column assignments in the
# following cells do not trigger SettingWithCopyWarning.
newdf = val.iloc[:,[0,6]].copy()

In [None]:
# Assign the raw index values positionally. Wrapping them in a new Series would
# re-align on newdf's index and produce NaN wherever that index is not 0..n-1.
newdf['id'] = val.index.values

In [96]:
newdf = newdf.iloc[:,[2,0,1]]

In [101]:
# Replace the boolean column by name with its integer encoding, rather than
# writing ints into the existing bool column in place through iloc.
# NOTE(review): True is mapped to 0 and False to 1 -- confirm this label polarity is intended.
newdf['combine'] = newdf['combine'].map({True:0, False:1})

In [102]:
newdf

Unnamed: 0,id,functionSource,combine
0,0,gwy_resource_class_mkdir(GwyResourceClass *kla...,1
1,1,SetIgnoredFields( const char **papszFields )\n...,1
2,2,timeoutProtoDisplays(void)\n{\n struct prot...,1
3,3,"SelectViewsOf (GraphicComp* comp, Editor* ed) ...",1
4,4,multi_get_bool(bool &value)\n{\n\tif (!Multi_s...,1
...,...,...,...
127471,127471,Java_org_gdal_gdal_gdalJNI_Band_1GetHistogram_...,0
127472,127472,rb_singleton_class(VALUE obj)\n{\n VALUE kl...,1
127473,127473,"clutter_text_set_justify (ClutterText *self,\n...",1
127474,127474,set_sensitive_animation()\n{\n\tGtkWidget *rot...,1


## Diving Deep Into the Code!  (Row/index = 4970) 

In [None]:
from pycparser import c_parser, parse_file
import re

#### The original code

In [8]:
mydf.iloc[4970,1]

"int main()\n{\n int n=0,k=0,i=0,j=0,l=0,m=0;            \n cin>>n;                     \n int a[100000];              \n for(i=0;i<=n-1;i++)         \n  cin>>a[i];\n cin>>k;                     \n while(j < n){               \n  while(a[j] == k &&j<=n-1){  \n   for(l = j; l <= n-2; l++){\n    a[l] = a[l+1];\n   }\n   n--;\n  }\n  j ++;\n }\n for(m=0;m<=n-2;m++)           \n  cout<<a[m]<<' ';\n cout<<a[m];\n return 0;\n}"

Splits the data into train,dev,test sets.  
Parse each source code using `pycparser` to get AST.  
Save it to a pickle file

#### Parsed code (AST) using `pycparser`

Checking the pickle file for training set

In [66]:
train_ast = pd.read_pickle('astnn/data/train/train_.pkl')

In [35]:
train_ast.head()

Unnamed: 0,id,code,label
46887,46887,"FileAST(ext=[FuncDef(decl=Decl(name='main',\n ...",93
4970,4970,"FileAST(ext=[FuncDef(decl=Decl(name='main',\n ...",42
50521,50521,"FileAST(ext=[Decl(name='i',\n ...",77
38406,38406,"FileAST(ext=[FuncDef(decl=Decl(name='main',\n ...",63
44230,44230,"FileAST(ext=[Decl(name='DiJiTian',\n ...",80


In [65]:
train_ast.iloc[1,1]

FileAST(ext=[FuncDef(decl=Decl(name='main',
                               quals=[
                                     ],
                               storage=[
                                       ],
                               funcspec=[
                                        ],
                               type=FuncDecl(args=None,
                                             type=TypeDecl(declname='main',
                                                           quals=[
                                                                 ],
                                                           type=IdentifierType(names=['int'
                                                                                     ]
                                                                               )
                                                           )
                                             ),
                               init=None,
                               

The parsed code is then processed, transformed into sequences and saved in a new file called `programs_ns.tsv`

#### Transformed sequences AST (parsed code)

In [9]:
# NOTE: the file carries a .tsv extension but is read here with a comma delimiter.
# The first column is unnamed in the file (see the output below); later cells
# address columns by position, so the column layout must stay as read here.
programs_ns = pd.read_csv('astnn/data/train/programs_ns.tsv',delimiter=',')

In [10]:
programs_ns.head(10)

Unnamed: 0.1,Unnamed: 0,id,code,label
0,46887,46887,,93
1,4970,4970,FileAST Decl ArrayDecl stu Struct Decl ArrayDe...,42
2,50521,50521,,77
3,38406,38406,,63
4,44230,44230,,80
5,21122,21122,FileAST FuncDef Decl FuncDecl main int Compoun...,69
6,31463,31463,,84
7,43363,43363,,88
8,16692,16692,FileAST Decl ArrayDecl a int 1000 FuncDef Decl...,7
9,9937,9937,FileAST FuncDef Decl FuncDecl main int Compoun...,8


In [24]:
# Count how often the first two columns agree row by row.
# Uses the Series method; the top-level pd.value_counts() is deprecated.
(programs_ns.iloc[:,0] == programs_ns.iloc[:,1]).value_counts()

True    31200
dtype: int64

In [32]:
programs_ns.iloc[1,2]

'FileAST Decl ArrayDecl stu Struct Decl ArrayDecl name char 20 Decl a int Decl b int Decl c1 char Decl c2 char Decl d int 100 FuncDef Decl FuncDecl main int Compound Decl n int Decl i int Decl ArrayDecl p int 100 InitList 0 Decl sum int 0 Decl max int FuncCall scanf ExprList "%d" & n For = i 0 < i n ++ i Compound FuncCall scanf ExprList "%s" StructRef ArrayRef stu i name FuncCall scanf ExprList "%d %d %c %c %d" & StructRef ArrayRef stu i a & StructRef ArrayRef stu i b & StructRef ArrayRef stu i c1 & StructRef ArrayRef stu i c2 & StructRef ArrayRef stu i d End For = i 0 < i n ++ i Compound If && > StructRef ArrayRef stu i a 80 > StructRef ArrayRef stu i d 0 = ArrayRef p i + ArrayRef p i 8000 If && > StructRef ArrayRef stu i a 85 > StructRef ArrayRef stu i b 80 = ArrayRef p i + ArrayRef p i 4000 If > StructRef ArrayRef stu i a 90 = ArrayRef p i + ArrayRef p i 2000 If && > StructRef ArrayRef stu i a 85 == StructRef ArrayRef stu i c2 \'Y\' = ArrayRef p i + ArrayRef p i 1000 If && > StructR

Then, they trained the Word2Vec model after transforming the AST into sequences

#### Word2Vec: What's in there?

Checking the embeddings of the trained word2vec model

In [17]:
from gensim.models.word2vec import Word2Vec

w2v = Word2Vec.load('astnn/data/train/embedding/node_w2v_128')

In [23]:
w2v.corpus_count

31200

In [44]:
# Token -> gensim Vocab entry mapping (gensim 3.x API).
# NOTE(review): this displays the whole vocabulary; consider showing only a slice.
w2v.wv.vocab

{'FileAST': <gensim.models.keyedvectors.Vocab at 0x7fbf8d8c2e48>,
 'FuncDef': <gensim.models.keyedvectors.Vocab at 0x7fbf8d8c2240>,
 'Decl': <gensim.models.keyedvectors.Vocab at 0x7fbf8d8c27b8>,
 'FuncDecl': <gensim.models.keyedvectors.Vocab at 0x7fbf8d8c2710>,
 'main': <gensim.models.keyedvectors.Vocab at 0x7fbf8d8c2d30>,
 'void': <gensim.models.keyedvectors.Vocab at 0x7fbf8d8c2908>,
 'Compound': <gensim.models.keyedvectors.Vocab at 0x7fbf8d8c22e8>,
 'a': <gensim.models.keyedvectors.Vocab at 0x7fbf8d8c28d0>,
 'int': <gensim.models.keyedvectors.Vocab at 0x7fbf8d8c29e8>,
 'i3': <gensim.models.keyedvectors.Vocab at 0x7fbf8d8c27f0>,
 'i5': <gensim.models.keyedvectors.Vocab at 0x7fbf8d8c2860>,
 'i7': <gensim.models.keyedvectors.Vocab at 0x7fbf8d8c2588>,
 'FuncCall': <gensim.models.keyedvectors.Vocab at 0x7fbf8d8c2a20>,
 'scanf': <gensim.models.keyedvectors.Vocab at 0x7fbf8d8c26a0>,
 'ExprList': <gensim.models.keyedvectors.Vocab at 0x7fbf8d8c2780>,
 '"%d"': <gensim.models.keyedvectors.Vocab

In [86]:
w2v.wv.vocab['main'].index

40

In [68]:
## 1st line of block

# Decode the token indices of the first statement back into words.
[w2v.wv.index2word[token_id] for token_id in (32, 2, 30, 40, 4)]

['FuncDef', 'Decl', 'FuncDecl', 'main', 'int']

In [85]:
## 2nd line of block

w2v.wv.index2word[6]

'Compound'

In [71]:
## 3rd line of block

# Decode the token indices of the third statement back into words.
[w2v.wv.index2word[token_id] for token_id in (2, 13, 4, 5)]

['Decl', 'n', 'int', '0']

```
   Decl
   / \
  n   0
 /
int 
```

In [72]:
## 4th line of block

# Decode the token indices of the fourth statement back into words.
[w2v.wv.index2word[token_id] for token_id in (2, 22, 4, 5)]

['Decl', 'k', 'int', '0']

```
   Decl
   / \
  k   0
 /
int 
```

#### Generated block sequences from Word2Vec

* The code blocks: preview of just one row (4970).
* Each generated block statement is a **multiway tree**, so a node can have more than 2 children.

In [51]:
blk = pd.read_pickle('astnn/data/train/blocks.pkl')

In [60]:
blk.iloc[1,1]

[[32, [2, [30, [40, [4]]]]],
 [6],
 [2, [13, [4]], [5]],
 [2, [22, [4]], [5]],
 [2, [0, [4]], [5]],
 [2, [8, [4]], [5]],
 [2, [54, [4]], [5]],
 [2, [24, [4]], [5]],
 [59, [62], [13]],
 [2, [23, [9, [4]], [201]]],
 [16, [3, [0], [5]], [34, [0], [18, [13], [10]]], [14, [0]]],
 [59, [62], [1, [9], [0]]],
 [59, [62], [22]],
 [91, [20, [8], [13]]],
 [6],
 [91, [31, [19, [1, [9], [8]], [22]], [34, [8], [18, [13], [10]]]]],
 [6],
 [16, [3, [54], [8]], [34, [54], [18, [13], [37]]], [14, [54]]],
 [6],
 [3, [1, [9], [54]], [1, [9], [17, [54], [10]]]],
 [7],
 [79, [13]],
 [7],
 [14, [8]],
 [7],
 [16, [3, [24], [5]], [34, [24], [18, [13], [37]]], [14, [24]]],
 [28, [28, [51], [1, [9], [24]]], [102]],
 [28, [51], [1, [9], [24]]],
 [39, [5]],
 [7]]

So it is a Python list, where each element is itself a (nested) list representing a **single statement**

#### Comparison with original source code

```cpp
int main()  
{  
 int n=0,k=0,i=0,j=0,l=0,m=0;            
 cin>>n;                     
 int a[100000];              
 for(i=0;i<=n-1;i++)         
  cin>>a[i];
 cin>>k;                     
 while(j < n){               
  while(a[j] == k &&j<=n-1){  
   for(l = j; l <= n-2; l++){
    a[l] = a[l+1];
   }
   n--;
  }
  j ++;
 }
 for(m=0;m<=n-2;m++)           
  cout<<a[m]<<' ';
 cout<<a[m];
 return 0;
}
```

### Word2Vec embedding weights

In [113]:
w2v.wv.vectors

array([[-0.04341486,  0.16446285, -0.23305991, ...,  0.2727674 ,
         0.02384077, -0.23563723],
       [-0.18621117,  0.33001685, -0.20516539, ...,  0.05864863,
         0.04645239, -0.3211731 ],
       [-0.22320849, -0.05841545, -0.29816398, ...,  0.17411198,
        -0.07813436, -0.26289403],
       ...,
       [-0.10280308,  0.02005816, -0.13732348, ...,  0.04655784,
        -0.11028833, -0.05753554],
       [-0.10130277, -0.01562448, -0.15953381, ...,  0.02244846,
        -0.17570926, -0.07128225],
       [-0.0479025 , -0.04194906, -0.18860194, ...,  0.08072229,
        -0.05621766, -0.03924392]], dtype=float32)

In [114]:
w2v.wv.vectors.shape

(8188, 128)