In [2]:
import sqlite3
import pandas
import numpy as np
import networkx as nx
from tqdm.autonotebook import tqdm
import pickle



# Establich  connection

In [70]:
conn = sqlite3.connect("D:/java-source-graph-v2/java-source-graph-v2.srctrldb")

In [71]:
cursor = conn.cursor()

# Fetch nodes and edges from db

In [6]:
sql = "SELECT * FROM node"
cursor.execute(sql)
nodes = cursor.fetchall()

In [7]:
len(nodes)

504154

In [8]:
sql = "SELECT * FROM edge"
cursor.execute(sql)
edges = cursor.fetchall()

In [9]:
len(edges)

3239487

# Prepare and Preprocess nodes

In [10]:
def parse(s):
    s = s.split('\t')
    s = list(filter(lambda x: x not in ['.', 's', 'p', ''], s))
    s = [x[1:] for x in s]
    return '.'.join(s)

In [11]:
nodes = pandas.DataFrame(nodes, columns=['id', 'type', 'name']).set_index('id')

In [12]:
nodes['name'] = nodes['name'].apply(parse)

In [13]:
np.unique(nodes['type'].values)

array([     1,      4,     32,    128,    256,    512,   2048,   8192,
        16384,  32768, 131072, 262144], dtype=int64)

In [14]:
id = 0
tp = 8192
nodes[nodes['type']==tp].iloc[id]['name'], nodes[nodes['type']==tp].iloc[id].name

('edu.stanford.nlp.coref.hybrid.ChineseCorefBenchmarkSlowITest.runCorefTest.static java.lang.String.(boolean)',
 34)

In [15]:
get_source(nodes[nodes['type']==tp].iloc[id].name)

NameError: name 'get_source' is not defined

# Prepare edges

In [16]:
edges = pandas.DataFrame(edges, columns=['id', 'type', 'src_node', 'trg_node']).set_index('id')

In [17]:
np.unique(edges['type'].values)

array([   1,    2,    4,    8,   16,   32,   64,  512, 4096], dtype=int64)

# Save or load nodes and edges from csv 

In [18]:
nodes.to_csv("java-nodes_table.csv")

In [3]:
nodes = pandas.read_csv("java-nodes_table.csv").set_index('id')

FileNotFoundError: [Errno 2] File b'java-nodes_table.csv' does not exist: b'java-nodes_table.csv'

In [19]:
edges.to_csv("java-edges_tables.csv")

In [3]:
edges = pandas.read_csv("java-edges_tables.csv").set_index('id')

# Build graph from edges with type 8

In [19]:
G = nx.DiGraph()

In [198]:
for edge in edges[edges['type'] == 8].values:
    G.add_edge(edge[1], edge[2])

In [443]:
with open('java-nx_fc_graph.pickle', 'wb') as f:
    pickle.dump(G, f)

In [4]:
with open('java-nx_fc_graph.pickle', 'rb') as f:
    G = pickle.load(f)

# Build mapping from function nodes to their content

In [102]:
node2content = dict()
success_cnt = 0
total = 0
failed = []

In [103]:
for i, node in tqdm(nodes.iterrows(), total = len(nodes)):
    if node['type'] not in [8192]:
        continue
    total += 1
    if total % 10000 == 0:
        print(success_cnt, '/', total)
    cursor.execute(f'SELECT * FROM occurrence WHERE element_id = {i}')
    occurrences = cursor.fetchall()
    if len(occurrences) < 2:
        failed.append((i, len(occurrences)))
        continue
    cursor.execute(f'SELECT * FROM source_location WHERE id = {occurrences[1][1]}')
    source_locations = cursor.fetchall()
    if len(source_locations) != 1:
        failed.append((i, -1))
        continue
    source_location = source_locations[0]
    cursor.execute(f'SELECT * FROM filecontent WHERE id = {source_location[1]}')
    file_contents = cursor.fetchall()
    if len(file_contents) != 1:
        failed.append((i, -2))
        continue
    file_content = file_contents[0][1].split('\n')
    start_line = source_location[2] - 1
    end_line = source_location[4]
    content = file_content[start_line : end_line]
    tabs = len(content[0]) - len(content[0].lstrip())
    content = [x[tabs:] for x in content]
    #content[-1] = ''
    content = '\n'.join(content)
    if len(content) == 0:
        failed.append((i, -3))
        continue
    node2content[i] = content
    success_cnt += 1

HBox(children=(IntProgress(value=0, max=504154), HTML(value='')))

8936 / 10000
18487 / 20000
27572 / 30000
36951 / 40000
46667 / 50000
56164 / 60000
65990 / 70000
75961 / 80000
85510 / 90000
94798 / 100000
104411 / 110000
113880 / 120000
123390 / 130000
132900 / 140000
142139 / 150000
151470 / 160000
160770 / 170000
170347 / 180000
179855 / 190000
189501 / 200000
199139 / 210000
208690 / 220000
218184 / 230000
227846 / 240000
237329 / 250000
246752 / 260000
256515 / 270000
266103 / 280000



In [104]:
with open('java-node2content.pickle', 'wb') as f:
    pickle.dump(node2content, f)

In [20]:
with open('java-node2content.pickle', 'rb') as f:
    node2content = pickle.load(f)

# Look at failed mappings

In [218]:
len(failed)

38940

In [219]:
failed = pandas.DataFrame(failed, columns=['id', 'error_type'])

In [220]:
np.unique(failed['error_type'].values, return_counts=True)

(array([-3,  0,  1], dtype=int64), array([24723, 13860,   357], dtype=int64))

In [207]:
failed[failed['error_type'] == 3]

Unnamed: 0,id,error_type
0,34,3
4,42,3
11,56,3
13,58,3
14,60,3
16,81,3
17,84,3
18,85,3
19,86,3
20,87,3


# Look at source of node

In [96]:
cursor.execute(f'SELECT * FROM occurrence WHERE element_id = 11723210')
occurrences = cursor.fetchall()
occurrences

[(11723210, 7926893), (11723210, 7926894), (11723210, 7926897)]

In [97]:
cursor.execute(f'SELECT * FROM source_location WHERE id = {occurrences[1][1]}')
source_locations = cursor.fetchall()
source_locations

[(7926894, 11723202, 163, 1, 179, 3, 1)]

In [101]:
cursor.execute(f'SELECT * FROM filecontent WHERE id = {source_locations[0][1]}')
file_contents = cursor.fetchall()
file_content = file_contents[0][1].split('\n')
start_line = source_locations[0][2] - 1
end_line = source_locations[0][4]
content = file_content[start_line : end_line]
tabs = len(content[0]) - len(content[0].lstrip())
content = [x[tabs:] for x in content]
#content[-1] = ''
print('\n'.join(content))

@Override
public String[] getOptions() {

  Vector<String> result = new Vector<String>();

  result.add("-filter");
  if (getFilter() instanceof OptionHandler) {
    result.add(getFilter().getClass().getName() + " "
      + Utils.joinOptions(((OptionHandler) getFilter()).getOptions()));
  } else {
    result.add(getFilter().getClass().getName());
  }

  Collections.addAll(result, super.getOptions());

  return result.toArray(new String[result.size()]);
}


In [99]:
print('\n'.join(file_content))

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * PLSClassifier.java
 * Copyright (C) 2006,2015 University of Waikato, Hamilton, New Zealand
 */

package weka.classifiers.functions;

import java.util.Collections;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;

import weka.classifiers.RandomizableClassifier;
import weka.core.Capabilitie




In [105]:
def get_source(node_id):
    if node_id in node2content:
        return node2content[node_id]
    cursor.execute(f'SELECT * FROM occurrence WHERE element_id = {node_id}')
    occurrences = cursor.fetchall()
    if len(occurrences) < 2:
        return 'No content'
    cursor.execute(f'SELECT * FROM source_location WHERE id = {occurrences[1][1]}')
    source_locations = cursor.fetchall()
    source_locations
    cursor.execute(f'SELECT * FROM filecontent WHERE id = {source_locations[0][1]}')
    file_contents = cursor.fetchall()
    file_content = file_contents[0][1].split('\n')
    start_line = source_locations[0][2] - 1
    end_line = source_locations[0][4]
    content = file_content[start_line : end_line]
    tabs = len(content[0]) - len(content[0].lstrip())
    content = [x[tabs:] for x in content]
    #content[-1] = ''
    return '\n'.join(content)

# Look at common and not common nodes, which are with content and in graph

In [32]:
nodes_in_graph = set(G.nodes)

In [33]:
nodes_with_content = set(node2content.keys())

In [34]:
len(nodes_in_graph & nodes_with_content)

191160

In [35]:
len(nodes_in_graph | nodes_with_content)

247620

In [36]:
len(nodes_in_graph), len(nodes_with_content)

(218116, 220664)

In [21]:
len(nodes_with_content - nodes_in_graph)

29504

In [22]:
len(nodes_in_graph - nodes_with_content)

27952

# Remove failed nodes which have not 0 type failure

In [26]:
failed_n0 = set([x[0] for x in failed.values if x[1] != 0])

In [27]:
len(failed_n0)

1377

In [28]:
G.remove_nodes_from(failed_n0)

In [209]:
len(G.nodes), len(G.edges)

(218116, 605611)

In [210]:
failed_0 = [x[0] for x in failed.values if x[1] == 0]

In [214]:
len(failed_0)

15599

In [229]:
nodes.loc[2281510]

type                                       4096
name    sklearn.utils._cython_blas._dot_memview
Name: 2281510, dtype: object

# Remove nodes which have no content

In [212]:
G.remove_nodes_from(nodes_in_graph - nodes_with_content)

In [213]:
len(G.nodes), len(G.edges)

(191160, 295270)

# Get all available call edges

In [21]:
#available edge types: 1,    2,    4,    8,   16,   32,   64,  512, 4096
call_edges = dict()
for edge_type in [1,    2,    4,    8,   16,   32,   64,  512, 4096]:
    tmp = []
    for i, edge in tqdm(edges[edges['type'] == edge_type].iterrows(), total=len(edges[edges['type'] == edge_type])):
        if nodes.loc[edge.src_node].type in [8192] and nodes.loc[edge.trg_node].type in [8192]:
            tmp.append(i)
    call_edges[edge_type] = tmp

HBox(children=(IntProgress(value=0, max=475303), HTML(value='')))




HBox(children=(IntProgress(value=0, max=991812), HTML(value='')))




HBox(children=(IntProgress(value=0, max=369521), HTML(value='')))




HBox(children=(IntProgress(value=0, max=885138), HTML(value='')))




HBox(children=(IntProgress(value=0, max=34089), HTML(value='')))




HBox(children=(IntProgress(value=0, max=75186), HTML(value='')))




HBox(children=(IntProgress(value=0, max=24193), HTML(value='')))




HBox(children=(IntProgress(value=0, max=214525), HTML(value='')))




HBox(children=(IntProgress(value=0, max=169720), HTML(value='')))




In [68]:
with open('java-call-edges.pickle', 'wb') as f:
    pickle.dump(call_edges, f)

In [26]:
for edge_type in [1,    2,    4,    8,   16,   32,   64,  512, 4096]:
    print(f'{edge_type}:{len(call_edges[edge_type])}')

1:0
2:0
4:5904
8:797406
16:0
32:75186
64:0
512:0
4096:0


In [9]:
for edge_id in call_edges[2]:
    edge = edges.loc[edge_id]
    if get_source(edge.trg_node).split()[0] != 'class':
        print(edge_id)

NameError: name 'cursor' is not defined

# Get call edge information

- type 4: call as lambda
- type 8: typical call
- type 32: implementation to interface

In [81]:
call_edges[8]

[155,
 157,
 159,
 161,
 164,
 168,
 173,
 175,
 177,
 185,
 188,
 190,
 193,
 195,
 249,
 256,
 257,
 259,
 262,
 264,
 278,
 282,
 286,
 288,
 291,
 294,
 297,
 298,
 302,
 304,
 327,
 341,
 346,
 348,
 349,
 1085,
 1086,
 1087,
 1088,
 1090,
 1092,
 1094,
 1095,
 1096,
 1102,
 1104,
 1105,
 1108,
 1109,
 1159,
 1162,
 1163,
 1165,
 1167,
 1169,
 1170,
 1172,
 1173,
 1196,
 1197,
 1198,
 1200,
 1213,
 1218,
 1886,
 1888,
 1890,
 1900,
 1902,
 1906,
 1927,
 1931,
 1934,
 1936,
 1937,
 1939,
 1942,
 1945,
 1946,
 1948,
 1950,
 1952,
 1963,
 1966,
 1970,
 1973,
 1975,
 1977,
 1979,
 1981,
 1983,
 1986,
 1994,
 1996,
 1998,
 2002,
 2013,
 2017,
 2033,
 2036,
 2038,
 2039,
 2042,
 2045,
 2052,
 2055,
 2062,
 2067,
 2068,
 2074,
 2078,
 2081,
 2082,
 2084,
 2085,
 2087,
 2091,
 2093,
 2753,
 2754,
 2756,
 2759,
 2762,
 2768,
 2769,
 2776,
 2777,
 2784,
 2786,
 2788,
 2791,
 2792,
 2794,
 2796,
 2797,
 2798,
 2801,
 2804,
 2806,
 2808,
 2816,
 2818,
 2824,
 2827,
 2829,
 2833,
 2835,
 2837,

In [85]:
edge = edges.loc[14158]
print((edge.src_node, edge.trg_node))
print((nodes.loc[edge.src_node]['name'], nodes.loc[edge.src_node]['type']))
print((nodes.loc[edge.trg_node]['name'], nodes.loc[edge.trg_node]['type']))

(13865, 13851)
('edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcherITest.testTokenSequenceMatcher2.void.()', 8192)
('edu.stanford.nlp.ling.tokensregex.BasicSequenceMatchResult<T>.group.java.lang.String.()', 8192)


In [86]:
print(get_source(edge.src_node))
print(get_source(edge.trg_node))

public void testTokenSequenceMatcher2() throws IOException {
  CoreMap doc = createDocument(testText1);
  TokenSequencePattern p = TokenSequencePattern.compile(
                  getSequencePatternExpr(".*", ".*", "of", ".*"));

  TokenSequenceMatcher m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
  boolean match = m.find();
  assertTrue(match);
  assertEquals(0, m.groupCount());
  assertEquals("first Bishop of London", m.group());
  match = m.find();
  assertTrue(match);
  assertEquals(0, m.groupCount());
  assertEquals("third Archbishop of Canterbury", m.group());
  match = m.find();
  assertTrue(match);
  assertEquals(0, m.groupCount());
  assertEquals("a member of the", m.group());
  match = m.find();
  assertTrue(match);
  assertEquals(0, m.groupCount());
  assertEquals("as Bishop of London", m.group());
  match = m.find();
  assertFalse(match);

  // Test sequence with groups
  p = TokenSequencePattern.compile(
                  new SequencePattern.SequencePat

# 123

In [127]:
bad_nodes = []
bad_edges = []
for edge_id in tqdm(call_edges[8]):
    edge = edges.loc[edge_id]
    c1 = get_source(edge.src_node)
    c2 = get_source(edge.trg_node)
    b = False
    if c1.find('{') == -1:
        b = True
        bad_nodes.append(edge.src_node)
#     if c2.find('{') == -1 and c2 != 'No content':
#         b = True
#         bad_nodes.append(edge.trg_node) 
    if b:
        bad_edges.append(edge_id)

HBox(children=(IntProgress(value=0, max=797406), HTML(value='')))

In [122]:
print(get_source(4097187))

private native void allocate(@ByRef ConstantDataBuffer shapes, @ByRef ConstantDataBuffer offets, @Cast("Nd4jLong") long numTads);


In [128]:
len(set(bad_nodes))

150

In [129]:
len(set(bad_edges))

219

# Filter edges with type 8

In [None]:
no_source_src_nodes = []
good_call_edges = []
potential_call_edge = []
fail_edge = []
fail2_edge = []
for edge_id in tqdm(call_edges[8]):
    edge = edges.loc[edge_id]
    src_name = nodes.loc[edge.src_node]['name'].split('.')[-1]
    trg_name = nodes.loc[edge.trg_node]['name'].split('.')
    if edge.src_node not in node2content:
        no_source_src_nodes.append(edge.src_node)
    elif edge.trg_node not in node2content:
        src_source = ''.join(node2content[edge.src_node].split())
        start = src_source.find(trg_name[-1] + '(')
        start2 = src_source.find(trg_name[-2] + '(')
        if start == -1 and start2 == -1:
            fail_edge.append((edge_id, src_source, trg_name))
            continue
        potential_call_edge.append(edge_id)
    else:
        src_source = ''.join(node2content[edge.src_node].split())
        start = src_source.find(trg_name[-1] + '(')
        start2 = src_source.find(trg_name[-2] + '(')
        if start == -1 and start2 == -1:
            fail2_edge.append((edge_id, src_source, trg_name))
            continue
        good_call_edges.append(edge_id)

HBox(children=(IntProgress(value=0, max=589195), HTML(value='')))

In [68]:
len(good_call_edges) + len(potential_call_edge)

569890

In [69]:
len(fail_edge), len(fail2_edge), len(no_source_src_nodes)

(14353, 2978, 1974)

# Filters edges with type 512

In [34]:
no_source_src_nodes512 = []
good_call_edges512 = []
potential_call_edge512 = []
fail_edge512 = []
fail2_edge512 = []
for edge_id in tqdm(call_edges[512]):
    edge = edges.loc[edge_id]
    src_name = nodes.loc[edge.src_node]['name'].split('.')[-1]
    trg_name = nodes.loc[edge.trg_node]['name'].split('.')
    if edge.src_node not in node2content:
        no_source_src_nodes512.append(edge.src_node)
    elif edge.trg_node not in node2content:
        src_source = node2content[edge.src_node]
        start = src_source.find(trg_name[-1] + '(')
        start2 = src_source.find(trg_name[-2] + '(')
        if start == -1 and start2 == -1:
            fail_edge512.append((edge_id, src_source, trg_name))
            continue
        potential_call_edge512.append(edge_id)
    else:
        good_call_edges512.append(edge_id)
        if start == -1 and start2 == -1:
            fail2_edge512.append((edge_id, src_source, trg_name))
            continue
        potential_call_edge512.append(edge_id)

HBox(children=(IntProgress(value=0, max=1093), HTML(value='')))




In [38]:
len(good_call_edges512) + len(potential_call_edge512)

1066

In [39]:
len(fail_edge512), len(no_source_src_nodes512)

(23, 4)

# Find ambigious function call edges

In [None]:
edges_4 = edges[edges['type'] == 4]

In [861]:
ambiguous_edges = []
cnt = 0

In [None]:
for i, edge in tqdm(edges_4.iterrows(), total=len(edges_4)):
    cnt += 1
    if (cnt % 100000 == 0):
        print(len(ambiguous_edges), '/', cnt)
    if edge.src_node in nodes_with_content and edge.trg_node in nodes_with_content:
        ambiguous_edges.append((i, edge.src_node, edge.trg_node))

HBox(children=(IntProgress(value=0, max=69675897), HTML(value='')))

2 / 100000
3 / 200000
4 / 300000
22 / 400000
26978 / 500000
85013 / 600000
134159 / 700000
176996 / 800000
210036 / 900000
256216 / 1000000
312872 / 1100000
369493 / 1200000
412398 / 1300000
449468 / 1400000
493760 / 1500000
540487 / 1600000
594561 / 1700000
653235 / 1800000
692584 / 1900000
737651 / 2000000
792872 / 2100000
847678 / 2200000
914069 / 2300000
987689 / 2400000
1051360 / 2500000
1101104 / 2600000
1141964 / 2700000
1175287 / 2800000
1210935 / 2900000
1246152 / 3000000
1285366 / 3100000
1328458 / 3200000
1372250 / 3300000
1411017 / 3400000
1453225 / 3500000
1502401 / 3600000
1529778 / 3700000
1566145 / 3800000
1613563 / 3900000
1644177 / 4000000
1669106 / 4100000
1695719 / 4200000
1718544 / 4300000
1743023 / 4400000
1769475 / 4500000
1792483 / 4600000
1830699 / 4700000
1852601 / 4800000
1880008 / 4900000
1905974 / 5000000
1943077 / 5100000
1984427 / 5200000
2008519 / 5300000
2033969 / 5400000
2060125 / 5500000
2083467 / 5600000
2108412 / 5700000
2138815 / 5800000
2162226 / 

In [868]:
len(ambiguous_edges)

26619205

In [880]:
ambiguous_edges = pandas.DataFrame(ambiguous_edges, columns=['id', 'src_node', 'trg_node']).set_index('id')

In [881]:
ambiguous_edges.to_csv("ambiguous_edges_tables.csv")