In [1]:
# Mount Google Drive at /content/drive; every dataset path below lives under it
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [14]:
import json

# Data Preprocessing on snippet level python code summarization

Observations:
- The main (top-level) part of a program is usually summarized simply as "Driver Code".
- There is a little noise in the files, but not too much. E.g. the snippet "def maxPresum(a,b): NEW_LINE" is paired with the summary "Python3 implementation of the above approach".

Data samples:
- Training data: 81.2K code snippets - summarization pairs
- Validation data: 3.9K pairs
- Test data: 7.3K pairs

Output:
- The output is a set of .json files, each storing a one-to-one mapping from a Python code snippet to its summary in words.
- The output will be named as "train_sum_snippet.json", "val_sum_snippet.json", and "test_sum_snippet.json"

Training dataset preprocessing

In [28]:
# Snippet-level training split: the code file (.py) and the summary file (.txt)
# share one path stem and differ only in their extension.
py_file, txt_file = (
    '/content/drive/Shareddrives/CS224G Project/datasets_summarization/pair_data_tok_1_comment/Python-comment/train-Python-comment-tok' + extension
    for extension in ('.py', '.txt')
)

In [29]:
# Read the code file and the summary file into two lists of lines.
# The handles get their own names so that py_file / txt_file keep holding the
# path strings; reusing those names for the handles made a second run of this
# cell fail inside open().
# UTF-8 is requested explicitly so decoding does not depend on the runtime
# locale (assumes the dataset files are UTF-8 -- TODO confirm).
with open(py_file, 'r', encoding='utf-8') as py_handle:
    py_lines = py_handle.readlines()

with open(txt_file, 'r', encoding='utf-8') as txt_handle:
    txt_lines = txt_handle.readlines()

In [30]:
# Pair each code line with the summary found on the same line number.
# zip() stops at the shorter input without warning, so a misaligned pair of
# files is rejected up front instead of yielding a silently truncated dataset.
if len(py_lines) != len(txt_lines):
    raise ValueError(
        f'line count mismatch: {len(py_lines)} code lines vs '
        f'{len(txt_lines)} summary lines'
    )

# Surrounding whitespace (including the trailing newline) is stripped from both sides.
mapped_data = [
    {"py": py_line.strip(), "explain": txt_line.strip()}
    for py_line, txt_line in zip(py_lines, txt_lines)
]

In [12]:
# Preview the first 50 code/summary pairs as a quick visual sanity check
for pair in mapped_data[:50]:
    code, summary = pair["py"], pair["explain"]
    print(f'py: {code}\nexplain: {summary}\n')

py: def maxPresum ( a , b ) : NEW_LINE
explain: Python3 implementation of the above approach

py: X = max ( a [ 0 ] , 0 ) NEW_LINE
explain: Stores the maximum prefix sum of the array A [ ]

py: for i in range ( 1 , len ( a ) ) : NEW_LINE INDENT a [ i ] += a [ i - 1 ] NEW_LINE X = max ( X , a [ i ] ) NEW_LINE DEDENT
explain: Traverse the array A [ ]

py: Y = max ( b [ 0 ] , 0 ) NEW_LINE
explain: Stores the maximum prefix sum of the array B [ ]

py: for i in range ( 1 , len ( b ) ) : NEW_LINE INDENT b [ i ] += b [ i - 1 ] NEW_LINE Y = max ( Y , b [ i ] ) NEW_LINE DEDENT return X + Y NEW_LINE
explain: Traverse the array B [ ]

py: A = [ 2 , - 1 , 4 , - 5 ] NEW_LINE B = [ 4 , - 3 , 12 , 4 , - 3 ] NEW_LINE print ( maxPresum ( A , B ) ) NEW_LINE
explain: Driver code

py: import math NEW_LINE
explain: Python3 program for the above approach

py: def sumOfTwoCubes ( n ) : NEW_LINE INDENT lo = 1 NEW_LINE hi = round ( math . pow ( n , 1 / 3 ) ) NEW_LINE while ( lo <= hi ) : NEW_LINE INDENT curr =

In [31]:
# Number of code/summary pairs in mapped_data (displayed as the cell output)
len(mapped_data)

81207

In [15]:
# Destination for the snippet-level training pairs
output_json_path = '/content/drive/Shareddrives/CS224G Project/datasets_summarization/processed_data/train_sum_snippet.json'

# Serialize the list of pairs with 4-space indentation, then write it in one go
serialized_pairs = json.dumps(mapped_data, indent=4)
with open(output_json_path, 'w') as out_handle:
    out_handle.write(serialized_pairs)

Val dataset preprocessing

In [16]:
# Snippet-level validation split: the code file (.py) and the summary file (.txt)
# share one path stem and differ only in their extension.
py_file, txt_file = (
    '/content/drive/Shareddrives/CS224G Project/datasets_summarization/pair_data_tok_1_comment/Python-comment/val-Python-comment-tok' + extension
    for extension in ('.py', '.txt')
)

In [17]:
# Read the code file and the summary file into two lists of lines.
# The handles get their own names so that py_file / txt_file keep holding the
# path strings; reusing those names for the handles made a second run of this
# cell fail inside open().
# UTF-8 is requested explicitly so decoding does not depend on the runtime
# locale (assumes the dataset files are UTF-8 -- TODO confirm).
with open(py_file, 'r', encoding='utf-8') as py_handle:
    py_lines = py_handle.readlines()

with open(txt_file, 'r', encoding='utf-8') as txt_handle:
    txt_lines = txt_handle.readlines()

In [18]:
# Pair each code line with the summary found on the same line number.
# zip() stops at the shorter input without warning, so a misaligned pair of
# files is rejected up front instead of yielding a silently truncated dataset.
if len(py_lines) != len(txt_lines):
    raise ValueError(
        f'line count mismatch: {len(py_lines)} code lines vs '
        f'{len(txt_lines)} summary lines'
    )

# Surrounding whitespace (including the trailing newline) is stripped from both sides.
mapped_data = [
    {"py": py_line.strip(), "explain": txt_line.strip()}
    for py_line, txt_line in zip(py_lines, txt_lines)
]

In [19]:
# Preview the first 50 code/summary pairs as a quick visual sanity check
for pair in mapped_data[:50]:
    code, summary = pair["py"], pair["explain"]
    print(f'py: {code}\nexplain: {summary}\n')

py: def Conversion ( centi ) : NEW_LINE INDENT pixels = ( 96 * centi ) / 2.54 NEW_LINE print ( round ( pixels , 2 ) ) NEW_LINE DEDENT
explain: Function to convert centimeters to pixels

py: centi = 15 NEW_LINE Conversion ( centi ) NEW_LINE
explain: Driver Code

py: def xor_operations ( N , arr , M , K ) : NEW_LINE
explain: Method that returns the corresponding output by taking the given inputs .

py: if M < 0 or M >= N : NEW_LINE INDENT return - 1 NEW_LINE DEDENT
explain: If this condition is satisfied , value of M is invalid

py: if K < 0 or K >= N - M : NEW_LINE INDENT return - 1 NEW_LINE DEDENT
explain: Check if index K is valid

py: for _ in range ( M ) : NEW_LINE
explain: Loop to perform M operations

py: temp = [ ] NEW_LINE
explain: Creating a temporary list

py: for i in range ( len ( arr ) - 1 ) : NEW_LINE
explain: Traversing the array

py: value = arr [ i ] ^ arr [ i + 1 ] NEW_LINE
explain: Calculate XOR values of adjacent elements

py: temp . append ( value ) NEW_LINE
explain

In [20]:
# Number of code/summary pairs in mapped_data (displayed as the cell output)
len(mapped_data)

3946

In [21]:
# Destination for the snippet-level validation pairs
output_json_path = '/content/drive/Shareddrives/CS224G Project/datasets_summarization/processed_data/val_sum_snippet.json'

# Serialize the list of pairs with 4-space indentation, then write it in one go
serialized_pairs = json.dumps(mapped_data, indent=4)
with open(output_json_path, 'w') as out_handle:
    out_handle.write(serialized_pairs)

Test dataset preprocessing

In [22]:
# Snippet-level test split: the code file (.py) and the summary file (.txt)
# share one path stem and differ only in their extension.
py_file, txt_file = (
    '/content/drive/Shareddrives/CS224G Project/datasets_summarization/pair_data_tok_1_comment/Python-comment/test-Python-comment-tok' + extension
    for extension in ('.py', '.txt')
)

In [23]:
# Read the code file and the summary file into two lists of lines.
# The handles get their own names so that py_file / txt_file keep holding the
# path strings; reusing those names for the handles made a second run of this
# cell fail inside open().
# UTF-8 is requested explicitly so decoding does not depend on the runtime
# locale (assumes the dataset files are UTF-8 -- TODO confirm).
with open(py_file, 'r', encoding='utf-8') as py_handle:
    py_lines = py_handle.readlines()

with open(txt_file, 'r', encoding='utf-8') as txt_handle:
    txt_lines = txt_handle.readlines()

In [24]:
# Pair each code line with the summary found on the same line number.
# zip() stops at the shorter input without warning, so a misaligned pair of
# files is rejected up front instead of yielding a silently truncated dataset.
if len(py_lines) != len(txt_lines):
    raise ValueError(
        f'line count mismatch: {len(py_lines)} code lines vs '
        f'{len(txt_lines)} summary lines'
    )

# Surrounding whitespace (including the trailing newline) is stripped from both sides.
mapped_data = [
    {"py": py_line.strip(), "explain": txt_line.strip()}
    for py_line, txt_line in zip(py_lines, txt_lines)
]

In [25]:
# Preview the first 50 code/summary pairs as a quick visual sanity check
for pair in mapped_data[:50]:
    code, summary = pair["py"], pair["explain"]
    print(f'py: {code}\nexplain: {summary}\n')

py: def minSum ( A , N ) : NEW_LINE
explain: Function to find minimum sum after deletion

py: mp = { } NEW_LINE sum = 0 NEW_LINE
explain: Stores frequency of array elements

py: for i in range ( N ) : NEW_LINE
explain: Traverse the array

py: sum += A [ i ] NEW_LINE
explain: Calculate sum

py: if A [ i ] in mp : NEW_LINE INDENT mp [ A [ i ] ] += 1 NEW_LINE DEDENT else : NEW_LINE INDENT mp [ A [ i ] ] = 1 NEW_LINE DEDENT
explain: Update frequency of the current element

py: minSum = float ( ' inf ' ) NEW_LINE
explain: Stores the minimum sum required

py: for it in mp : NEW_LINE
explain: Traverse map

py: minSum = min ( minSum , sum - ( it * mp [ it ] ) ) NEW_LINE
explain: Find the minimum sum obtained

py: return minSum NEW_LINE
explain: Return minimum sum

py: arr = [ 4 , 5 , 6 , 6 ] NEW_LINE
explain: Input array

py: N = len ( arr ) NEW_LINE print ( minSum ( arr , N ) ) NEW_LINE
explain: Size of array

py: def maxAdjacent ( arr , N ) : NEW_LINE INDENT res = [ ] NEW_LINE DEDENT
explain

In [27]:
# Number of code/summary pairs in mapped_data (displayed as the cell output)
len(mapped_data)

7293

In [26]:
# Destination for the snippet-level test pairs
output_json_path = '/content/drive/Shareddrives/CS224G Project/datasets_summarization/processed_data/test_sum_snippet.json'

# Serialize the list of pairs with 4-space indentation, then write it in one go
serialized_pairs = json.dumps(mapped_data, indent=4)
with open(output_json_path, 'w') as out_handle:
    out_handle.write(serialized_pairs)

# Data Preprocessing on program level python code summarization

Observations:
- Since this is program-level code, each summary gives a high-level overview of what the whole program does (in a few words).
- There is a little noise in the files, but not too much.

Data Samples:
- Training data (program) includes 9.3K pairs of program code - summarization words
- Validation data includes 472 pairs of samples
- Test data includes 887 pairs of samples

Output:
- The output is a set of .json files, each storing a one-to-one mapping from a Python program to its summary in words.
- The output will be named as "train_sum_program.json", "val_sum_program.json", and "test_sum_program.json"

Training dataset preprocessing

In [49]:
# Program-level training split: the code file (.py) and the description file (.txt)
# share one path stem and differ only in their extension.
py_file, txt_file = (
    '/content/drive/Shareddrives/CS224G Project/datasets_summarization/pair_data_tok_full_desc/Python-desc/train-Python-desc-tok' + extension
    for extension in ('.py', '.txt')
)

In [50]:
# Read the code file and the description file into two lists of lines.
# The handles get their own names so that py_file / txt_file keep holding the
# path strings; reusing those names for the handles made a second run of this
# cell fail inside open().
# UTF-8 is requested explicitly so decoding does not depend on the runtime
# locale (assumes the dataset files are UTF-8 -- TODO confirm).
with open(py_file, 'r', encoding='utf-8') as py_handle:
    py_lines = py_handle.readlines()

with open(txt_file, 'r', encoding='utf-8') as txt_handle:
    txt_lines = txt_handle.readlines()

In [51]:
# Pair each program line with the description found on the same line number.
# zip() stops at the shorter input without warning, so a misaligned pair of
# files is rejected up front instead of yielding a silently truncated dataset.
if len(py_lines) != len(txt_lines):
    raise ValueError(
        f'line count mismatch: {len(py_lines)} code lines vs '
        f'{len(txt_lines)} summary lines'
    )

# Surrounding whitespace (including the trailing newline) is stripped from both sides.
mapped_data = [
    {"py": py_line.strip(), "explain": txt_line.strip()}
    for py_line, txt_line in zip(py_lines, txt_lines)
]

In [52]:
# Preview the first 50 program/description pairs as a quick visual sanity check
for pair in mapped_data[:50]:
    code, summary = pair["py"], pair["explain"]
    print(f'py: {code}\nexplain: {summary}\n')

py: def maxPresum ( a , b ) : NEW_LINE INDENT X = max ( a [ 0 ] , 0 ) NEW_LINE for i in range ( 1 , len ( a ) ) : NEW_LINE INDENT a [ i ] += a [ i - 1 ] NEW_LINE X = max ( X , a [ i ] ) NEW_LINE DEDENT Y = max ( b [ 0 ] , 0 ) NEW_LINE for i in range ( 1 , len ( b ) ) : NEW_LINE INDENT b [ i ] += b [ i - 1 ] NEW_LINE Y = max ( Y , b [ i ] ) NEW_LINE DEDENT return X + Y NEW_LINE DEDENT A = [ 2 , - 1 , 4 , - 5 ] NEW_LINE B = [ 4 , - 3 , 12 , 4 , - 3 ] NEW_LINE print ( maxPresum ( A , B ) ) NEW_LINE
explain: Maximum Prefix Sum possible by merging two given arrays

py: import math NEW_LINE def sumOfTwoCubes ( n ) : NEW_LINE INDENT lo = 1 NEW_LINE hi = round ( math . pow ( n , 1 / 3 ) ) NEW_LINE while ( lo <= hi ) : NEW_LINE INDENT curr = ( lo * lo * lo + hi * hi * hi ) NEW_LINE if ( curr == n ) : NEW_LINE INDENT return True NEW_LINE DEDENT if ( curr < n ) : NEW_LINE INDENT lo += 1 NEW_LINE DEDENT else : NEW_LINE INDENT hi -= 1 NEW_LINE DEDENT DEDENT return False NEW_LINE DEDENT N = 28 NEW_L

In [53]:
# Number of program/description pairs in mapped_data (displayed as the cell output)
len(mapped_data)

9263

In [54]:
# Destination for the program-level training pairs
output_json_path = '/content/drive/Shareddrives/CS224G Project/datasets_summarization/processed_data/train_sum_program.json'

# Serialize the list of pairs with 4-space indentation, then write it in one go
serialized_pairs = json.dumps(mapped_data, indent=4)
with open(output_json_path, 'w') as out_handle:
    out_handle.write(serialized_pairs)

Validation dataset preprocessing

In [37]:
# Program-level validation split: the code file (.py) and the description file (.txt)
# share one path stem and differ only in their extension.
py_file, txt_file = (
    '/content/drive/Shareddrives/CS224G Project/datasets_summarization/pair_data_tok_full_desc/Python-desc/val-Python-desc-tok' + extension
    for extension in ('.py', '.txt')
)

In [38]:
# Read the code file and the description file into two lists of lines.
# The handles get their own names so that py_file / txt_file keep holding the
# path strings; reusing those names for the handles made a second run of this
# cell fail inside open().
# UTF-8 is requested explicitly so decoding does not depend on the runtime
# locale (assumes the dataset files are UTF-8 -- TODO confirm).
with open(py_file, 'r', encoding='utf-8') as py_handle:
    py_lines = py_handle.readlines()

with open(txt_file, 'r', encoding='utf-8') as txt_handle:
    txt_lines = txt_handle.readlines()

In [39]:
# Pair each program line with the description found on the same line number.
# zip() stops at the shorter input without warning, so a misaligned pair of
# files is rejected up front instead of yielding a silently truncated dataset.
if len(py_lines) != len(txt_lines):
    raise ValueError(
        f'line count mismatch: {len(py_lines)} code lines vs '
        f'{len(txt_lines)} summary lines'
    )

# Surrounding whitespace (including the trailing newline) is stripped from both sides.
mapped_data = [
    {"py": py_line.strip(), "explain": txt_line.strip()}
    for py_line, txt_line in zip(py_lines, txt_lines)
]

In [40]:
# Preview the first 50 program/description pairs as a quick visual sanity check
for pair in mapped_data[:50]:
    code, summary = pair["py"], pair["explain"]
    print(f'py: {code}\nexplain: {summary}\n')

py: def Conversion ( centi ) : NEW_LINE INDENT pixels = ( 96 * centi ) / 2.54 NEW_LINE print ( round ( pixels , 2 ) ) NEW_LINE DEDENT centi = 15 NEW_LINE Conversion ( centi ) NEW_LINE
explain: Program to convert Centimeters to Pixels

py: def xor_operations ( N , arr , M , K ) : NEW_LINE INDENT if M < 0 or M >= N : NEW_LINE INDENT return - 1 NEW_LINE DEDENT if K < 0 or K >= N - M : NEW_LINE INDENT return - 1 NEW_LINE DEDENT for _ in range ( M ) : NEW_LINE INDENT temp = [ ] NEW_LINE for i in range ( len ( arr ) - 1 ) : NEW_LINE INDENT value = arr [ i ] ^ arr [ i + 1 ] NEW_LINE temp . append ( value ) NEW_LINE DEDENT arr = temp [ : ] NEW_LINE DEDENT ans = arr [ K ] NEW_LINE return ans NEW_LINE DEDENT N = 5 NEW_LINE arr = [ 1 , 4 , 5 , 6 , 7 ] NEW_LINE M = 1 NEW_LINE K = 2 NEW_LINE print ( xor_operations ( N , arr , M , K ) ) NEW_LINE
explain: Kth array element after M replacements of array elements by XOR of adjacent pairs

py: def canBreakN ( n ) : NEW_LINE INDENT for i in range ( 2 , n

In [41]:
# Number of program/description pairs in mapped_data (displayed as the cell output)
len(mapped_data)

472

In [42]:
# Destination for the program-level validation pairs
output_json_path = '/content/drive/Shareddrives/CS224G Project/datasets_summarization/processed_data/val_sum_program.json'

# Serialize the list of pairs with 4-space indentation, then write it in one go
serialized_pairs = json.dumps(mapped_data, indent=4)
with open(output_json_path, 'w') as out_handle:
    out_handle.write(serialized_pairs)

Test data preprocessing

In [43]:
# Program-level test split: the code file (.py) and the description file (.txt)
# share one path stem and differ only in their extension.
py_file, txt_file = (
    '/content/drive/Shareddrives/CS224G Project/datasets_summarization/pair_data_tok_full_desc/Python-desc/test-Python-desc-tok' + extension
    for extension in ('.py', '.txt')
)

In [44]:
# Read the code file and the description file into two lists of lines.
# The handles get their own names so that py_file / txt_file keep holding the
# path strings; reusing those names for the handles made a second run of this
# cell fail inside open().
# UTF-8 is requested explicitly so decoding does not depend on the runtime
# locale (assumes the dataset files are UTF-8 -- TODO confirm).
with open(py_file, 'r', encoding='utf-8') as py_handle:
    py_lines = py_handle.readlines()

with open(txt_file, 'r', encoding='utf-8') as txt_handle:
    txt_lines = txt_handle.readlines()

In [45]:
# Pair each program line with the description found on the same line number.
# zip() stops at the shorter input without warning, so a misaligned pair of
# files is rejected up front instead of yielding a silently truncated dataset.
if len(py_lines) != len(txt_lines):
    raise ValueError(
        f'line count mismatch: {len(py_lines)} code lines vs '
        f'{len(txt_lines)} summary lines'
    )

# Surrounding whitespace (including the trailing newline) is stripped from both sides.
mapped_data = [
    {"py": py_line.strip(), "explain": txt_line.strip()}
    for py_line, txt_line in zip(py_lines, txt_lines)
]

In [46]:
# Preview the first 50 program/description pairs as a quick visual sanity check
for pair in mapped_data[:50]:
    code, summary = pair["py"], pair["explain"]
    print(f'py: {code}\nexplain: {summary}\n')

py: def minSum ( A , N ) : NEW_LINE INDENT mp = { } NEW_LINE sum = 0 NEW_LINE for i in range ( N ) : NEW_LINE INDENT sum += A [ i ] NEW_LINE if A [ i ] in mp : NEW_LINE INDENT mp [ A [ i ] ] += 1 NEW_LINE DEDENT else : NEW_LINE INDENT mp [ A [ i ] ] = 1 NEW_LINE DEDENT DEDENT minSum = float ( ' inf ' ) NEW_LINE for it in mp : NEW_LINE INDENT minSum = min ( minSum , sum - ( it * mp [ it ] ) ) NEW_LINE DEDENT return minSum NEW_LINE DEDENT arr = [ 4 , 5 , 6 , 6 ] NEW_LINE N = len ( arr ) NEW_LINE print ( minSum ( arr , N ) ) NEW_LINE
explain: Minimum sum possible by removing all occurrences of any array element

py: def maxAdjacent ( arr , N ) : NEW_LINE INDENT res = [ ] NEW_LINE for i in range ( 1 , N - 1 ) : NEW_LINE INDENT prev = arr [ 0 ] NEW_LINE maxi = - 1 * float ( ' inf ' ) NEW_LINE for j in range ( 1 , N ) : NEW_LINE INDENT if ( i == j ) : NEW_LINE INDENT continue NEW_LINE DEDENT maxi = max ( maxi , abs ( arr [ j ] - prev ) ) NEW_LINE prev = arr [ j ] NEW_LINE DEDENT res . append

In [47]:
# Number of program/description pairs in mapped_data (displayed as the cell output)
len(mapped_data)

887

In [48]:
# Destination for the program-level test pairs
output_json_path = '/content/drive/Shareddrives/CS224G Project/datasets_summarization/processed_data/test_sum_program.json'

# Serialize the list of pairs with 4-space indentation, then write it in one go
serialized_pairs = json.dumps(mapped_data, indent=4)
with open(output_json_path, 'w') as out_handle:
    out_handle.write(serialized_pairs)