In [1]:
import re
import numpy as np
import pandas as pd

In [73]:
reg_exp = "def\s+[\w]+\s*\(.*?\):"

In [74]:
def extract_context(code):
    match = re.search(reg_exp, code)
    if match:
        return code[:match.end()]
    return ""

In [75]:
re.findall(reg_exp, 'def maxPresum(a, b):\n    X = max(a[0], 0)\n    for i in range(1, len(a)):\n        a[i] += a[i - 1]\n        X = max(X, a[i])\n    Y = max(b[0], 0)\n    for i in range(1, len(b)):\n        b[i] += b[i - 1]\n        Y = max(Y, b[i])\n    return X + Y\n\n\nA = [2, -1, 4, -5]\nB = [4, -3, 12, 4, -3]\nprint(maxPresum(A, B))\n')

['def maxPresum(a, b):']

In [78]:
def build_context(df):
    print(f"Number of rows in the dataset: {len(df)}")
    # Finding all functions signature in the code
    df['functions'] = df['code'].apply(lambda x: re.findall(reg_exp, x))
    # Retaining only rows with number of functions = 2 and main is not the first function in the code
    df_first_func = df[df['functions'].apply(lambda x: len(x) >= 1 and 'main' not in x[0])].copy()
    # extracting code_context as code until the first function signature which we want the model to complete along with driver code.
    df_first_func['code_context'] = df_first_func['code'].apply(lambda code: extract_context(code))

    print(f"Number of processed rows in the final dataset: {len(df_first_func)}")
    return df_first_func

In [79]:
train_df = pd.read_csv('./data/python/train.csv')
train_df_context = build_context(train_df)
train_df_context.head()

Number of rows in the dataset: 9262
Number of processed rows in the final dataset: 9136


Unnamed: 0,id,text,code,question,functions,code_context
0,0,Maximum Prefix Sum possible by merging two giv...,"def maxPresum(a, b):\n X = max(a[0], 0)\n ...",Maximum Prefix Sum possible by merging two giv...,"[def maxPresum(a, b):]","def maxPresum(a, b):"
1,1,Check if a number can be represented as sum of...,import math\n\n\ndef sumOfTwoCubes(n):\n lo...,Check if a number can be represented as sum of...,[def sumOfTwoCubes(n):],import math\n\n\ndef sumOfTwoCubes(n):
2,2,Generate an N | Python3 program for the above ...,sieve = [1] * (1000000 + 1)\n\n\ndef sieveOfPr...,Generate an N.,"[def sieveOfPrimes():, def getArray(arr, N):]",sieve = [1] * (1000000 + 1)\n\n\ndef sieveOfPr...
3,3,Nth natural number after removing all numbers ...,def findNthNumber(N):\n result = 0\n p =...,Nth natural number after removing all numbers ...,[def findNthNumber(N):],def findNthNumber(N):
4,4,Check if an integer is rotation of another giv...,"import math\n\n\ndef check(A, B):\n if (A =...",Check if an integer is rotation of another giv...,"[def check(A, B):]","import math\n\n\ndef check(A, B):"


In [80]:
train_df.head()

Unnamed: 0,id,text,code,question,functions
0,0,Maximum Prefix Sum possible by merging two giv...,"def maxPresum(a, b):\n X = max(a[0], 0)\n ...",Maximum Prefix Sum possible by merging two giv...,"[def maxPresum(a, b):]"
1,1,Check if a number can be represented as sum of...,import math\n\n\ndef sumOfTwoCubes(n):\n lo...,Check if a number can be represented as sum of...,[def sumOfTwoCubes(n):]
2,2,Generate an N | Python3 program for the above ...,sieve = [1] * (1000000 + 1)\n\n\ndef sieveOfPr...,Generate an N.,"[def sieveOfPrimes():, def getArray(arr, N):]"
3,3,Nth natural number after removing all numbers ...,def findNthNumber(N):\n result = 0\n p =...,Nth natural number after removing all numbers ...,[def findNthNumber(N):]
4,4,Check if an integer is rotation of another giv...,"import math\n\n\ndef check(A, B):\n if (A =...",Check if an integer is rotation of another giv...,"[def check(A, B):]"


In [81]:
train_df.loc[0]['code']

'def maxPresum(a, b):\n    X = max(a[0], 0)\n    for i in range(1, len(a)):\n        a[i] += a[i - 1]\n        X = max(X, a[i])\n    Y = max(b[0], 0)\n    for i in range(1, len(b)):\n        b[i] += b[i - 1]\n        Y = max(Y, b[i])\n    return X + Y\n\n\nA = [2, -1, 4, -5]\nB = [4, -3, 12, 4, -3]\nprint(maxPresum(A, B))\n'

In [82]:
val_df = pd.read_csv('./data/val.csv')
val_df_context = build_context(val_df)
val_df_context.head()

Number of rows in the dataset: 492
Number of processed rows in the final dataset: 0


Unnamed: 0,id,text,code,question,functions,code_context


In [83]:
test_df = pd.read_csv('./data/test.csv')
test_df_context = build_context(test_df)
test_df_context.head()

Number of rows in the dataset: 909
Number of processed rows in the final dataset: 0


Unnamed: 0,id,text,code,question,functions,code_context


In [84]:
train_df_context.to_csv('data/python/train_processed.csv', index=False)
val_df_context.to_csv('data/python/val_processed.csv', index=False)
test_df_context.to_csv('data/python/test_processed.csv', index=False)