In [2]:
import re
import numpy as np
import pandas as pd

In [39]:
reg_exp = "\\b[A-Za-z_][A-Za-z0-9_]*\s+[A-Za-z_][A-Za-z0-9_]*\s*\([^;{]*\)\s*{"

In [35]:
def extract_context(code):
    match = re.search(reg_exp, code)
    if match:
        return code[:match.end()]
    return ""

In [50]:
def build_context(df):
    print(f"Number of rows in the dataset: {len(df)}")
    # Clean code to remove any NEW_LINE strings
    df['code_cleaned'] = df['code'].str.replace('NEW_LINE', ' ')
    # Finding all functions signature in the code
    df['functions'] = df['code_cleaned'].apply(lambda x: re.findall(reg_exp, x))
    # Retaining only rows with number of functions = 2 and main is not the first function in the code
    df_first_func = df[df['functions'].apply(lambda x: len(x) == 2 and 'main' not in x[0])].copy()
    # extracting code_context as code until the first function signature which we want the model to complete along with driver code.
    df_first_func['code_context'] = df_first_func['code_cleaned'].apply(lambda code: extract_context(code))

    print(f"Number of processed rows in the final dataset: {len(df_first_func)}")
    return df_first_func

In [56]:
train_df = pd.read_csv('./data/train.csv')
train_df_context = build_context(train_df)
train_df_context.head()

Number of rows in the dataset: 9797
Number of processed rows in the final dataset: 6269


Unnamed: 0,id,text,code,question,code_cleaned,functions,code_context
0,0,Maximum Prefix Sum possible by merging two giv...,#include <bits/stdc++.h> NEW_LINE using namesp...,Maximum Prefix Sum possible by merging two giv...,#include <bits/stdc++.h> using namespace std...,"[int maxPresum ( vector < int > a , vector < i...",#include <bits/stdc++.h> using namespace std...
1,1,Check if a number can be represented as sum of...,#include <bits/stdc++.h> NEW_LINE using namesp...,Check if a number can be represented as sum of...,#include <bits/stdc++.h> using namespace std...,"[bool sumOfTwoCubes ( int n ) {, int main ( ) {]",#include <bits/stdc++.h> using namespace std...
3,3,Nth natural number after removing all numbers ...,#include <bits/stdc++.h> NEW_LINE using namesp...,Nth natural number after removing all numbers ...,#include <bits/stdc++.h> using namespace std...,"[long findNthNumber ( long long N ) {, int mai...",#include <bits/stdc++.h> using namespace std...
4,4,Check if an integer is rotation of another giv...,#include <bits/stdc++.h> NEW_LINE using namesp...,Check if an integer is rotation of another giv...,#include <bits/stdc++.h> using namespace std...,"[int check ( int A , int B ) {, int main ( ) {]",#include <bits/stdc++.h> using namespace std...
5,5,Count of quadruples with product of a pair equ...,#include <bits/stdc++.h> NEW_LINE using namesp...,Count of quadruples with product of a pair equ...,#include <bits/stdc++.h> using namespace std...,"[void sameProductQuadruples ( int nums [ ] , i...",#include <bits/stdc++.h> using namespace std...


In [57]:
val_df = pd.read_csv('./data/val.csv')
val_df_context = build_context(val_df)
val_df_context.head()

Number of rows in the dataset: 492
Number of processed rows in the final dataset: 345


Unnamed: 0,id,text,code,question,code_cleaned,functions,code_context
0,0,Program to convert Centimeters to Pixels | C +...,#include <bits/stdc++.h> NEW_LINE using namesp...,Program to convert Centimeters to Pixels.,#include <bits/stdc++.h> using namespace std...,"[void Conversion ( double centi ) {, int main ...",#include <bits/stdc++.h> using namespace std...
1,1,Kth array element after M replacements of arra...,#include <bits/stdc++.h> NEW_LINE using namesp...,Kth array element after M replacements of arra...,#include <bits/stdc++.h> using namespace std...,"[int xor_operations ( int N , int arr [ ] , in...",#include <bits/stdc++.h> using namespace std...
2,2,Check if N can be divided into K consecutive e...,#include <bits/stdc++.h> NEW_LINE using namesp...,Check if N can be divided into K consecutive e...,#include <bits/stdc++.h> using namespace std...,"[void canBreakN ( long long n ) {, int main ( ...",#include <bits/stdc++.h> using namespace std...
3,3,Coprime divisors of a number | C ++ program to...,#include <bits/stdc++.h> NEW_LINE using namesp...,Coprime divisors of a number.,#include <bits/stdc++.h> using namespace std...,"[void findCoprimePair ( int N ) {, int main ( ...",#include <bits/stdc++.h> using namespace std...
6,6,Hexanacci Numbers | C ++ implementation to pri...,#include <bits/stdc++.h> NEW_LINE using namesp...,Hexanacci Numbers.,#include <bits/stdc++.h> using namespace std...,"[void printhexa ( int n ) {, int main ( ) {]",#include <bits/stdc++.h> using namespace std...


In [58]:
test_df = pd.read_csv('./data/test.csv')
test_df_context = build_context(test_df)
test_df_context.head()

Number of rows in the dataset: 909
Number of processed rows in the final dataset: 613


Unnamed: 0,id,text,code,question,code_cleaned,functions,code_context
0,0,Minimum sum possible by removing all occurrenc...,#include <bits/stdc++.h> NEW_LINE using namesp...,Minimum sum possible by removing all occurrenc...,#include <bits/stdc++.h> using namespace std...,"[int minSum ( int A [ ] , int N ) {, int main ...",#include <bits/stdc++.h> using namespace std...
1,1,Maximum difference between a pair of adjacent ...,#include <bits/stdc++.h> NEW_LINE using namesp...,Maximum difference between a pair of adjacent ...,#include <bits/stdc++.h> using namespace std...,"[void maxAdjacent ( int * arr , int N ) {, int...",#include <bits/stdc++.h> using namespace std...
6,6,Check if it is possible to split given Array i...,#include <bits/stdc++.h> NEW_LINE using namesp...,Check if it is possible to split given Array i...,#include <bits/stdc++.h> using namespace std...,"[bool checkArray ( int n , int k , int arr [ ]...",#include <bits/stdc++.h> using namespace std...
7,7,Sum of division of the possible pairs for the ...,#include <bits/stdc++.h> NEW_LINE #define ll ...,Sum of division of the possible pairs for the ...,#include <bits/stdc++.h> #define ll long lo...,"[int func ( int arr [ ] , int n ) {, int main ...",#include <bits/stdc++.h> #define ll long lo...
8,8,Count of elements to be inserted to make Array...,#include <bits/stdc++.h> NEW_LINE using namesp...,Count of elements to be inserted to make Array...,#include <bits/stdc++.h> using namespace std...,"[void insert_element ( int a [ ] , int n ) {, ...",#include <bits/stdc++.h> using namespace std...


In [59]:
train_df_context.to_csv('data/train_processed.csv', index=False)
val_df_context.to_csv('data/val_processed.csv', index=False)
test_df_context.to_csv('data/test_processed.csv', index=False)