In [1]:
import re
import numpy as np
import pandas as pd

In [2]:
reg_exp = "\\b[A-Za-z_][A-Za-z0-9_]*\s+[A-Za-z_][A-Za-z0-9_]*\s*\([^;{]*\)\s*{"

In [3]:
def extract_context(code):
    match = re.search(reg_exp, code)
    if match:
        return code[: match.end()]
    return ""

In [4]:
def build_context(df):
    print(f"Number of rows in the dataset: {len(df)}")
    # Clean code to remove any NEW_LINE strings
    df["code_cleaned"] = df["code"].str.replace("NEW_LINE", "\n")
    # Finding all functions signature in the code
    df["functions"] = df["code_cleaned"].apply(lambda x: re.findall(reg_exp, x))
    # Retaining only rows with number of functions = 2 and main is not the first function in the code
    df_first_func = df[df["functions"].apply(lambda x: len(x) == 2 and "main" not in x[0])].copy()
    # extracting code_context as code until the first function signature which we want the model to complete along with driver code.
    df_first_func["code_context"] = df_first_func["code_cleaned"].apply(
        lambda code: extract_context(code)
    )

    print(f"Number of processed rows in the final dataset: {len(df_first_func)}")
    return df_first_func

In [6]:
train_df = pd.read_csv("./data/c++/train.csv")
train_df["code_cleaned"] = train_df["code"].str.replace("NEW_LINE", "\n")
train_df

Unnamed: 0,id,text,code,question,code_cleaned
0,0,Maximum Prefix Sum possible by merging two giv...,#include <bits/stdc++.h> NEW_LINE using namesp...,Maximum Prefix Sum possible by merging two giv...,#include <bits/stdc++.h> \n using namespace st...
1,1,Check if a number can be represented as sum of...,#include <bits/stdc++.h> NEW_LINE using namesp...,Check if a number can be represented as sum of...,#include <bits/stdc++.h> \n using namespace st...
2,2,Generate an N | C ++ program for the above app...,#include <bits/stdc++.h> NEW_LINE using namesp...,Generate an N.,#include <bits/stdc++.h> \n using namespace st...
3,3,Nth natural number after removing all numbers ...,#include <bits/stdc++.h> NEW_LINE using namesp...,Nth natural number after removing all numbers ...,#include <bits/stdc++.h> \n using namespace st...
4,4,Check if an integer is rotation of another giv...,#include <bits/stdc++.h> NEW_LINE using namesp...,Check if an integer is rotation of another giv...,#include <bits/stdc++.h> \n using namespace st...
...,...,...,...,...,...
9792,9792,Count of ways to traverse a Matrix and return ...,#include <bits/stdc++.h> NEW_LINE using namesp...,Count of ways to traverse a Matrix and return ...,#include <bits/stdc++.h> \n using namespace st...
9793,9793,Count of subsequences of length atmost K conta...,#include <bits/stdc++.h> NEW_LINE using namesp...,Count of subsequences of length atmost K conta...,#include <bits/stdc++.h> \n using namespace st...
9794,9794,Minimize prize count required such that smalle...,#include <bits/stdc++.h> NEW_LINE using namesp...,Minimize prize count required such that smalle...,#include <bits/stdc++.h> \n using namespace st...
9795,9795,Number of ways to split N as sum of K numbers ...,#include <bits/stdc++.h> NEW_LINE using namesp...,Number of ways to split N as sum of K numbers ...,#include <bits/stdc++.h> \n using namespace st...


In [14]:
for i in train_df["code_cleaned"][:10].tolist():
    print(i)

#include <bits/stdc++.h> 
 using namespace std ; int maxPresum ( vector < int > a , vector < int > b ) { int X = max ( a [ 0 ] , 0 ) ; for ( int i = 1 ; i < a . size ( ) ; i ++ ) { a [ i ] += a [ i - 1 ] ; X = max ( X , a [ i ] ) ; } int Y = max ( b [ 0 ] , 0 ) ; for ( int i = 1 ; i < b . size ( ) ; i ++ ) { b [ i ] += b [ i - 1 ] ; Y = max ( Y , b [ i ] ) ; } return X + Y ; } int main ( ) { vector < int > A = { 2 , -1 , 4 , -5 } ; vector < int > B = { 4 , -3 , 12 , 4 , -3 } ; cout << maxPresum ( A , B ) << endl ; }
#include <bits/stdc++.h> 
 using namespace std ; bool sumOfTwoCubes ( int n ) { long long int lo = 1 , hi = ( long long int ) cbrt ( n ) ; while ( lo <= hi ) { long long int curr = ( lo * lo * lo + hi * hi * hi ) ; if ( curr == n ) return true ; if ( curr < n ) lo ++ ; else hi -- ; } return false ; } int main ( ) { int N = 28 ; if ( sumOfTwoCubes ( N ) ) { cout << " True " ; } else { cout << " False " ; } return 0 ; }
#include <bits/stdc++.h> 
 using namespace std ; int sie

In [9]:
train_df = pd.read_csv("./data/c++/train.csv")
train_df_context = build_context(train_df)
train_df_context.head()

Number of rows in the dataset: 9797
Number of processed rows in the final dataset: 6269


Unnamed: 0,id,text,code,question,code_cleaned,functions,code_context
0,0,Maximum Prefix Sum possible by merging two giv...,#include <bits/stdc++.h> NEW_LINE using namesp...,Maximum Prefix Sum possible by merging two giv...,#include <bits/stdc++.h> \n using namespace st...,"[int maxPresum ( vector < int > a , vector < i...",#include <bits/stdc++.h> \n using namespace st...
1,1,Check if a number can be represented as sum of...,#include <bits/stdc++.h> NEW_LINE using namesp...,Check if a number can be represented as sum of...,#include <bits/stdc++.h> \n using namespace st...,"[bool sumOfTwoCubes ( int n ) {, int main ( ) {]",#include <bits/stdc++.h> \n using namespace st...
3,3,Nth natural number after removing all numbers ...,#include <bits/stdc++.h> NEW_LINE using namesp...,Nth natural number after removing all numbers ...,#include <bits/stdc++.h> \n using namespace st...,"[long findNthNumber ( long long N ) {, int mai...",#include <bits/stdc++.h> \n using namespace st...
4,4,Check if an integer is rotation of another giv...,#include <bits/stdc++.h> NEW_LINE using namesp...,Check if an integer is rotation of another giv...,#include <bits/stdc++.h> \n using namespace st...,"[int check ( int A , int B ) {, int main ( ) {]",#include <bits/stdc++.h> \n using namespace st...
5,5,Count of quadruples with product of a pair equ...,#include <bits/stdc++.h> NEW_LINE using namesp...,Count of quadruples with product of a pair equ...,#include <bits/stdc++.h> \n using namespace st...,"[void sameProductQuadruples ( int nums [ ] , i...",#include <bits/stdc++.h> \n using namespace st...


In [11]:
train_df_context["code_context"][0]

'#include <bits/stdc++.h> \n using namespace std ; int maxPresum ( vector < int > a , vector < int > b ) {'

In [6]:
val_df = pd.read_csv("./data/c++/val.csv")
val_df_context = build_context(val_df)
val_df_context.head()

Number of rows in the dataset: 492
Number of processed rows in the final dataset: 345


Unnamed: 0,id,text,code,question,code_cleaned,functions,code_context
0,0,Program to convert Centimeters to Pixels | C +...,#include <bits/stdc++.h> NEW_LINE using namesp...,Program to convert Centimeters to Pixels.,#include <bits/stdc++.h> \n using namespace st...,"[void Conversion ( double centi ) {, int main ...",#include <bits/stdc++.h> \n using namespace st...
1,1,Kth array element after M replacements of arra...,#include <bits/stdc++.h> NEW_LINE using namesp...,Kth array element after M replacements of arra...,#include <bits/stdc++.h> \n using namespace st...,"[int xor_operations ( int N , int arr [ ] , in...",#include <bits/stdc++.h> \n using namespace st...
2,2,Check if N can be divided into K consecutive e...,#include <bits/stdc++.h> NEW_LINE using namesp...,Check if N can be divided into K consecutive e...,#include <bits/stdc++.h> \n using namespace st...,"[void canBreakN ( long long n ) {, int main ( ...",#include <bits/stdc++.h> \n using namespace st...
3,3,Coprime divisors of a number | C ++ program to...,#include <bits/stdc++.h> NEW_LINE using namesp...,Coprime divisors of a number.,#include <bits/stdc++.h> \n using namespace st...,"[void findCoprimePair ( int N ) {, int main ( ...",#include <bits/stdc++.h> \n using namespace st...
6,6,Hexanacci Numbers | C ++ implementation to pri...,#include <bits/stdc++.h> NEW_LINE using namesp...,Hexanacci Numbers.,#include <bits/stdc++.h> \n using namespace st...,"[void printhexa ( int n ) {, int main ( ) {]",#include <bits/stdc++.h> \n using namespace st...


In [7]:
test_df = pd.read_csv("./data/c++/test.csv")
test_df_context = build_context(test_df)
test_df_context.head()

Number of rows in the dataset: 909
Number of processed rows in the final dataset: 613


Unnamed: 0,id,text,code,question,code_cleaned,functions,code_context
0,0,Minimum sum possible by removing all occurrenc...,#include <bits/stdc++.h> NEW_LINE using namesp...,Minimum sum possible by removing all occurrenc...,#include <bits/stdc++.h> \n using namespace st...,"[int minSum ( int A [ ] , int N ) {, int main ...",#include <bits/stdc++.h> \n using namespace st...
1,1,Maximum difference between a pair of adjacent ...,#include <bits/stdc++.h> NEW_LINE using namesp...,Maximum difference between a pair of adjacent ...,#include <bits/stdc++.h> \n using namespace st...,"[void maxAdjacent ( int * arr , int N ) {, int...",#include <bits/stdc++.h> \n using namespace st...
6,6,Check if it is possible to split given Array i...,#include <bits/stdc++.h> NEW_LINE using namesp...,Check if it is possible to split given Array i...,#include <bits/stdc++.h> \n using namespace st...,"[bool checkArray ( int n , int k , int arr [ ]...",#include <bits/stdc++.h> \n using namespace st...
7,7,Sum of division of the possible pairs for the ...,#include <bits/stdc++.h> NEW_LINE #define ll ...,Sum of division of the possible pairs for the ...,#include <bits/stdc++.h> \n #define ll long l...,"[int func ( int arr [ ] , int n ) {, int main ...",#include <bits/stdc++.h> \n #define ll long l...
8,8,Count of elements to be inserted to make Array...,#include <bits/stdc++.h> NEW_LINE using namesp...,Count of elements to be inserted to make Array...,#include <bits/stdc++.h> \n using namespace st...,"[void insert_element ( int a [ ] , int n ) {, ...",#include <bits/stdc++.h> \n using namespace st...


In [11]:
def format_cpp_code(minified_code):
    # Replace semicolons with semicolon + newline
    formatted_code = re.sub(r";", ";\n", minified_code)
    # Replace opening braces with brace + newline
    formatted_code = re.sub(r"{", "{\n", formatted_code)
    return formatted_code

In [12]:
train_df_context["code_context"] = train_df_context["code_context"].apply(
    lambda code: format_cpp_code(code)
)
test_df_context["code_context"] = test_df_context["code_context"].apply(
    lambda code: format_cpp_code(code)
)
val_df_context["code_context"] = val_df_context["code_context"].apply(
    lambda code: format_cpp_code(code)
)

In [13]:
train_df_context.head()

Unnamed: 0,id,text,code,question,code_cleaned,functions,code_context
0,0,Maximum Prefix Sum possible by merging two giv...,#include <bits/stdc++.h> NEW_LINE using namesp...,Maximum Prefix Sum possible by merging two giv...,#include <bits/stdc++.h> \n using namespace st...,"[int maxPresum ( vector < int > a , vector < i...",#include <bits/stdc++.h> \n using namespace st...
1,1,Check if a number can be represented as sum of...,#include <bits/stdc++.h> NEW_LINE using namesp...,Check if a number can be represented as sum of...,#include <bits/stdc++.h> \n using namespace st...,"[bool sumOfTwoCubes ( int n ) {, int main ( ) {]",#include <bits/stdc++.h> \n using namespace st...
3,3,Nth natural number after removing all numbers ...,#include <bits/stdc++.h> NEW_LINE using namesp...,Nth natural number after removing all numbers ...,#include <bits/stdc++.h> \n using namespace st...,"[long findNthNumber ( long long N ) {, int mai...",#include <bits/stdc++.h> \n using namespace st...
4,4,Check if an integer is rotation of another giv...,#include <bits/stdc++.h> NEW_LINE using namesp...,Check if an integer is rotation of another giv...,#include <bits/stdc++.h> \n using namespace st...,"[int check ( int A , int B ) {, int main ( ) {]",#include <bits/stdc++.h> \n using namespace st...
5,5,Count of quadruples with product of a pair equ...,#include <bits/stdc++.h> NEW_LINE using namesp...,Count of quadruples with product of a pair equ...,#include <bits/stdc++.h> \n using namespace st...,"[void sameProductQuadruples ( int nums [ ] , i...",#include <bits/stdc++.h> \n using namespace st...


In [14]:
train_df_context.loc[0]["code_context"]

'#include <bits/stdc++.h> \n using namespace std ;\n int maxPresum ( vector < int > a , vector < int > b ) {\n'

In [15]:
train_df_context.to_csv("data/c++/train_processed.csv", index=False)
val_df_context.to_csv("data/c++/val_processed.csv", index=False)
test_df_context.to_csv("data/c++/test_processed.csv", index=False)