In [1]:
import re
import numpy as np
import pandas as pd

In [2]:
reg_exp = "\\b[A-Za-z_][A-Za-z0-9_]*\s+[A-Za-z_][A-Za-z0-9_]*\s*\([^;{]*\)\s*{"

In [3]:
def extract_context(code):
    match = re.search(reg_exp, code)
    if match:
        return code[:match.end()]
    return ""

In [4]:
def build_context(df):
    print(f"Number of rows in the dataset: {len(df)}")
    # Finding all functions signature in the code
    df['functions'] = df['code'].apply(lambda x: re.findall(reg_exp, x))
    # Retaining only rows with number of functions = 2 and main is not the first function in the code
    df_first_func = df[df['functions'].apply(lambda x: len(x) == 2 and 'main' not in x[0])].copy()
    # extracting code_context as code until the first function signature which we want the model to complete along with driver code.
    df_first_func['code_context'] = df_first_func['code'].apply(lambda code: extract_context(code))

    print(f"Number of processed rows in the final dataset: {len(df_first_func)}")
    return df_first_func

In [5]:
train_df = pd.read_csv('../data/c++/train_formatted.csv')
train_df_context = build_context(train_df)
train_df_context.head()

Number of rows in the dataset: 9797
Number of processed rows in the final dataset: 6263


Unnamed: 0,id,text,code,question,functions,code_context
0,0,Maximum Prefix Sum possible by merging two giv...,#include <bits/stdc++.h>\nusing namespace std;...,Maximum Prefix Sum possible by merging two giv...,"[int maxPresum(vector<int> a, vector<int> b) {...",#include <bits/stdc++.h>\nusing namespace std;...
1,1,Check if a number can be represented as sum of...,#include <bits/stdc++.h>\nusing namespace std;...,Check if a number can be represented as sum of...,"[bool sumOfTwoCubes(int n) {, int main() {]",#include <bits/stdc++.h>\nusing namespace std;...
3,3,Nth natural number after removing all numbers ...,#include <bits/stdc++.h>\nusing namespace std;...,Nth natural number after removing all numbers ...,"[long findNthNumber(long long N) {, int main() {]",#include <bits/stdc++.h>\nusing namespace std;...
4,4,Check if an integer is rotation of another giv...,#include <bits/stdc++.h>\nusing namespace std;...,Check if an integer is rotation of another giv...,"[int check(int A, int B) {, int main() {]",#include <bits/stdc++.h>\nusing namespace std;...
5,5,Count of quadruples with product of a pair equ...,#include <bits/stdc++.h>\nusing namespace std;...,Count of quadruples with product of a pair equ...,"[void sameProductQuadruples(int nums[], int N)...",#include <bits/stdc++.h>\nusing namespace std;...


In [6]:
val_df = pd.read_csv('../data/c++/val_formatted.csv')
val_df_context = build_context(val_df)
val_df_context.head()

Number of rows in the dataset: 492
Number of processed rows in the final dataset: 345


Unnamed: 0,id,text,code,question,functions,code_context
0,0,Program to convert Centimeters to Pixels | C +...,#include <bits/stdc++.h>\nusing namespace std;...,Program to convert Centimeters to Pixels.,"[void Conversion(double centi) {, int main() {]",#include <bits/stdc++.h>\nusing namespace std;...
1,1,Kth array element after M replacements of arra...,#include <bits/stdc++.h>\nusing namespace std;...,Kth array element after M replacements of arra...,"[int xor_operations(int N, int arr[], int M, i...",#include <bits/stdc++.h>\nusing namespace std;...
2,2,Check if N can be divided into K consecutive e...,#include <bits/stdc++.h>\nusing namespace std;...,Check if N can be divided into K consecutive e...,"[void canBreakN(long long n) {, int main() {]",#include <bits/stdc++.h>\nusing namespace std;...
3,3,Coprime divisors of a number | C ++ program to...,#include <bits/stdc++.h>\nusing namespace std;...,Coprime divisors of a number.,"[void findCoprimePair(int N) {, int main() {]",#include <bits/stdc++.h>\nusing namespace std;...
6,6,Hexanacci Numbers | C ++ implementation to pri...,#include <bits/stdc++.h>\nusing namespace std;...,Hexanacci Numbers.,"[void printhexa(int n) {, int main() {]",#include <bits/stdc++.h>\nusing namespace std;...


In [7]:
test_df = pd.read_csv('../data/c++/test_formatted.csv')
test_df_context = build_context(test_df)
test_df_context.head()

Number of rows in the dataset: 909
Number of processed rows in the final dataset: 614


Unnamed: 0,id,text,code,question,functions,code_context
0,0,Minimum sum possible by removing all occurrenc...,#include <bits/stdc++.h>\nusing namespace std;...,Minimum sum possible by removing all occurrenc...,"[int minSum(int A[], int N) {, int main() {]",#include <bits/stdc++.h>\nusing namespace std;...
1,1,Maximum difference between a pair of adjacent ...,#include <bits/stdc++.h>\nusing namespace std;...,Maximum difference between a pair of adjacent ...,"[void maxAdjacent(int *arr, int N) {, int main...",#include <bits/stdc++.h>\nusing namespace std;...
6,6,Check if it is possible to split given Array i...,#include <bits/stdc++.h>\nusing namespace std;...,Check if it is possible to split given Array i...,"[bool checkArray(int n, int k, int arr[]) {, i...",#include <bits/stdc++.h>\nusing namespace std;...
7,7,Sum of division of the possible pairs for the ...,#include <bits/stdc++.h>\n#definelllonglong\nu...,Sum of division of the possible pairs for the ...,"[int func(int arr[], int n) {, int main() {]",#include <bits/stdc++.h>\n#definelllonglong\nu...
8,8,Count of elements to be inserted to make Array...,#include <bits/stdc++.h>\nusing namespace std;...,Count of elements to be inserted to make Array...,"[void insert_element(int a[], int n) {, int ma...",#include <bits/stdc++.h>\nusing namespace std;...


In [11]:
train_df_context = train_df_context.reset_index(drop=True)
val_df_context = val_df_context.reset_index(drop=True)
test_df_context = test_df_context.reset_index(drop=True)

In [17]:
print(train_df_context.loc[3]['code_context'])

#include <bits/stdc++.h>
using namespace std;
int check(int A, int B) {


In [18]:
train_df_context.to_csv('../data/c++/train_formattted_processed.csv', index=False)
val_df_context.to_csv('../data/c++/val_formattted_processed.csv', index=False)
test_df_context.to_csv('../data/c++/test_formattted_processed.csv', index=False)