In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '2'


from transformers import AutoTokenizer
import transformers
import torch

model = "codellama/CodeLlama-7b-Instruct-hf"

tokenizer = AutoTokenizer.from_pretrained(model)
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.float16,
    device_map="auto",
)




  from .autonotebook import tqdm as notebook_tqdm
Downloading shards: 100%|█████████████████████████| 2/2 [02:12<00:00, 66.31s/it]
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:17<00:00,  8.96s/it]


In [2]:
input_text = """
### System Prompt
You are an intelligent programming assistant.

### User Message
Implement a linked list in C++

### Assistant
"""

# 입력 텍스트의 토큰 수 계산
input_tokens = tokenizer.encode(input_text, return_tensors="pt")
input_length = len(input_tokens[0])

# 원하는 출력 토큰 수
desired_output_length = 10

# 전체 max_length 설정 (입력 + 출력)
total_max_length = input_length + desired_output_length

print("pipeline start!!!")

# 파이프라인 설정 (예시에서는 나머지 매개변수를 유지)
sequences = pipeline(
    input_text,
    do_sample=True,
    top_k=3,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    max_length=total_max_length,  # 수정된 부분
)

# 결과 출력
for seq in sequences:
    # 모델 출력에서 입력 텍스트 제거
    result_text = seq['generated_text'][len(input_text):]
    print(f"Result: {result_text}")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


pipeline start!!!
Result: I have implemented a linked list in C++.



In [3]:
import pandas as pd
import re
from tqdm import tqdm

def remove_comments(cpp_code):
        # 멀티라인 주석 제거
        code = re.sub(r'/\*.*?\*/', '', cpp_code, flags=re.DOTALL)
        # 단일 라인 주석 제거
        code = re.sub(r'//.*', '', code)
        
        # 문자열 내용 제거 (" " 안의 내용과 ' ' 안의 내용)
        code = re.sub(r'"(.*?)"', '""', code)
        code = re.sub(r"'(.*?)'", "''", code)
        # 빈 줄 제거
        code = re.sub(r'\n\s*\n', '\n', code)
        # 불필요한 공백 및 탭 변환 (연속된 공백을 하나의 공백으로)
        code = re.sub(r'\s+', ' ', code)
        # 문자열 앞뒤 공백 제거
        cleaned_code = code.strip()
        
        return cleaned_code

In [4]:
sample_data = pd.read_csv("./bigdata/sample_train.csv")
sample_data.head()

Unnamed: 0,code1_path,code2_path,code1,code2,similar
0,./train_code/problem393/problem393_19.cpp,./train_code/problem033/problem033_439.cpp,#include <bits/stdc++.h>\n\nusing namespace st...,#include <algorithm>\n#include <bitset>\n#incl...,0
1,./train_code/problem019/problem019_210.cpp,./train_code/problem019/problem019_63.cpp,#include <iostream>\n\nusing namespace std;\n\...,#include <iostream>\n#include <string>\nusing ...,1
2,./train_code/problem107/problem107_486.cpp,./train_code/problem107/problem107_340.cpp,#include <iostream>\n#include <vector>\nusing ...,#include <cstdio>\n#include <cstdlib>\n#includ...,1
3,./train_code/problem187/problem187_257.cpp,./train_code/problem403/problem403_135.cpp,#include <bits/stdc++.h>\n#include <unordered_...,#include <bits/stdc++.h>\nusing namespace std;...,0
4,./train_code/problem173/problem173_490.cpp,./train_code/problem173/problem173_345.cpp,#include <bits/stdc++.h>\ntypedef long long ll...,"#include ""bits/stdc++.h""\n#define rep(i,n) for...",1


In [5]:
code1_example_1 = remove_comments(sample_data.loc[1, 'code1'])

In [6]:
code2_example_1 = remove_comments(sample_data.loc[1, 'code2'])

In [7]:
code1_example_2 = remove_comments(sample_data.loc[0, 'code1'])

In [8]:
code2_example_2 = remove_comments(sample_data.loc[0, 'code2'])

In [9]:
code1_example_3 = remove_comments(sample_data.loc[2, 'code1'])

In [10]:
code2_example_3 = remove_comments(sample_data.loc[2, 'code2'])

In [12]:
TODAY = "2024_03_08"

In [None]:
with open(f'./bigdata/llama2/{TODAY}_b4bert_result_part3.txt', 'w') as file:
    pass


# tokenizer와 model은 미리 정의되어 있어야 합니다.
# device는 'cuda' 또는 'cpu'일 수 있습니다.

def predict(model, tokenizer, test_data):
    predictions = []
    
    
    for index, row in tqdm(test_data.iterrows(), total=test_data.shape[0]):
            
        # inputs1 = tokenizer(row['code1'], return_tensors='pt', max_length=512, padding='max_length', truncation=True).to(device)
        # inputs2 = tokenizer(row['code2'], return_tensors='pt', max_length=512, padding='max_length', truncation=True).to(device)
        inputtexts = f"""
<s>[INST] <<SYS>>\\nYou are an intelligent programming assistant capable of understanding and analyzing C++ code. Your task is to determine if two given pieces of code solve the same problem. Respond with "Yes" if they solve the same problem and "No" otherwise.\\n<</SYS>>\\n\\nFirst code: \n{remove_comments(row['code1'])}
Second code: \n{remove_comments(row['code2'])}\nDo these codes solve the same problem? Say yes or no.[/INST]
"""            
        
        # 입력 텍스트의 토큰 수 계산
        input_tokens = tokenizer.encode(inputtexts, return_tensors="pt")
        input_length = len(input_tokens[0])

        # 원하는 출력 토큰 수
        desired_output_length = 5

        # 전체 max_length 설정 (입력 + 출력)
        total_max_length = input_length + desired_output_length

        # 파이프라인 설정 (예시에서는 나머지 매개변수를 유지)
        sequences = pipeline(
            inputtexts,
            do_sample=False,
            #top_k=1,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
            max_length=total_max_length,  # 수정된 부분
        )
        for seq in sequences:
            # 모델 출력에서 입력 텍스트 제거
            result_text = seq['generated_text'][len(inputtexts):]
            #result_text = seq['generated_text']
            predictions.append(result_text)
            with open(f'./bigdata/llama2/{TODAY}_b4bert_result_part3.txt', 'a') as file:
                file.write(f'{index} : {result_text}\n')
    return predictions

# 예제 사용
test_data = pd.read_csv("./bigdata/test_part3.csv")
# 모델과 tokenizer가 정의되어 있어야 합니다.
predictions = predict(model, tokenizer, test_data)

# 결과를 제출 파일로 저장
submission = pd.read_csv('./bigdata/sample_submission_part3.csv')
submission['similar'] = predictions
submission.to_csv(f'./bigdata/llama2/{TODAY}_predictions_b4bert_submit_part3.csv', index=False)

  0%|                                  | 79/595000 [14:50<1869:29:22, 11.31s/it]

In [None]:
f"""
### System Prompt
You are an intelligent programming assistant capable of understanding and analyzing C++ code. Your task is to determine if two given pieces of code solve the same problem. Respond with "Yes" if they solve the same problem and "No" otherwise.

### Example 1
First code: {code1_example_1}
Second code: {code2_example_1}

Do these codes solve the same problem?
Assistant: Yes

### Example 2
First code: {code1_example_2}
Second code: {code2_example_2}

Do these codes solve the same problem?
Assistant: No

### Example 3
First code: {code1_example_3}
Second code: {code2_example_3}

Do these codes solve the same problem?
Assistant: Yes

### User Message
First code: {remove_comments(row['code1'])}
Second code: {remove_comments(row['code2'])}

Do these codes solve the same problem?
### Assistant
        """