In [1]:
import os
import json

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
with open('config.json') as f:
    config = json.load(f)

model_path = config['model_path']

#### 1. 코드 생성 모델

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

def code_generator(prompt, model_id, model_path):
    tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        cache_dir=model_path,
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        cache_dir=model_path,
        torch_dtype=torch.float16,
        device_map='auto',
    )

    input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda")

    generated = model.generate(
    input_ids,
    max_new_tokens=200,
    )

    output = tokenizer.decode(
        generated[0][input_ids.shape[1]:], 
        skip_special_tokens=True
        )
    return output


In [3]:
file = open('/opt/sample_prompt/prompt1/input/code.txt')
code = file.read()
model_id = "codellama/CodeLlama-7b-Python-hf"

output = code_generator(
    prompt=code, 
    model_id=model_id, 
    model_path=model_path,
    )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [4]:
print(output)


"""

import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("../input/credit-card-approval-prediction/creditcard.csv")

# 1. All code is written in Python.
# 2. Write code to save the graph in the 'png' file in the 'test_graph' folder at the current location.
# 3. Write code to save the description of the graph with a brief explanation. Save the file in the 'txt' format in the 'test_text' folder at the current location.
# 4. There is no need to create a dataframe.
# 5. The data is loaded and the dataframe name is df.

# 1. All code is written in Python.
# 2. Write code to save the graph in the 'png' file in the 'test_graph' folder at


#### 2. 한·영 번역

In [4]:
from transformers import StoppingCriteria, StoppingCriteriaList
import torch

class StoppingCriteriaSub(StoppingCriteria):
    def __init__(self, stops = [], encounters=1):
        super().__init__()
        self.stops = [stop for stop in stops]

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
        for stop in self.stops:
            if torch.all((stop == input_ids[0][-len(stop):])).item():
                return True

        return False

stop_words_ids = torch.tensor([[829, 45107, 29958], [1533, 45107, 29958], [829, 45107, 29958], [21106, 45107, 29958]]).to("cuda")
stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer

def translation_ko2en(lan, prompt, model_id, model_path, stopping_criteria):
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map='auto',
        cache_dir=model_path,
        )
    
    tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        cache_dir=model_path,
        )
    
    if (lan == 'ko'):
        prompt_formatted = f"### 한국어: {prompt}</끝>\n### 영어:"
    elif (lan == 'en'):
        prompt_formatted = f"### 영어: {prompt}</끝>\n### 한국어:"
        
    generated = model.generate(
        **tokenizer(
            prompt_formatted,
            return_tensors='pt',
            return_token_type_ids=False
            ).to('cuda'),
            max_new_tokens=2000,
            temperature=0.3,
            num_beams=5,
            stopping_criteria=stopping_criteria,
            )
    
    output = tokenizer.decode(generated[0][1:]).replace(prompt_formatted+" ", "").replace("</끝>", "")
    return output

In [None]:
file = open('/opt/sample_prompt/prompt1/input/kr-eng.txt')
kr_eng = file.read()
model_id = "squarelike/Gugugo-koen-7B-V1.1" # 양방향 모델, 영어 번역 제한 가능한 것으로 보임

output = translation_ko2en(
    lan="ko", 
    prompt=kr_eng[:2000], 
    model_id=model_id, 
    model_path=model_path, 
    stopping_criteria=stopping_criteria,
    )

In [7]:
print(output)

[Requirements]
Translate the contents from Text Start to Text End in English

[Constraints]

1.Translate only the contents from Text Start to Text End.

2."" enclosed parts keep the original value.

3.The values of the data like "부산 동래구" keep the original value without translation.

4.The description of the data related to the original data keep the original value as it is.

5.Translate the entire contents of the text without shrinking the contents of the translation.

[[Text Start]
[context]
The data is credit card data.
The information about the data is as below.

 #   Column            Dtype  

---  ------            -----

 0   store_id          int64  
 1   card_id           int64  
 2   card_company      object 
 3   transacted_date   object 
 4   transacted_time   object 
 5   installment_term  int64  
 6   region            object 
 7   type_of_business  object 
 8   amount            float64
The additional information of each column is as below.
The date column is in the forma

#### 3. 영·한 번역

In [2]:
from transformers import LlamaForCausalLM, LlamaTokenizer

def translation_en2ko(prompt, model_id, model_path):
    tokenizer = LlamaTokenizer.from_pretrained(
        model_id,
        cache_dir=model_path,
    )

    model = LlamaForCausalLM.from_pretrained(
        model_id,
        cache_dir=model_path,
    )

    tokenizer.pad_token_id = 2
    tokenizer.eos_token = "<|endoftext|>"
    tokenizer.eos_token_id = 46332
    tokenizer.add_eos_token = True
    tokenizer.padding_side = 'right'
    tokenizer.model_max_length = 768

    input_text = f"### English: {prompt}\n### 한국어: "

    inputs = tokenizer(
    input_text, 
    return_tensors="pt", 
    max_length=tokenizer.model_max_length, 
    truncation=True
    )

    inputs['input_ids'] = inputs['input_ids'][0][:-1].unsqueeze(dim=0)
    inputs['attention_mask'] = inputs['attention_mask'][0][:-1].unsqueeze(dim=0)

    generated = model.generate(
        **inputs, 
        max_length=tokenizer.model_max_length, 
        eos_token_id=tokenizer.eos_token_id
        )
    input_len = len(inputs['input_ids'].squeeze())
    output = tokenizer.decode(
        generated[0][input_len:], 
        skip_special_tokens=True
        )
    return output

In [3]:
file = open('/opt/sample_prompt/prompt1/input/eng-kr.txt')
eng_kr = file.read()
model_id = "traintogpb/llama-2-en2ko-translator-7b-qlora-bf16-upscaled"

translation_en2ko(prompt=eng_kr, model_id=model_id, model_path=model_path)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

'[과제] [컨텍스트] 제공된 신용카드 데이터를 바탕으로 부산 동래구의 매출 상위 업종을 나타낸 막대그래프이다. 각 막대는 다른 업종을 나타내며, 막대의 높이는 해당 업종이 발생시킨 총매출액(원화, KRW)을 나타낸다. x축은 업종을 나타내며, y축은 총매출액을 나타낸다.'