In [1]:
!pip install transformers
!pip install numpy pandas
!pip install sentencepiece
!pip install json
!pip install -U bitsandbytes
!pip install -U accelerate


Collecting transformers
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
Collecting huggingface-hub<1.0,>=0.24.0 (from transformers)
  Downloading huggingface_hub-0.26.5-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading transformers-4.47.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m203.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.26.5-py3-none-any.whl (447 kB)
Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tokenization_bitnet import BitnetTokenizer
from modeling_bitnet import BitnetForCausalLM
import pandas as pd
import json

In [4]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [14]:
# Hub identifier shared by the model weights and the tokenizer.
model_name = "1bitLLM/bitnet_b1_58-3B"

# Load the checkpoint in fp16 with device_map="auto"; the flash-attention
# option stays disabled, as in the original experiment:
#     attn_implementation="flash_attention_2",
model = BitnetForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto",
)
# Explicit half-precision cast, kept from the original cell.
model = model.half()

tokenizer = BitnetTokenizer.from_pretrained(model_name, use_fast=False)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# model = model.to(device)

In [10]:

# # Bits and Bytes 설정
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,  # 4-bit Quantization
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",  # Normalized Float 4 (NF4)
#     bnb_4bit_compute_dtype=torch.float16  # Use FP16 for computations
# )

# # 모델 로드 (초기화된 상태에서 로드)
# model_name = "meta-llama/Llama-3.2-1B-Instruct"
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
#     device_map="auto"
# ).to(device)

In [6]:
# Path to the raw (uncleaned) CSV file.
file_path = "./dirty.csv"

# Load the CSV into a DataFrame and preview its first rows.
data = pd.read_csv(file_path)
print("Loaded Data:", data.head(), sep="\n")

Loaded Data:
   i_w_blo_weg  o_w_blo_power  o_w_blo_voltage  i_w_bhl_weg  o_w_bhl_power  \
0       -107.0            0.0              0.0          0.0            0.0   
1       -107.0            0.0              0.0          0.0            NaN   
2       -107.0            0.0              0.0          0.0            0.0   
3       -107.0            0.0              0.0          0.0            0.0   
4       -107.0            0.0              0.0          0.0            0.0   

   o_w_bhl_voltage  i_w_bhr_weg  o_w_bhr_power  o_w_bhr_voltage  i_w_bru_weg  \
0              0.0      -1268.0            0.0              0.0        -26.0   
1              0.0      -1268.0            0.0              0.0        -26.0   
2              0.0      -1268.0            0.0              0.0        -26.0   
3              0.0      -1268.0            0.0              0.0         29.0   
4              0.0      -1268.0            0.0              0.0         29.0   

   o_w_bru_power  o_w_bru_voltage  i_

In [30]:
def _build_cleaning_prompt(rows):
    """Render the few-shot cleaning prompt for one batch of rows."""
    # One dirty/clean pair used as the few-shot demonstration.
    example_dirty = {
        "i_w_blo_weg": -9999.0,
        "o_w_blo_power": "nan",
        "o_w_blo_voltage": -10.0,
        "i_w_bhl_weg": 0.0,
        "o_w_bhl_power": 0.0,
        "o_w_bhl_voltage": 0.0,
        "i_w_bhr_weg": -5000.0,
        "o_w_bhr_power": 0.0,
        "o_w_bhr_voltage": 0.0,
        "i_w_bru_weg": -50.0,
        "o_w_bru_power": 200.0,
        "o_w_bru_voltage": 20.0,
        "i_w_hr_weg": 0.0,
        "o_w_hr_power": 9000.0,
        "o_w_hr_voltage": 50.0,
        "i_w_hl_weg": 0.0,
        "o_w_hl_power": None,
        "o_w_hl_voltage": 24.0,
        "labels": 1.0
    }
    example_clean = {
        "i_w_blo_weg": -107.0,
        "o_w_blo_power": 0.0,
        "o_w_blo_voltage": 0.0,
        "i_w_bhl_weg": 0.0,
        "o_w_bhl_power": 0.0,
        "o_w_bhl_voltage": 0.0,
        "i_w_bhr_weg": -1268.0,
        "o_w_bhr_power": 0.0,
        "o_w_bhr_voltage": 0.0,
        "i_w_bru_weg": -26.0,
        "o_w_bru_power": 100.0,
        "o_w_bru_voltage": 20.0,
        "i_w_hr_weg": 0.0,
        "o_w_hr_power": 7168.0,
        "o_w_hr_voltage": 26.0,
        "i_w_hl_weg": 0.0,
        "o_w_hl_power": 7720.0,
        "o_w_hl_voltage": 24.0,
        "labels": 1.0
    }

    example_prompts = f"Example:\nRow: {example_dirty}\nCleaned Row: {example_clean}\n"
    return (
        f"Clean the following rows based on the provided example:\n"
        f"{example_prompts}\n"
        f"Rows: {rows}\n"
        f"Provide cleaned rows as a valid JSON array. "
        f"Cleaned rows: "
    )


def _parse_cleaned_rows(text):
    """Extract the cleaned row dicts from the raw text generated by the model.

    Accepts either a JSON array of objects or a comma-separated run of bare
    objects, and ignores anything the model wrote after the JSON. The text is
    tried verbatim first and then with single quotes swapped for double quotes.

    Raises:
        ValueError: if no JSON value can be decoded from ``text``.
    """
    starts = [idx for idx in (text.find("["), text.find("{")) if idx != -1]
    if not starts:
        raise ValueError("no JSON array or object found in model output")
    start = min(starts)

    decoder = json.JSONDecoder()
    last_error = None
    # The quote swap is 1:1 per character, so `start` is valid for both candidates.
    for candidate in (text, text.replace("'", '"')):
        parsed = []
        pos = start
        try:
            while True:
                # raw_decode stops at the end of the first complete JSON value,
                # so trailing chatter after the JSON does not break parsing.
                value, pos = decoder.raw_decode(candidate, pos)
                if isinstance(value, list):
                    parsed.extend(item for item in value if isinstance(item, dict))
                    return parsed
                parsed.append(value)
                # Bare objects: skip separators and continue with the next "{".
                while pos < len(candidate) and candidate[pos] in ", \t\r\n":
                    pos += 1
                if pos >= len(candidate) or candidate[pos] != "{":
                    return parsed
        except json.JSONDecodeError as err:
            if parsed:
                # A later object was cut off (e.g. token limit); keep the complete ones.
                return parsed
            last_error = err
    raise ValueError(f"could not parse model output as JSON: {last_error}")


def clean_data_with_llm_in_batches(rows, batch_size=5):
    """
    Cleans rows of data using an LLM, processing them in batches so each prompt
    stays within the token limit.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device`` objects.

    Args:
        rows: Iterable of row dicts to clean (a single dict is treated as one row).
        batch_size: Maximum number of rows sent to the model per prompt. A value
            that is ``None`` or < 1 sends all rows in a single prompt.

    Returns:
        A flat list with the cleaned row dicts of every batch that could be
        parsed. Batches whose output cannot be parsed are reported on stdout and
        skipped, so the result may be shorter than ``rows``. Empty input returns
        ``[]`` without calling the model.

    Note:
        Generation uses sampling (``do_sample=True``), so results are not
        deterministic between runs.
    """
    rows = [rows] if isinstance(rows, dict) else list(rows)
    step = batch_size if batch_size and batch_size > 0 else max(len(rows), 1)

    cleaned_rows = []
    for start in range(0, len(rows), step):
        batch = rows[start:start + step]
        prompt = _build_cleaning_prompt(batch)

        # Tokenize the prompt. Truncation at 2048 tokens may cut the trailing
        # instruction when a batch is too large, so keep batch_size small.
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(device)

        # Generate the model output.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                do_sample=True,
                max_new_tokens=1024,  # cap on newly generated tokens
                top_p=0.95            # nucleus sampling threshold
            )

        # Decode only the newly generated tokens: the returned sequence starts
        # with the prompt tokens, so slice them off instead of string-splitting
        # the decoded text on a prompt fragment.
        prompt_len = inputs["input_ids"].shape[1]
        completion = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

        try:
            batch_cleaned = _parse_cleaned_rows(completion)
        except ValueError as e:
            print(f"Error: {e} (batch starting at row {start})")
            continue

        print("Parsed JSON:")
        print(batch_cleaned)
        cleaned_rows.extend(batch_cleaned)

    return cleaned_rows

In [None]:
# Pick the row at positional index 1 (i.e. the second row) for a single-row experiment.
single_row_to_clean = [data.iloc[1].to_dict()]  # wrap the single row dict in a list

# Run the cleaning.
cleaned_rows = clean_data_with_llm_in_batches(single_row_to_clean)
    

# Show the result.
# print("\nOriginal Row:")
# print(single_row_to_clean)
print("\nCleaned Row:")
print(cleaned_rows)  # the function returns a list; the whole list is printed here

In [28]:
# Sanity check: show the device that holds the model's first parameter.
print(next(iter(model.parameters())).device)

cuda:0
