In [None]:
!nvidia-smi

Fri Jun 23 05:22:17 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    43W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [1]:
%cd '/content/drive/MyDrive'

/content/drive/MyDrive


In [None]:
# git을 받아온 다음, 필요한 파일들을 설치합니다.
# peft의 경우 그냥 받아오면 에러가 있어서, 먼저 지운후에 peft를 아래 버전으로 다시 설치합니다.
# 또한 학습 후 모델 저장 간 오류가 있을 수 있습니다.
# 오류 방지를 위해 transformer downgrade 설치 및 finetune.py에서 아래 부분을 주석처리해주세요.

'''
    old_state_dict = model.state_dict
    model.state_dict = (
        lambda self, *_, **__: get_peft_model_state_dict(
            self, old_state_dict()
        )
    ).__get__(model, type(model))
'''

# 추가적인 부분은 URL 참고바랍니다. https://github.com/tloen/alpaca-lora/issues/483

!git clone -q https://github.com/tloen/alpaca-lora.git
%cd alpaca-lora
!pip install -r requirements.txt -q
!pip uninstall peft -y -q
!pip install -q git+https://github.com/huggingface/peft.git@e536616888d51b453ed354a6f1e243fecb02ea08
!pip uninstall transformers
!pip install transformers==4.29.2

In [None]:
import numpy as np
import pandas as pd
import operator
import os
import string
import re
import random
import sys
import platform
import math
import time
import datetime
import json

from collections import Counter, defaultdict

from matplotlib import rcParams, pyplot as plt
import seaborn as sns

import sklearn as sk
from sklearn.utils import check_random_state
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.util import ngrams
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import ne_chunk, tree2conlltags
from nltk.probability import FreqDist

nltk.download('stopwords')
eng_stopwords = set(stopwords.words("english"))
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('vader_lexicon')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('wordnet')
symbols_knowns = string.ascii_letters + string.digits + string.punctuation

from tqdm.auto import tqdm

import warnings
warnings.filterwarnings(action='ignore')

import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

In [None]:
# instruction 데이터셋 만들기

In [None]:
train = pd.read_csv('/content/drive/MyDrive/Data/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Data/test.csv')
print("Number of rows in train dataset : ", train.shape[0])

Number of rows in train dataset :  2478


In [None]:
# train 데이터프레임에서 facts 열의 각 행(row)에 대해 문장 수를 세는 함수를 정의합니다.
def count_sentences(row):
    """Return how many sentences `sent_tokenize` splits the text `row` into."""
    return len(sent_tokenize(row))

# facts 열의 문장 수를 세어서 새로운 'num_sentences' 열을 추가합니다.
train['num_sentences'] = train['facts'].apply(lambda x: count_sentences(x))

# 문장 수가 1개인 행(row)들만 선택합니다.
rows_with_single_sentence = train[train['num_sentences'] == 1]

# 결과를 DataFrame으로 출력합니다.
rows_with_single_sentence

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner,num_sentences
248,TRAIN_0248,New York,Ferber,A New York child pornography law prohibited pe...,1,1
554,TRAIN_0554,Riverside County,McLaughlin,McLaughlin was arrested without a warrant and ...,1,1
630,TRAIN_0630,Earl R. Deen,"Gulf, Colorado & Santa Fe Railway Company",Not Available.\n,1,1
764,TRAIN_0764,Nollan,California Coastal Commission,The California Coastal Commission required own...,1,1
858,TRAIN_0858,Morrison,Olson,The Ethics in Government Act of 1978 created a...,1,1
1104,TRAIN_1104,Eastern Enterprises,Apfel,Currently unknown.\n,1,1
1285,TRAIN_1285,Smith,Daily Mail Publishing Company,A West Virginia statute made it a crime for a ...,0,1
1319,TRAIN_1319,National Association for the Advancement of Co...,Button,The NAACP was prosecuted for violating a Virgi...,1,1
1329,TRAIN_1329,United States,Paradise,In response to a series of NAACP-initiated law...,0,1
1461,TRAIN_1461,Jones,Alfred H. Mayer Company,"Jones, a black man, charged that a real estate...",1,1


In [None]:
# Drop specific rows by index label. Per the original note these rows hold placeholder
# facts ("Not available", "Currently available", "Currently unknown") rather than real case text.
# NOTE(review): the labels are hard-coded and the index is reset afterwards, so re-running
# this cell on the already-filtered frame would drop three different rows.
indexes_to_remove = [630, 1104, 1595]
train = train.drop(indexes_to_remove)
train = train.reset_index(drop=True)

In [None]:
# 'facts' 열에서 'Not available'이나 'Currently available' 문구가 있는지 확인
is_not_available = train['facts'].str.contains('Not available', case=False)
is_currently_available = train['facts'].str.contains('Currently available', case=False)

# 결과 출력 -> 위 문구 외에 다른 문장 있으므로 패스
train[is_not_available | is_currently_available]

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner,num_sentences
86,TRAIN_0086,Brown,Louisiana,The Audubon Regional library operated three br...,1,7
574,TRAIN_0574,Gil Garcetti et al.,Richard Ceballos,"Richard Ceballos, an employee of the Los Angel...",1,6
648,TRAIN_0649,"Friends of the Earth, Inc.","Laidlaw Environmental Services (TOC), Inc.","After Laidlaw Environmental Services, Inc. bou...",1,13
1061,TRAIN_1062,"Kenneth F. Fare, Acting Chief Probation Officer",Michael C.,"Police arrested Michael C., a 16 year old, on ...",1,9
1610,TRAIN_1613,"Atlantic Sounding Co., Inc., et al.",Edgar L. Townsend,"In July 2005, Edgar Townsend was allegedly inj...",0,13
2230,TRAIN_2233,"Verizon Communications, Inc.",Federal Communications Commission,The Telecommunications Act of 1996 entitles ne...,0,5


In [None]:
train['facts'].duplicated().value_counts()

False    2475
Name: facts, dtype: int64

In [None]:
train.drop("ID", axis=1, inplace=True)

In [None]:
# RD

def random_deletion(words, p):
    """Random deletion (RD): drop each word of a sentence with probability `p`.

    Args:
        words: Sentence as a single string; it is split on whitespace.
        p: Probability of deleting each word independently.

    Returns:
        str: The surviving words joined by single spaces. The result is always
        a string: a one-word sentence is returned unchanged, an empty sentence
        yields "", and if every word was deleted one randomly chosen original
        word is returned instead.
    """
    tokens = words.split()

    # Nothing can be deleted from an empty sentence, and a lone word is always kept.
    if len(tokens) <= 1:
        return ' '.join(tokens)

    # Keep a word only when its random draw exceeds the deletion probability.
    kept = [token for token in tokens if random.uniform(0, 1) > p]

    # If every word was deleted, fall back to a single random word.
    if not kept:
        return tokens[random.randint(0, len(tokens) - 1)]

    return ' '.join(kept)

In [None]:
# RS

def swap_word(new_words):
    """Swap two randomly chosen, distinct positions of `new_words` in place.

    Args:
        new_words: List of words; it is mutated in place.

    Returns:
        list: The same list object. It is returned unchanged when it has fewer
        than two elements, or when no second distinct index was drawn within
        four attempts.
    """
    # With fewer than two words there is nothing to swap; this also avoids
    # random.randint(0, -1) raising ValueError on an empty list.
    if len(new_words) < 2:
        return new_words

    last_idx = len(new_words) - 1
    first_idx = random.randint(0, last_idx)
    second_idx = first_idx
    attempts = 0

    # Redraw until the second index differs from the first, giving up after four draws.
    while second_idx == first_idx:
        second_idx = random.randint(0, last_idx)
        attempts += 1

        if attempts > 3:
            return new_words

    new_words[first_idx], new_words[second_idx] = new_words[second_idx], new_words[first_idx]
    return new_words

In [None]:
def random_swap(words, n):
    """Random swap (RS): apply `swap_word` to the sentence `n` times.

    Args:
        words: Sentence as a single string; it is split on whitespace.
        n: Number of swap rounds to perform.

    Returns:
        str: The resulting words joined by single spaces.
    """
    tokens = words.split()

    # Each round hands the current word list to swap_word and keeps what it returns.
    for _round in range(n):
        tokens = swap_word(tokens)

    return ' '.join(tokens)

In [None]:
# SR

def get_synonyms(word):
    """Collect synonym candidates for `word` from WordNet lemma names.

    Every lemma name of every synset returned by `wordnet.synsets(word)` is
    lower-cased, has "_" and "-" replaced by spaces, and is stripped of any
    character other than a-z and space.

    Args:
        word: The word to look up.

    Returns:
        list: Unique candidate strings in unspecified order. Blank candidates
        and the word itself (compared case-insensitively) are excluded.
    """
    allowed_chars = set(' qwertyuiopasdfghjklzxcvbnm')
    synonyms = set()

    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            candidate = lemma.name().replace("_", " ").replace("-", " ").lower()
            candidate = "".join(char for char in candidate if char in allowed_chars)
            # Filtering can leave nothing (e.g. purely numeric lemma names); an empty
            # "synonym" would only inject extra whitespace into the sentence.
            if candidate.strip():
                synonyms.add(candidate)

    # Candidates are lower-cased, so compare against the lower-cased word as well;
    # otherwise a capitalised word would get its own lower-case form as a "synonym".
    synonyms.discard(word)
    synonyms.discard(word.lower())

    return list(synonyms)

In [None]:
def synonym_replacement(words, n):
    """Synonym replacement (SR): replace up to `n` distinct non-stopwords with a synonym.

    Args:
        words: Sentence as a single string; it is split on whitespace.
        n: Maximum number of distinct words to replace. Every occurrence of a
            chosen word is replaced by the same randomly picked synonym.

    Returns:
        str: The resulting words joined by single spaces. With `n <= 0` the
        sentence is returned with no replacements.
    """
    tokens = words.split()

    # The replacement limit used to be checked only after a replacement had been
    # made, so n == 0 still replaced one word; bail out before touching anything.
    if n <= 0:
        return ' '.join(tokens)

    new_words = tokens.copy()
    # Sort the unique candidates before shuffling so the shuffled order depends only
    # on the random state, not on set iteration order.
    candidates = sorted({token for token in tokens if token not in eng_stopwords})
    random.shuffle(candidates)
    num_replaced = 0

    for candidate in candidates:
        synonyms = get_synonyms(candidate)

        if len(synonyms) >= 1:
            synonym = random.choice(list(synonyms))
            new_words = [synonym if token == candidate else token for token in new_words]
            num_replaced += 1

        if num_replaced >= n:  # only replace up to n words
            break

    return ' '.join(new_words)

In [None]:
# RI

def random_insertion(words, n):
    """Random insertion (RI): call `add_word` on the sentence's word list `n` times.

    Args:
        words: Sentence as a single string; it is split on whitespace.
        n: Number of insertion attempts.

    Returns:
        str: The resulting words joined by single spaces.
    """
    tokens = words.split()

    # add_word mutates the list in place, so the same list is passed every round.
    for _attempt in range(n):
        add_word(tokens)

    return ' '.join(tokens)

def add_word(new_words):
    """Insert a synonym of a randomly chosen word into `new_words`, in place.

    Up to 10 random words are tried until one has at least one synonym from
    `get_synonyms`. The first synonym in that list is then inserted at a random
    index between 0 and len(new_words) - 1.

    Args:
        new_words: List of words; it is mutated in place.

    Returns:
        None. The list is left untouched when it is empty or when none of the
        tried words had a synonym.
    """
    # An empty list has no word to look up; random.randint(0, -1) would raise ValueError.
    if not new_words:
        return

    synonyms = []
    attempts = 0

    while len(synonyms) < 1:
        random_word = new_words[random.randint(0, len(new_words) - 1)]
        synonyms = get_synonyms(random_word)
        attempts += 1
        if attempts >= 10:
            return

    random_synonym = synonyms[0]
    random_idx = random.randint(0, len(new_words) - 1)
    new_words.insert(random_idx, random_synonym)

In [None]:
# Add one augmented copy of 'facts' per technique (order of random draws: RS, RD, SR, RI).
train['facts_RS'] = train['facts'].apply(random_swap, args=(5,))
train['facts_RD'] = train['facts'].apply(random_deletion, args=(0.5,))
train['facts_SR'] = train['facts'].apply(synonym_replacement, args=(5,))
train['facts_RI'] = train['facts'].apply(random_insertion, args=(5,))

# For each text column, build a frame that keeps only that column and exposes it as 'facts'.
# train_1 holds the original text; train_2..train_5 hold the RS, RD, SR and RI variants.
fact_columns = ['facts', 'facts_RS', 'facts_RD', 'facts_SR', 'facts_RI']
train_1, train_2, train_3, train_4, train_5 = [
    train
    .drop(columns=[other for other in fact_columns if other != kept])
    .rename(columns={kept: 'facts'})
    for kept in fact_columns
]

# Stack the five variants under a fresh 0..N-1 index.
train_extended = pd.concat(
    [train_1, train_2, train_3, train_4, train_5],
    axis=0,
    ignore_index=True,
)
train_extended

Unnamed: 0,first_party,second_party,facts,first_party_winner,num_sentences
0,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1,7
1,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0,7
2,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1,8
3,Linkletter,Walker,Victor Linkletter was convicted in state court...,0,3
4,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1,9
...,...,...,...,...,...
12370,"HollyFrontier Cheyenne Refining, LLC, et al.","Renewable Fuels Association, et al.",Congress amended the Clean Air Act through the...,1,5
12371,"Grupo Mexicano de Desarrollo, S. A.","Alliance Bond Fund, Inc.","Alliance Bond Fund, Inc., an investment fund, ...",1,7
12372,Peguero,United States,"In 1992, the District Court sentenced Manuel D...",0,6
12373,Immigration and Naturalization Service,St. Cyr,"On March 8, 1996, Enrico St. Cyr, a lawful per...",0,8


In [None]:
%cd /content/drive/MyDrive

/content/drive/MyDrive


In [None]:
# Alpaca-Lora 모델은 아래와 같은 방식의 json 파일을 학습합니다. 아래처럼 json파일을 만들어주세요.

# One instruction/input/output record per row of train_extended.
law_dicts_train = []

for _, case in train_extended.iterrows():
    # The labelled winner is the first party when first_party_winner == 1, else the second party.
    winner = case['first_party'] if case['first_party_winner'] == 1 else case['second_party']
    law_dicts_train.append({
        'instruction': f"Which party won this legal case? The first party is {case['first_party']} and The second party is {case['second_party']}.",
        'input': f"{case['facts']}",
        'output': f"The winner of this case is {winner}.",
    })

# Write the records to a JSON file in the current working directory.
with open("law_dicts_train.json", "w") as json_file:
    json.dump(law_dicts_train, json_file)

In [None]:
# json 파일이 유효한지 확인해봅니다.
with open('/content/drive/MyDrive/law_dicts_train.json') as file:
    # Parse the file back; report success only when json.load did not raise.
    try:
        data = json.load(file)
    except json.JSONDecodeError as e:
        print("JSON 파일이 유효하지 않습니다. 오류 메시지:", e)
    else:
        print("JSON 파일이 유효합니다.")

JSON 파일이 유효합니다.


In [None]:
# test 파일은 나중에 beautifulsoup이나 selenium 등으로, 학습한 모델의 URL에 접근해서 결과를 얻어올 때 사용하시면 됩니다.
# 필요하지 않다면 test 파일은 json으로 안만드셔도 무방합니다.

# One instruction/input record per row of the test set (no 'output': labels are unknown).
law_dicts_test = []

for _, row in test.iterrows():
    # NOTE(review): this instruction wording differs from the one used for the training
    # records ("Which party won this legal case? ...") — confirm the mismatch is intended.
    law_dict = {
        'instruction': f"Who is the winner of below case? The first party is {row['first_party']} and The second party is {row['second_party']}.",
        'input': f"{row['facts']}",
    }

    # Append to the test list. This previously appended to law_dicts_train, which left
    # law_dicts_test empty (the file was written as "[]") and polluted the training list.
    law_dicts_test.append(law_dict)

# Write the records to a JSON file in the current working directory.
with open("law_dicts_test.json", "w") as json_file:
    json.dump(law_dicts_test, json_file)

In [None]:
# Test json 파일 유효성 확인
with open('/content/drive/MyDrive/law_dicts_test.json') as file:
    # Parse the file back; report success only when json.load did not raise.
    try:
        data_test = json.load(file)
    except json.JSONDecodeError as e:
        print("JSON 파일이 유효하지 않습니다. 오류 메시지:", e)
    else:
        print("JSON 파일이 유효합니다.")

JSON 파일이 유효합니다.


In [None]:
# GPT, LLama, Alpaca 등 LLM 모델에 필요한 template 파일을 정의하는 코드입니다.
# 아래 코드는 Alpaca-Lora가 인식하는 template의 형식입니다. 아래처럼 진행해서, 마찬가지로 json 파일로 저장해주시면 됩니다.

# Custom prompt template, saved as JSON under alpaca-lora/templates/.
# - "prompt_input" is formatted with both {instruction} and {input}.
# - "prompt_no_input" must reference only {instruction}: it previously also contained
#   an "### Input" section with an {input} placeholder, which fails with KeyError when the
#   template is formatted with the instruction alone.
# - "Writhe" in the prompt text was a typo for "Write". The wording is part of the prompt
#   the model sees, so use the same template for training and generation.
prompt_template = {
    "description" : "Alpaca-Lora Custom Template",
    "prompt_input" : (
        "Below is an instruction that describes a task, Paired with an input that provides further context.\n"
        "Write a response that appropriately completes the request.\n"
        "### Instruction : \n{instruction}\n\n### Input : \n{input}\n\n### Response:\n"
    ),
    "prompt_no_input" : (
        "Below is an instruction that describes a task.\n"
        "Write a response that appropriately completes the request.\n"
        "### Instruction : \n{instruction}\n\n### Response:\n"
    ),
    # Marker used to split the model's answer off from the prompt.
    "response_split" : "### Response:"
}

with open("/content/drive/MyDrive/alpaca-lora/templates/custom_template.json", "w", encoding='utf-8') as f:
    json.dump(prompt_template, f, ensure_ascii=False)

In [None]:
%cd /content/drive/MyDrive/alpaca-lora

/content/drive/MyDrive/alpaca-lora


In [None]:
# 학습을 진행합니다. 저의 경우 데이터를 Augmentation해서, 10000문장은 학습에 그리고 2000문장은 검증에 이용했습니다.
# 학습 진행 시간은 Colab A100 GPU, 10000문장, 10에폭 기준으로 약 3시간 ~ 3시간 30분 정도가 소요됐습니다.
# 1에폭 정도로는 학습이 거의 안되며, 최소 5에폭은 해줘야한다고 합니다.
# 검증 default값은 2000문장입니다.
# 줄이거나 없애고 싶으시다면 finetune.py에 있는 val_dataset 관련 문구를 수정해주세요!
# 추가적인 부분은 해당 URL 참고바랍니다. https://github.com/tloen/alpaca-lora

!python finetune.py \
    --base_model 'decapoda-research/llama-7b-hf' \
    --data_path '/content/drive/MyDrive/law_dicts_train.json' \
    --output_dir '/content/drive/MyDrive/output' \
    --num_epochs 10 \
    --learning_rate 5e-5 \
    --batch_size 512 \
    --micro_batch_size 16 \
    --prompt_template_name 'custom_template'

In [None]:
'''
학습이 되었다면, 아래 코드를 실행해서 학습한 모델의 output(weights)을 가져와서 실행합니다.
실행하면 URL 코드가 나오는데, URL을 따라 들어가면 우리가 만든 모델이 적용된 홈페이지가 첨부한 PDF파일처럼 나오게되며,
Instruction과 Input을 저희가 train 파일을 json 만들 때 넣었던 형식처럼 넣어주면 답변을 하게됩니다.

*중요*
generate.py 파일을 돌릴 때도 당연히 GPU를 쓰게됩니다.
홈페이지에 들어가면 기본 token값이 128로 되어있는데, 512로 늘려서 예측을 하면 제 기준 GPU 용량이 34.8GB까지 사용되더군요.
아마 기본 GPU로 돌리기에는 token을 낮추거나 해야하지 않을까합니다.
'''

!python generate.py \
    --base_model 'decapoda-research/llama-7b-hf' \
    --lora_weights '/content/drive/MyDrive/output' \
    --prompt_template 'custom_template' \
    --share_gradio