## example

In [1]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn.functional import softmax
import torch

# Load the BERT tokenizer and sequence-classification model.
# NOTE(review): 'bert-base-uncased' has no trained classification head;
# transformers reports classifier.weight / classifier.bias as newly
# initialized. The probabilities printed below are therefore not real
# sentiment scores until the model is fine-tuned (or replaced by a sentiment
# checkpoint), and the "index 1 = positive / index 0 = negative" mapping is
# only an assumption.
model_name = 'bert-base-uncased'  # choose the BERT checkpoint to use
tokenizer = BertTokenizer.from_pretrained(model_name)
# Seed before loading so the randomly initialized head (and hence the printed
# numbers) is the same on every run.
torch.manual_seed(42)
model = BertForSequenceClassification.from_pretrained(model_name)
model.eval()  # inference only: make sure dropout stays disabled

# Example input text
text = "병원에 가서 진료를 받은 경험이 너무 좋았어요."

# Tokenize into model inputs; truncate to BERT's 512-token limit so that an
# over-long input cannot overflow the position embeddings.
tokens = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
with torch.no_grad():
    # Forward pass without gradient tracking; keep only the logits
    logits = model(**tokens).logits

# Convert the logits to probabilities over the two classes
probs = softmax(logits, dim=1).squeeze().tolist()

# Print the result
print(f"긍정 확률: {probs[1]:.2%}, 부정 확률: {probs[0]:.2%}")


  from .autonotebook import tqdm as notebook_tqdm
model.safetensors: 100%|██████████| 440M/440M [00:40<00:00, 10.9MB/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


긍정 확률: 43.31%, 부정 확률: 56.69%


## 중랑구

### 전처리

In [3]:
import pandas as pd

# Read the review CSV (its first column is the index), keep the name,
# nickname and content columns, and renumber the rows from 0.
review_jungrang = (
    pd.read_csv(
        'C:/Users/user/Desktop/saltlux_project/preprocess/JungrangTotal.csv',
        index_col=0,
    )[['name', 'nickname', 'content']]
    .reset_index(drop=True)
)

In [4]:
# Drop rows with missing values (the data turned out to contain no nulls)
review_jungrang = review_jungrang.dropna()

In [5]:
# Inspect the frame: column dtypes and non-null counts
review_jungrang.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5975 entries, 0 to 5974
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   name      5975 non-null   object
 1   nickname  5975 non-null   object
 2   content   5975 non-null   object
dtypes: object(3)
memory usage: 140.2+ KB


In [6]:
# Remove fully duplicated rows, then show the resulting (rows, columns) shape
review_jungrang = review_jungrang.drop_duplicates()
review_jungrang.shape

(5264, 3)

In [8]:
# Normalise the review text: replace newlines and the characters '~' / '!'
# with a single space each.
review_jungrang['content'] = (
    review_jungrang['content']
    .str.replace('\n', ' ')
    .str.replace('[~!]', ' ', regex=True)
)
# review_jungrang['content'] = review_jungrang['content'].str.replace('[^가-힣]', ' ', regex=True)
review_jungrang

Unnamed: 0,name,nickname,content
0,365mc모인이비인후과의원,Wiseburge David,사가정 모인 이비인후과 항상 만원이다
1,365mc모인이비인후과의원,Dk9,좋아요
2,365mc모인이비인후과의원,귀한집딸z,좋아요
3,365mc모인이비인후과의원,꽃길걷는중임,항상 친절한 진료 감사합니다
4,365mc모인이비인후과의원,막둥,
...,...,...,...
5970,효치과의원,fpvocalist,굳
5971,효치과의원,소보루39,좋아요
5972,효치과의원,1vvovv1,친절하십니다 과잉진료 없음
5973,효치과의원,버럭아저씨,친절해요


In [9]:
# Renumber the index from 0 (the old index is discarded), then display the frame
review_jungrang = review_jungrang.reset_index(drop=True)
review_jungrang

Unnamed: 0,name,nickname,content
0,365mc모인이비인후과의원,Wiseburge David,사가정 모인 이비인후과 항상 만원이다
1,365mc모인이비인후과의원,Dk9,좋아요
2,365mc모인이비인후과의원,귀한집딸z,좋아요
3,365mc모인이비인후과의원,꽃길걷는중임,항상 친절한 진료 감사합니다
4,365mc모인이비인후과의원,막둥,
...,...,...,...
5259,효치과의원,fpvocalist,굳
5260,효치과의원,소보루39,좋아요
5261,효치과의원,1vvovv1,친절하십니다 과잉진료 없음
5262,효치과의원,버럭아저씨,친절해요


### gpu

In [6]:
import torch
import torchvision

# Select the compute device: CUDA when it is available, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# # Create a model
# model = torchvision.models.resnet18()

# # Move the model to the selected device
# model.to(device)

# # Put data on the GPU
# data = torch.randn(10,3).to(device)

# # Create tensors and place them on the GPU
# x = torch.tensor([1,2,3]).to(device)
# y = torch.tensor([4,5,6]).to(device)

NameError: name '_C' is not defined

### 모델

#### 전처리 포함

In [11]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn.functional import softmax
import torch

# Extract the 'content' column as a plain Python list of review texts
texts = review_jungrang['content'].tolist()

# Load the BERT tokenizer and sequence-classification model.
# NOTE(review): 'bert-base-uncased' has no trained classification head;
# transformers reports classifier.weight / classifier.bias as newly
# initialized. The probabilities written below are therefore not real
# sentiment scores until the model is fine-tuned (or replaced by a sentiment
# checkpoint), and the "index 1 = positive / index 0 = negative" mapping is
# only an assumption.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
# Seed before loading so the randomly initialized head is identical on every
# run and in every cell that loads this checkpoint with the same seed;
# otherwise results from separate runs cannot be compared.
torch.manual_seed(42)
model = BertForSequenceClassification.from_pretrained(model_name)
model.eval()  # inference only: make sure dropout stays disabled

# Accumulates one {'Text', 'Result'} record per review
results = []

# Score every review and store the formatted probabilities
for text in texts:
    # Truncate to BERT's 512-token limit; without it a long review overflows
    # the position embeddings and the forward pass raises an error.
    tokens = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**tokens).logits

    probs = softmax(logits, dim=1).squeeze().tolist()
    result_str = f"긍정 확률: {probs[1]:.2%}, 부정 확률: {probs[0]:.2%}"
    results.append({'Text': text, 'Result': result_str})

# Build the DataFrame once from the collected records
results_df = pd.DataFrame(results)

# Save the results as a CSV file
results_df.to_csv('bert_results_prep_prob.csv', index=False)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### 전처리 미포함

In [12]:
import pandas as pd

# Read the review CSV again for the run without text cleanup: keep the name,
# nickname and content columns, and renumber the rows from 0.
review_jungrang_ori = (
    pd.read_csv(
        'C:/Users/user/Desktop/saltlux_project/preprocess/JungrangTotal.csv',
        index_col=0,
    )[['name', 'nickname', 'content']]
    .reset_index(drop=True)
)

In [13]:
# Inspect the frame: column dtypes and non-null counts
review_jungrang_ori.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5975 entries, 0 to 5974
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   name      5975 non-null   object
 1   nickname  5975 non-null   object
 2   content   5975 non-null   object
dtypes: object(3)
memory usage: 140.2+ KB


In [14]:
# Only replace newlines with spaces (no other cleanup), then renumber the index
review_jungrang_ori = (
    review_jungrang_ori
    .assign(content=review_jungrang_ori['content'].str.replace('\n', ' '))
    .reset_index(drop=True)
)
review_jungrang_ori

Unnamed: 0,name,nickname,content
0,365mc모인이비인후과의원,Wiseburge David,사가정 모인 이비인후과 항상 만원이다
1,365mc모인이비인후과의원,Dk9,좋아요
2,365mc모인이비인후과의원,귀한집딸z,좋아요
3,365mc모인이비인후과의원,꽃길걷는중임,항상 친절한 진료 감사합니다~
4,365mc모인이비인후과의원,막둥,ㅇㅇ
...,...,...,...
5970,효치과의원,fpvocalist,굳
5971,효치과의원,소보루39,좋아요
5972,효치과의원,1vvovv1,친절하십니다 과잉진료 없음
5973,효치과의원,버럭아저씨,친절해요..


In [15]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn.functional import softmax
import torch

# Extract the 'content' column as a plain Python list of review texts
texts = review_jungrang_ori['content'].tolist()

# Load the BERT tokenizer and sequence-classification model.
# NOTE(review): 'bert-base-uncased' has no trained classification head;
# transformers reports classifier.weight / classifier.bias as newly
# initialized. The probabilities written below are therefore not real
# sentiment scores until the model is fine-tuned (or replaced by a sentiment
# checkpoint), and the "index 1 = positive / index 0 = negative" mapping is
# only an assumption.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
# Seed before loading so the randomly initialized head is identical on every
# run and in every cell that loads this checkpoint with the same seed;
# otherwise results from separate runs cannot be compared.
torch.manual_seed(42)
model = BertForSequenceClassification.from_pretrained(model_name)
model.eval()  # inference only: make sure dropout stays disabled

# Accumulates one {'Text', 'Result'} record per review
results = []

# Score every review and store the formatted probabilities
for text in texts:
    # Truncate to BERT's 512-token limit; without it a long review overflows
    # the position embeddings and the forward pass raises an error.
    tokens = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**tokens).logits

    probs = softmax(logits, dim=1).squeeze().tolist()
    result_str = f"긍정 확률: {probs[1]:.2%}, 부정 확률: {probs[0]:.2%}"
    results.append({'Text': text, 'Result': result_str})

# Build the DataFrame once from the collected records
results_df = pd.DataFrame(results)

# Save the results as a CSV file
results_df.to_csv('bert_results_ori_prob.csv', index=False)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
