## ESG 관련 tweets 분석을 위한 모델


## PyTorch-Lightning & Transformers🤗

#### PyTorch자체 외 다른 라이브러리들은 기존과 동일하게 사용할 수 있음

In [1]:
from google.colab import drive
drive.mount('gdrive')

Mounted at gdrive


In [2]:
!pwd

/content


In [3]:
!ls

gdrive	sample_data


In [4]:
%cd gdrive/MyDrive/project3

/content/gdrive/MyDrive/project3


In [5]:
!ls

'영화 엔드맨3 개별 추출.csv'		      esg_train.txt
 3조_movie_DL_model.ipynb		      유튜브키워드별텍스트수집.ipynb
'엔트맨3 pred.csv'			      개별유튜브영상댓글수집.ipynb
'영화 리미트 개별 추출.csv'		      lightning_logs
'영화 늑대사냥 개별 추출.csv'		      엘비스_pred.csv
'영화 블랙맨서 와칸다 포에버 개별 추출.csv'   체리마호_pred.csv
 엘비스라벨.csv				      배드럭뱅잉_pred.csv
 체리마호라벨.csv			      아이를위한아이_pred.csv
 배드럭뱅잉라벨.csv			     '블팬 teaser pred.csv'
 아이를위한아이라벨.csv			      twitter_esg_bert.ipynb
'영와 DC 슈퍼 펫 개별 추출.csv'		      version_7
 esg_test.txt


In [6]:
# tools import

try:
    import transformers, emoji, soynlp, pytorch_lightning
except:
    !pip install -U -q transformers emoji soynlp pytorch-lightning

[K     |████████████████████████████████| 4.7 MB 6.6 MB/s 
[K     |████████████████████████████████| 197 kB 68.7 MB/s 
[K     |████████████████████████████████| 416 kB 66.2 MB/s 
[K     |████████████████████████████████| 700 kB 59.4 MB/s 
[K     |████████████████████████████████| 6.6 MB 56.6 MB/s 
[K     |████████████████████████████████| 596 kB 42.6 MB/s 
[K     |████████████████████████████████| 101 kB 12.8 MB/s 
[K     |████████████████████████████████| 5.8 MB 46.4 MB/s 
[K     |████████████████████████████████| 141 kB 50.4 MB/s 
[K     |████████████████████████████████| 419 kB 55.6 MB/s 
[?25h  Building wheel for emoji (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.8.2+zzzcolab20220719082949 requires tensorboard<2.9,>=2.8, but you have tensorboard 2.9.1 which is incompatible.[0m


# 추가 패키지 import & 기본 Args 설정

In [7]:
import os
import pandas as pd
from pprint import pprint

import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.optim.lr_scheduler import ExponentialLR
from pytorch_lightning import LightningModule, Trainer, seed_everything

from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import re
import emoji
from soynlp.normalizer import repeat_normalize

## 기본 학습 Arguments

In [8]:
args = {
    'random_seed': 42, # Random Seed
    'pretrained_model': 'beomi/KcELECTRA-base',  # Transformers PLM name
    'pretrained_tokenizer': '',  # Optional, Transformers Tokenizer Name. Overrides `pretrained_model`
    'batch_size': 32,
    'lr': 5e-6,  # Starting Learning Rate
    'epochs': 3,  # Max Epochs
    'max_length': 150,  # Max Length input size
    'train_data_path': "df_mv_train2.csv",  # Train Dataset file 
    'val_data_path': "ratings_test.txt",  # Validation Dataset file 
    'test_mode': False,  # Test Mode enables `fast_dev_run`
    'optimizer': 'AdamW',  # AdamW vs AdamP
    'lr_scheduler': 'exp',  # ExponentialLR vs CosineAnnealingWarmRestarts
    'fp16': True,  # Enable train on FP16(if GPU)
    'tpu_cores': 0,  # Enable TPU with 1 core or 8 cores
    'cpu_workers': os.cpu_count(),
}

In [9]:
args

{'batch_size': 32,
 'cpu_workers': 2,
 'epochs': 3,
 'fp16': True,
 'lr': 5e-06,
 'lr_scheduler': 'exp',
 'max_length': 150,
 'optimizer': 'AdamW',
 'pretrained_model': 'beomi/KcELECTRA-base',
 'pretrained_tokenizer': '',
 'random_seed': 42,
 'test_mode': False,
 'tpu_cores': 0,
 'train_data_path': 'df_mv_train2.csv',
 'val_data_path': 'ratings_test.txt'}

# Model 만들기 with Pytorch Lightning

In [None]:
#GPU 사용 확인

!nvidia-smi

Mon Aug  8 02:09:10 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P8     9W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# 모델을 함수화

class Model(LightningModule):
    def __init__(self, **kwargs):
        super().__init__()
        self.save_hyperparameters() # 이 부분에서 self.hparams에 위 kwargs가 저장된다.
        
        self.clsfier = AutoModelForSequenceClassification.from_pretrained(self.hparams.pretrained_model)
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.hparams.pretrained_tokenizer
            if self.hparams.pretrained_tokenizer
            else self.hparams.pretrained_model
        )

    def forward(self, **kwargs):
        return self.clsfier(**kwargs)

    def step(self, batch, batch_idx):
        data, labels = batch
        output = self(input_ids=data, labels=labels)

        # Transformers 4.0.0+
        loss = output.loss
        logits = output.logits

        preds = logits.argmax(dim=-1)

        y_true = list(labels.cpu().numpy())
        y_pred = list(preds.cpu().numpy())

        return {
            'loss': loss,
            'y_true': y_true,
            'y_pred': y_pred,
        }

    def training_step(self, batch, batch_idx):
        return self.step(batch, batch_idx)

    def validation_step(self, batch, batch_idx):
        return self.step(batch, batch_idx)

    def epoch_end(self, outputs, state='train'):
        loss = torch.tensor(0, dtype=torch.float)
        for i in outputs:
            loss += i['loss'].cpu().detach()
        loss = loss / len(outputs)

        y_true = []
        y_pred = []
        for i in outputs:
            y_true += i['y_true']
            y_pred += i['y_pred']
        
        acc = accuracy_score(y_true, y_pred)
        prec = precision_score(y_true, y_pred)
        rec = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)

        self.log(state+'_loss', float(loss), on_epoch=True, prog_bar=True)
        self.log(state+'_acc', acc, on_epoch=True, prog_bar=True)
        self.log(state+'_precision', prec, on_epoch=True, prog_bar=True)
        self.log(state+'_recall', rec, on_epoch=True, prog_bar=True)
        self.log(state+'_f1', f1, on_epoch=True, prog_bar=True)
        print(f'[Epoch {self.trainer.current_epoch} {state.upper()}] Loss: {loss}, Acc: {acc}, Prec: {prec}, Rec: {rec}, F1: {f1}')
        return {'loss': loss}
    
    def training_epoch_end(self, outputs):
        self.epoch_end(outputs, state='train')

    def validation_epoch_end(self, outputs):
        self.epoch_end(outputs, state='val')

    def configure_optimizers(self):
        if self.hparams.optimizer == 'AdamW':
            optimizer = AdamW(self.parameters(), lr=self.hparams.lr)
        elif self.hparams.optimizer == 'AdamP':
            from adamp import AdamP
            optimizer = AdamP(self.parameters(), lr=self.hparams.lr)
        else:
            raise NotImplementedError('Only AdamW and AdamP is Supported!')
        if self.hparams.lr_scheduler == 'cos':
            scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=1, T_mult=2)
        elif self.hparams.lr_scheduler == 'exp':
            scheduler = ExponentialLR(optimizer, gamma=0.5)
        else:
            raise NotImplementedError('Only cos and exp lr scheduler is Supported!')
        return {
            'optimizer': optimizer,
            'scheduler': scheduler,
        }

    def read_data(self, path):
        if path.endswith('xlsx'):
            return pd.read_excel(path)
        elif path.endswith('csv'):
            return pd.read_csv(path)
        elif path.endswith('tsv') or path.endswith('txt'):
            return pd.read_csv(path, sep='\t')
        else:
            raise NotImplementedError('Only Excel(xlsx)/Csv/Tsv(txt) are Supported')

    def clean(self, x):
        #emojis = ''.join(emoji.UNICODE_EMOJI.keys())
        #pattern = re.compile(f'[^ .,?!/@$%~％·∼()\x00-\x7Fㄱ-힣{emojis}]+')
        pattern = re.compile(f'[^ .,?!/@$%~％·∼()\x00-\x7Fㄱ-힣]+')
        url_pattern = re.compile(
            r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
        x = pattern.sub(' ', x)
        x = url_pattern.sub('', x)
        x = x.strip()
        x = repeat_normalize(x, num_repeats=2)
        return x

    def encode(self, x, **kwargs):
        return self.tokenizer.encode(
            self.clean(str(x)),
            padding='max_length',
            max_length=self.hparams.max_length,
            truncation=True,
            **kwargs,
        )

    def preprocess_dataframe(self, df):
        df['document'] = df['document'].map(self.encode)
        return df

    def dataloader(self, path, shuffle=False):
        df = self.read_data(path)
        df = self.preprocess_dataframe(df)

        dataset = TensorDataset(
            torch.tensor(df['document'].to_list(), dtype=torch.long),
            torch.tensor(df['label'].to_list(), dtype=torch.long),
        )
        return DataLoader(
            dataset,
            batch_size=self.hparams.batch_size * 1 if not self.hparams.tpu_cores else self.hparams.tpu_cores,
            shuffle=shuffle,
            num_workers=self.hparams.cpu_workers,
        )

    def train_dataloader(self):
        return self.dataloader(self.hparams.train_data_path, shuffle=True)

    def val_dataloader(self):
        return self.dataloader(self.hparams.val_data_path, shuffle=False)


In [None]:
# 모델 체크포인트

from pytorch_lightning.callbacks import ModelCheckpoint

checkpoint_callback = ModelCheckpoint(
    filename='epoch{epoch}-val_acc{val_acc:.4f}',
    monitor='val_acc',
    save_top_k=3,
    mode='max',
    auto_insert_metric_name=False,
)

# 학습!


```python
# 1epoch
loss=0.207, v_num=0, val_loss=0.221, val_acc=0.913, val_precision=0.914, val_recall=0.913, val_f1=0.914
# 2epoch
loss=0.152, v_num=0, val_loss=0.213, val_acc=0.918, val_precision=0.912, val_recall=0.926, val_f1=0.919
# 3epoch
loss=0.135, v_num=0, val_loss=0.225, val_acc=0.919, val_precision=0.907, val_recall=0.936, val_f1=0.921
```

In [None]:
# trainer 함수를 통해 모델 fitting

print("Using PyTorch Ver", torch.__version__)
print("Fix Seed:", args['random_seed'])
seed_everything(args['random_seed'])
model = Model(**args)

print(":: Start Training ::")
trainer = Trainer(
    callbacks=[checkpoint_callback],
    max_epochs=args['epochs'],
    fast_dev_run=args['test_mode'],
    num_sanity_val_steps=None if args['test_mode'] else 0,
    # For GPU Setup
    deterministic=torch.cuda.is_available(),
    gpus=[0] if torch.cuda.is_available() else None,  # 0번 idx GPU  사용
    precision=16 if args['fp16'] and torch.cuda.is_available() else 32,
    # For TPU Setup
    # tpu_cores=args['tpu_cores'] if args['tpu_cores'] else None,
)
trainer.fit(model)

Global seed set to 42


Using PyTorch Ver 1.12.0+cu113
Fix Seed: 42


Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.out_proj.weight', 'classifi

:: Start Training ::


  f"Setting `Trainer(gpus={gpus!r})` is deprecated in v1.7 and will be removed"
Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type                             | Params
-------------------------------------------------------------
0 | clsfier | ElectraForSequenceClassification | 124 M 
-------------------------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
249.093   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[Epoch 0 VAL] Loss: 0.22368168830871582, Acc: 0.91422, Prec: 0.9048542183622829, Rec: 0.927104437293926, F1: 0.915844206808594
[Epoch 0 TRAIN] Loss: 0.23545993864536285, Acc: 0.9050896971691903, Prec: 0.9135622854371658, Rec: 0.9376444930545095, F1: 0.9254467474458876


Validation: 0it [00:00, ?it/s]

[Epoch 1 VAL] Loss: 0.21432745456695557, Acc: 0.91862, Prec: 0.9139987445072191, Rec: 0.9254359829976562, F1: 0.9196818065178342
[Epoch 1 TRAIN] Loss: 0.17586740851402283, Acc: 0.9325625411454904, Prec: 0.9424401581499828, Rec: 0.9507233657958334, F1: 0.9465636411058947


Validation: 0it [00:00, ?it/s]

[Epoch 2 VAL] Loss: 0.2161635160446167, Acc: 0.91788, Prec: 0.9029493898473662, Rec: 0.9376713145036348, F1: 0.9199828506840239
[Epoch 2 TRAIN] Loss: 0.14856231212615967, Acc: 0.9446305134957209, Prec: 0.95275165515537, Rec: 0.9594469804635566, F1: 0.9560875963857192


`Trainer.fit` stopped: `max_epochs=3` reached.


# Inference

In [None]:
from glob import glob

latest_ckpt = sorted(glob('./lightning_logs/version_7/checkpoints/*.ckpt'))[-1]
latest_ckpt

'./lightning_logs/version_7/checkpoints/epoch2-val_acc0.9179.ckpt'

In [None]:
model = Model.load_from_checkpoint(latest_ckpt)

Downloading config.json:   0%|          | 0.00/504 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/475M [00:00<?, ?B/s]

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.out_proj.bias', 'classifier

Downloading tokenizer_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/387k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
def infer(x):
    return torch.softmax(
        model(**model.tokenizer(x, return_tensors='pt')
    ).logits, dim=-1)

In [None]:
infer('한국인인데 미국 국뽕이 차오르다니')

tensor([[0.0051, 0.9949]], grad_fn=<SoftmaxBackward0>)

In [None]:
infer('어 그냥 조용히 하고 그냥 보고와 후회는 절대 안해. 이런게 예술이지')

tensor([[0.0017, 0.9983]], grad_fn=<SoftmaxBackward0>)

In [None]:
print(infer('어 그냥 조용히 하고 그냥 보고와 후회는 절대 안해. 이런게 예술이지')[0][0])
print(infer('어 그냥 조용히 하고 그냥 보고와 후회는 절대 안해. 이런게 예술이지')[0][1])

tensor(0.0693, grad_fn=<SelectBackward0>)

In [10]:
df_black = pd.read_csv('영화 블랙맨서 와칸다 포에버 개별 추출.csv')
df_black.head()

Unnamed: 0.1,Unnamed: 0,comment
0,0,"안녕하세요, 올원입니다._x000D_<br>_x000D_<br>오늘 댓글에서는 영상..."
1,1,다른 겉할기마블유투버보다 올원님이 최고입니다. 모두 구독해주세요.
2,2,토닥토닥...
3,3,마블빠로서 마블띵크 삐맨 올원채널 삐맨 욘두의마블 등등 신생채널 부터 두루 보긴하...
4,4,회원제 운영도한번해보새요 후원해주실분들많을거에요!


In [11]:
df_wolf = pd.read_csv('영화 늑대사냥 개별 추출.csv')
df_wolf.head()

Unnamed: 0.1,Unnamed: 0,comment
0,0,여성의 시대로 변하겠지 pc로 범벅된.
1,1,원작처럼 슈리가 2대 블랙팬서가 될 지 다른 인물이 될 지...
2,2,이제 마블 절대 호평 나오지 않는한 안 봄.
3,3,재미를 떠나서 체드윅배우 추모의미로 봐야할것같은..
4,4,슈리야 머 원래 블팬할거라 생각했는데...아이언 하트는 좀 실망스러움....<br>...


In [12]:
df_ant = pd.read_csv('영화 엔드맨3 개별 추출.csv')
df_ant.head()

Unnamed: 0.1,Unnamed: 0,comment
0,0,아직 코가 맹맹하네<br><br><br><br><br><br><br><br><br>...
1,1,저는 썸네일로 유출이고 자시고 하면서 영화 개봉 전 정보 알고 싶지 않아하는 사람인...
2,2,도르마무 캉
3,3,무띵형 오타났어 퀀터가아니라 퀀텀이야
4,4,저도 걸렸어요. 무띵형도 힘내요!


In [None]:
# test set을 개별적으로 prediction

infer_list =[]
for i in df_black['comment']:
    tmp = infer(str(i))
    if tmp[0][0] > tmp[0][1]:
        infer_list.append(0)
    else:
        infer_list.append(1)

In [None]:
df_black['pred'] = infer_list
df_black

Unnamed: 0,id,document,label,pred
92502,8292,그냥 한마디로 스토리가 엉망. 배우들이 아깝다고 생각드는 영화네요,0,0
89127,4917,생명 그 자체에 대한 소중함과 그 생명을 키우기 위해 필요한 것들은 무엇인지 그리고...,1,1
104997,11957,,1,1
49298,13048,마동석도시2 그냥 말이 뭐가 필요하,1,1
79705,6425,"캡틴마블, 이터널스, 심지어 상치도 봐줄만 했다.그런데 이건 아니다.",0,0
...,...,...,...,...
66069,2899,각박하고 건조한 일상에 묵직한 성찰의 울림으로 다가온 영화.,1,1
66408,3238,실화를 바탕으로 꽤나 괜찮은 영화 입니다.남자주인공 에단호크 연기력 최고!,1,1
25254,25254,아무 생각없이 1등영화네 이러면서 봤는데 알고보니 탑건1도 있었고..ㅋㅋ 정말 넘 ...,1,1
29680,29680,,1,1


In [None]:
# test set을 개별적으로 prediction

infer_list =[]
for i in df_wolf['comment']:
    tmp = infer(str(i))
    if tmp[0][0] > tmp[0][1]:
        infer_list.append(0)
    else:
        infer_list.append(1)

df_wolf['pred'] = infer_list
df_wolf

In [None]:
# test set을 개별적으로 prediction

infer_list =[]
for i in df_ant['comment']:
    tmp = infer(str(i))
    if tmp[0][0] > tmp[0][1]:
        infer_list.append(0)
    else:
        infer_list.append(1)

df_ant['pred'] = infer_list
df_ant

In [None]:
df_black.to_csv('블랙팬서2 pred.csv')

In [None]:
df_wolf.to_csv('늑대사냥 pred.csv')

In [None]:
df_ant.to_csv('엔트맨3 pred.csv')