In [1]:
import wandb
import time
import numpy as np
import pandas as pd

import tensorflow as tf
import torch

import re
import os

import datasets
from datasets import load_dataset, load_metric, ClassLabel, Sequence, Dataset
from multiprocessing import Pool
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split as tts

from transformers import TextClassificationPipeline, AutoModelForSequenceClassification, AutoTokenizer
import matplotlib.pyplot as plt

# Record the framework versions alongside the notebook output.
print(f'torch version: {torch.__version__}')
print(f'tf version: {tf.__version__}')

2022-09-19 08:40:27.076507: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-19 08:40:27.238910: I tensorflow/core/tpu/tpu_initializer_helper.cc:262] Libtpu path is: libtpu.so




[percpu.cc : 535] RAW: rseq syscall failed with errno 22 after membarrier sycall succeeded.


torch version: 1.12.0+cu102
tf version: 2.10.0


In [2]:
# using TPU through torch
# `xm` is what the TPU set-up cell uses to obtain the XLA device.
# NOTE(review): xu, xser, pl and xmp are not referenced by any cell visible in
# this notebook — confirm whether they are still needed.
import torch_xla
import torch_xla.utils.utils as xu
import torch_xla.core.xla_model as xm
import torch_xla.utils.serialization as xser
import torch_xla.distributed.parallel_loader as pl
import torch_xla.distributed.xla_multiprocessing as xmp

# Print the installed torch_xla version next to the torch version printed earlier.
print(torch_xla.__version__)

1.12


In [3]:
# Seed the Python, PyTorch and NumPy random generators with one shared value.
import random

SEED = 2022
for seed_rng in (random.seed, torch.manual_seed, np.random.seed):
    seed_rng(SEED)

In [4]:
# ## TPU set-up on a Google Cloud project

# # Command for configuring the TPU when running as a .py script
# #!export XRT_TPU_CONFIG="localservice;0;localhost:51011"

# # Command for configuring the TPU when running from Jupyter Notebook / JupyterLab
import os
os.environ.update(XRT_TPU_CONFIG="localservice;0;localhost:51011")
# Ask torch_xla for the XLA device and show it as the cell result.
device = xm.xla_device()
device

device(type='xla', index=1)

## 원하시는 모델을 불러오세요

In [5]:
# Load the sequence-classification model and the tokenizer. The tokenizer is
# loaded from a different checkpoint name than the model
# (presumably the base model the classifier was fine-tuned from — confirm).
MODEL_CHECKPOINT = 'jungyong/FT_batch32_lyric'
TOKENIZER_CHECKPOINT = 'klue/roberta-large'

model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

# NOTE(review): `pipe` is rebound by the next cell, which builds it again via
# transformers.pipeline(); the settings below do not carry over to that object.
pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
    batch_size=32,
    num_workers=85,
)  # device = device,

Downloading config.json:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

tcmalloc: large alloc 2015035392 bytes == 0x92b9e000 @  0x7ffa44066680 0x7ffa44087824 0x7ffa44087b8a 0x7ff883a9b32e 0x7ff883a86da2 0x7ff8b6e4a451 0x7ff8cd567409 0x7ff8cd2108d5 0x5f6929 0x5f74f6 0x50c383 0x570b26 0x569dba 0x5f6eb3 0x5f6082 0x56d2d5 0x569dba 0x5f6eb3 0x56cc1f 0x5f6cd6 0x56bacd 0x569dba 0x5f6eb3 0x50bc2c 0x5f6082 0x56d2d5 0x569dba 0x50bca0 0x570b26 0x569dba 0x6902a7
tcmalloc: large alloc 2015035392 bytes == 0x10ad4e000 @  0x7ffa44066680 0x7ffa44087824 0x5fb391 0x7ff8cd567422 0x7ff8cd2108d5 0x5f6929 0x5f74f6 0x50c383 0x570b26 0x569dba 0x5f6eb3 0x5f6082 0x56d2d5 0x569dba 0x5f6eb3 0x56cc1f 0x5f6cd6 0x56bacd 0x569dba 0x5f6eb3 0x50bc2c 0x5f6082 0x56d2d5 0x569dba 0x50bca0 0x570b26 0x569dba 0x6902a7 0x6023c4 0x5c6730 0x56bacd


In [6]:
from transformers import pipeline

# Rebind `pipe` using the pipeline() factory with the already-loaded model and
# tokenizer; this is the object that sentimental_analysis() calls.
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
)

## 데이터를 불러오세요

In [7]:
# Read the tab-separated lyric table, print its row count, then preview the first rows.
lyric = pd.read_csv("lyric", sep='\t')
print(lyric.shape[0])
lyric.head()

112512


Unnamed: 0,lyric
0,Feelings nothing more than feelings Trying to ...
1,i feel alright now but don't know how to speak...
2,그대 스치는 바람처럼 불어와서 내 곁에 머무른 사람 나도 몰래 내쉬는 숨처럼 익숙해...
3,그냥 생각 없이 이렇다 할 뜻도 없이 쉼 없이 웃으며 떠드는 이들을 가만히 두리번거...
4,너를보면 나는 잠이와 이상하다 그치 잠이오면 나는 잠을자 이상하다 그치 자면서 너에...


In [8]:
import itertools

def queue_pop_lyric(data):
    """Lazily yield the positional indices 0 .. len(data) - 1, in order.

    Args:
        data: Any sized object (only ``len(data)`` is used).

    Yields:
        int: Each position from 0 up to, but not including, ``len(data)``.
    """
    yield from range(len(data))

def sentimental_analysis(number):
    """Score a single lyric and return its label scores as a one-row frame.

    Reads the value in the first column of row ``number`` of the module-level
    ``lyric`` frame and passes it to the module-level ``pipe``.

    Assumes ``pipe`` returns a one-element list wrapping a list of
    ``{'label', 'score'}`` records (the shape the row/column handling below
    relies on) — TODO confirm against the installed transformers version.

    Args:
        number: Positional row index into ``lyric``.

    Returns:
        pandas.DataFrame: Columns named after the ``label`` values; contains
        every transposed row after the first (``label``) one.
    """
    text = lyric.iloc[number, 0]
    # One record per label -> transpose so each label becomes a column.
    scores_by_label = pd.DataFrame(*pipe(text)).T
    scores_by_label.columns = scores_by_label.loc['label'].to_numpy()
    # Drop the leading 'label' row, which now only duplicates the header.
    return scores_by_label.iloc[1:, :]

In [9]:
import os

# Set TOKENIZERS_PARALLELISM to "false" for this process. This is commonly
# done to keep the tokenizers library from using its own parallelism; confirm
# it is still required for this run.
os.environ.update(TOKENIZERS_PARALLELISM="false")

In [10]:
# Score every lyric and stack the one-row results into a single frame.
#
# No multiprocessing.Pool is created here. The scoring loop uses a plain
# in-process iteration and never dispatched work to a pool, so creating one
# only forked idle workers (each inheriting this process's loaded model) and
# then closed them again.
#
# perf_counter() is monotonic, so the elapsed time cannot be skewed by
# wall-clock adjustments during this multi-hour run.
start = time.perf_counter()
df_score = pd.concat(
    [sentimental_analysis(number) for number in queue_pop_lyric(lyric)]
).reset_index(drop=True)
print("time :", time.perf_counter() - start)

time : 21868.65183210373


In [11]:
# Display the stacked score table (one row per scored lyric, one column per label).
df_score

Unnamed: 0,LABEL_0,LABEL_1,LABEL_2,LABEL_3,LABEL_4,LABEL_5
0,0.321668,0.050276,0.000602,0.013912,0.100803,0.512739
1,0.211127,0.195028,0.000657,0.017706,0.143551,0.431931
2,0.946268,0.002743,0.003444,0.031837,0.003763,0.011944
3,0.003543,0.029312,0.000641,0.821336,0.136336,0.008832
4,0.239259,0.64258,0.001295,0.056139,0.04051,0.020217
...,...,...,...,...,...,...
112507,0.649147,0.047277,0.000647,0.006467,0.04091,0.255551
112508,0.050572,0.032464,0.000567,0.006616,0.168869,0.740912
112509,0.040549,0.041491,0.000643,0.071288,0.513658,0.332371
112510,0.010605,0.014387,0.000957,0.872247,0.087935,0.013869


In [13]:
# Save the score table as a tab-separated file.
df_score.to_csv(path_or_buf="temp.tsv", sep='\t')

In [16]:
# Place the lyric table and the score table side by side (aligned on the row
# index) and save the combined frame as a tab-separated file.
df_lyric_score = pd.concat((lyric, df_score), axis="columns")
df_lyric_score.to_csv("score_test.tsv", sep='\t')