# GPT2

## Drive mount

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


## Install Mecab

In [2]:
!sudo apt-get install g++ openjdk-7-jdk # Install Java 1.7+
# !sudo apt-get install python-dev; pip install konlpy     # Python 2.x
!sudo apt-get install python3-dev; pip3 install konlpy   # Python 3.x
!sudo apt-get install curl
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

Reading package lists... Done
Building dependency tree       
Reading state information... Done
Package openjdk-7-jdk is not available, but is referred to by another package.
This may mean that the package is missing, has been obsoleted, or
is only available from another source

E: Package 'openjdk-7-jdk' has no installation candidate
Reading package lists... Done
Building dependency tree       
Reading state information... Done
python3-dev is already the newest version (3.6.7-1~18.04).
python3-dev set to manually installed.
0 upgraded, 0 newly installed, 0 to remove and 40 not upgraded.
Collecting konlpy
  Downloading konlpy-0.5.2-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 1.0 MB/s 
Collecting JPype1>=0.7.0
  Downloading JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 57.4 MB/s 
[?25hCollecting colorama
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Collecti

## Install Transformers

In [3]:
pip install transformers

Collecting transformers
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 8.2 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 53.4 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 79.8 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 62.2 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully u

## Import modules

In [4]:
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

from transformers import AutoTokenizer, AutoModelWithLMHead, GPT2ForSequenceClassification
from konlpy.tag import Mecab
warnings.filterwarnings('ignore')
mecab = Mecab()

## Import Data

In [5]:
#################
STOPWORDSPATH ="/content/drive/MyDrive/Colab Notebooks/stopwords.txt"
#################
## Import DATA, submission file
train = pd.read_csv("/content/drive/MyDrive/DACON/topic_classification/train_data.csv", error_bad_lines=False  )
test = pd.read_csv("/content/drive/MyDrive/DACON/topic_classification/test_data.csv",error_bad_lines=False)
submission = pd.read_csv("/content/drive/MyDrive/DACON/topic_classification/sample_submission.csv",error_bad_lines=False)
topic_dict = pd.read_csv("/content/drive/MyDrive/DACON/topic_classification/topic_dict.csv",error_bad_lines=False)

In [42]:
submission = pd.read_csv("/content/drive/MyDrive/DACON/topic_classification/sample_submission.csv",error_bad_lines=False)

## Text Preprocessing 

In [6]:
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", '“': '"', '”': '"', '“': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', }
##
##
def clean_punc(text, punct, mapping):
    for p in mapping:
        text = text.replace(p, mapping[p])
    
    for p in punct:
        text = text.replace(p, f' {p} ')
    
    specials = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}
    for s in specials:
        text = text.replace(s, specials[s])
    
    return text.strip()
##
##
cleaned_train_corpus = []
cleaned_test_corpus = []

for sent in train['title']:
    cleaned_train_corpus.append(clean_punc(sent, punct, punct_mapping))
    
for sent in test['title']:
    cleaned_test_corpus.append(clean_punc(sent, punct, punct_mapping))

##
##
def clean_text(texts):
    corpus = []
    for i in range(0, len(texts)):
        texts[i] = texts[i].replace("外人","외국인")
        texts[i] = texts[i].replace("日","일본")
        texts[i] = texts[i].replace("美","미국")
        texts[i] = texts[i].replace("北","북한")
        texts[i] = texts[i].replace("英","영국")
        texts[i] = texts[i].replace("中","중국")
        texts[i] = texts[i].replace("與","여당")
        texts[i] = texts[i].replace("靑","청와대")
        texts[i] = texts[i].replace("野","야당")
        texts[i] = texts[i].replace("伊","이탈리아")
        texts[i] = texts[i].replace("韓","한국")
        texts[i] = texts[i].replace("南","한국")
        texts[i] = texts[i].replace("獨","독일")
        texts[i] = texts[i].replace("佛","프랑스")
        texts[i] = texts[i].replace("檢","검찰")
        texts[i] = texts[i].replace("銀","은행")
        texts[i] = texts[i].replace("亞","아시아")
        texts[i] = texts[i].replace("人","사람")
        texts[i] = texts[i].replace("孫","손혜원")
        texts[i] = texts[i].replace("企","기업")
        texts[i] = texts[i].replace("前","이전")
        texts[i] = texts[i].replace("反","반대")
        texts[i] = texts[i].replace("安","안철수")
        texts[i] = texts[i].replace("展","전시회")
        texts[i] = texts[i].replace("故","사망")
        texts[i] = texts[i].replace("文","문재인")
        texts[i] = texts[i].replace("新","새로운")
        texts[i] = texts[i].replace("曺","조국")
        texts[i] = texts[i].replace("朴","박정치인")
        texts[i] = texts[i].replace("株","주식")
        texts[i] = texts[i].replace("男","남자")
        texts[i] = texts[i].replace("硏","연구")
        texts[i] = texts[i].replace("車","자동차")
        texts[i] = texts[i].replace("軍","군대")
        texts[i] = texts[i].replace("重","중공업")
        texts[i] = texts[i].replace("R&D","연구개발")
        texts[i] = texts[i].replace("문정부","문재인정부")
        


        
        review = re.sub(r'[@%\\*=()/~#&\+á?\xc3\xa1\-\|\.\:\;\!\-\,\_\~\$\'\"]', '',str(texts[i])) #remove punctuation
        review = re.sub(r'\d+','', str(review))# remove number
        review = review.lower() #lower case
        review = re.sub(r'\s+', ' ', review) #remove extra space
        review = re.sub(r'<[^>]+>','',review) #remove Html tags
        review = re.sub(r'\s+', ' ', review) #remove spaces
        review = re.sub(r"^\s+", '', review) #remove space from start
        review = re.sub(r'\s+$', '', review) #remove space from the end
        review = re.sub("[一-龥]",'', review) #remove chinese char


        corpus.append(review)
    return corpus
##
##
basic_preprocessed_train_corpus = clean_text(cleaned_train_corpus)
basic_preprocessed_test_corpus = clean_text(cleaned_test_corpus)
##
##
stopwords = []
with open("/content/drive/MyDrive/공민표/data/stopwords.txt") as f:
    for line in f:
        stopwords.append(line.strip())

removed_stopword_train_corpus = []
removed_stopword_test_corpus = []

for tagged in basic_preprocessed_train_corpus:
    tagged=mecab.pos(tagged)
    
    temp = []
    for tag in tagged:
        #일반명사, 고유명사, 동사, 형용사, 긍정지정사, 부정 지정사, 관형사, 일반부사, 체언접두사, 외국어, 한자
        if tag[0] in stopwords or tag[1] not in ["NNG", "NNP", "VV", "VA", "VCP", "VCN", "MM", "MAG", "XPN", "SL", "SH"]:
            continue
        temp.append(tag[0])

    removed_stopword_train_corpus.append(' '.join(temp))
    
for tagged in basic_preprocessed_test_corpus:
    tagged=mecab.pos(tagged)
    
    temp = []
    for tag in tagged:
        if tag[0] in stopwords or tag[1] not in ["NNG", "NNP", "VV", "VA", "VCP", "VCN", "MM", "MAG", "XPN", "SL", "SH"]:
            continue
        temp.append(tag[0])

    removed_stopword_test_corpus.append(' '.join(temp))

##
##
train_text = removed_stopword_train_corpus
test_text = removed_stopword_test_corpus
train_label = np.asarray(train.topic_idx)

train['cl_title'] = train_text
test['cl_title'] = test_text

In [7]:
train.head(10)

Unnamed: 0,index,title,topic_idx,cl_title
0,0,인천→핀란드 항공기 결항…휴가철 여행객 분통,4,인천 핀란드 항공기 결항 휴가철 여행객 분통
1,1,실리콘밸리 넘어서겠다…구글 15조원 들여 美전역 거점화,4,실리콘밸리 넘어서 구글 조원 미국 전역 거점
2,2,이란 외무 긴장완화 해결책은 미국이 경제전쟁 멈추는 것,4,이란 외무 긴장 완화 해결책 미국 경제 전쟁 멈추
3,3,NYT 클린턴 측근韓기업 특수관계 조명…공과 사 맞물려종합,4,nyt 클린턴 측근 한국 기업 특수 관계 조명 공과 종합
4,4,시진핑 트럼프에 중미 무역협상 조속 타결 희망,4,시진핑 트럼프 중미 무역 협상 조속 타결 희망
5,5,팔레스타인 가자지구서 16세 소년 이스라엘군 총격에 사망,4,팔레스타인 가 세 소년 이스라엘 총격 사망
6,6,인도 48년 만에 파키스탄 공습…테러 캠프 폭격종합2보,4,인도 파키스탄 공습 테러 캠프 폭격 종 합보
7,7,美대선 TV토론 음담패설 만회실패 트럼프…사과 대신 빌클린턴 공격해 역효과,4,미국 대선 tv 토론 음담패설 만회 실패 트럼프 사과 대신 빌클린턴 공격 역효과
8,8,푸틴 한반도 상황 진전 위한 방안 김정은 위원장과 논의,4,푸틴 한반도 상황 진전 방안 김정은 위원장 논의
9,9,특검 면죄부 받은 트럼프 스캔들 보도 언론 맹공…국민의 적,4,특검 면죄부 받 트럼프 스캔들 보도 언론 맹공 국민 적


In [None]:
test.head(10)

Unnamed: 0,index,title,cl_title
0,45654,유튜브 내달 2일까지 크리에이터 지원 공간 운영,유튜브 내달 일 크리에이터 지원 공간 운영
1,45655,어버이날 맑다가 흐려져…남부지방 옅은 황사,어버이날 맑 남부 지방 옅 황사
2,45656,내년부터 국가RD 평가 때 논문건수는 반영 않는다,내년 국가 rd 평가 때 논문 건수 반영 않
3,45657,김명자 신임 과총 회장 원로와 젊은 과학자 지혜 모을 것,김명자 신임 과 총 회장 원로 젊 과학자 지혜
4,45658,회색인간 작가 김동식 양심고백 등 새 소설집 2권 출간,회색 인간 작가 김동식 심 새 소설 집 출간
5,45659,야외서 생방송 하세요…액션캠 전용 요금제 잇따라,야외 생 방송 하 액션 캠 전용 요금제
6,45660,월드컵 태극전사 16강 전초기지 레오강 입성종합,월드컵 태극전사 강 전 초기 레오강 입 종합
7,45661,미세먼지 속 출근길,미세먼지 속 출근길
8,45662,왓츠앱稅 230원에 성난 레바논 민심…총리사퇴로 이어져종합2보,왓츠 앱 원 레바논 민심 총리 사퇴 종 합보
9,45663,베트남 경제 고성장 지속…2분기 GDP 6.71% 성장,베트남 경제 고성장 지속 분기 gdp 성장


## Tokenizer

In [8]:
tokenizer = AutoTokenizer.from_pretrained("skt/kogpt2-base-v2")

model = GPT2ForSequenceClassification.from_pretrained("skt/kogpt2-base-v2")
model.score = torch.nn.Linear(768, 7)
model.cuda()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1000.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2825034.0, style=ProgressStyle(descript…




Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=513302779.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at skt/kogpt2-base-v2 were not used when initializing GPT2ForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at skt/kogpt2-base-v2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=Fal

In [10]:
class TrainDataset(Dataset):
    def __init__(self, data, tokenizer, max_seq_len=40):
        self.data = data
        self.max_seq_len = max_seq_len
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        record = self.data.iloc[index]
        document, label = str(record['cl_title']), int(record['topic_idx'])
        tokens = self.tokenizer.tokenize(document)
        encoder_input_id = self.tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(encoder_input_id)
        if len(encoder_input_id) < self.max_seq_len:
            while len(encoder_input_id) < self.max_seq_len:
                encoder_input_id += [tokenizer.convert_tokens_to_ids('<pad>')]
                attention_mask += [0]
        else:
            encoder_input_id = encoder_input_id[:self.max_seq_len - 1] + [
                self.tokenizer.eos_token_id]
            attention_mask = attention_mask[:self.max_seq_len]
        return {'input_ids': np.array(encoder_input_id, dtype=np.int_),
                'attention_mask': np.array(attention_mask, dtype=np.float),
                'labels': np.array(label, dtype=np.int_)}
    
class TestDataset(Dataset):
    def __init__(self, data, tokenizer, max_seq_len=40):
        self.data = data
        self.max_seq_len = max_seq_len
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        record = self.data.iloc[index]
        document = str(record['cl_title'])
        tokens = self.tokenizer.tokenize(document)
        encoder_input_id = self.tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(encoder_input_id)
        if len(encoder_input_id) < self.max_seq_len:
            while len(encoder_input_id) < self.max_seq_len:
                encoder_input_id += [tokenizer.convert_tokens_to_ids('<pad>')]
                attention_mask += [0]
        else:
            encoder_input_id = encoder_input_id[:self.max_seq_len - 1] + [
                self.tokenizer.eos_token_id]
            attention_mask = attention_mask[:self.max_seq_len]
        return {'input_ids': np.array(encoder_input_id, dtype=np.int_),
                'attention_mask': np.array(attention_mask, dtype=np.float)}

In [21]:
my_learning_rate = 3E-6 # default is 5E-5
my_adam_epsilon = 1E-8 # default is 1E-8
my_number_of_epochs = 7
my_warmup = 3
my_mini_batch_size = 128
total_steps = len(loader) * epochs

In [11]:
# train parameters
epochs = 10
batch_size = my_mini_batch_size

In [12]:
# train loader
train_ds = TrainDataset(train, tokenizer)
loader = DataLoader(train_ds, batch_size=batch_size, num_workers=0, shuffle=True)

In [22]:
from transformers import get_linear_schedule_with_warmup

In [23]:
# optimizer
# optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)
optimizer = torch.optim.AdamW(model.parameters(),
                  lr = my_learning_rate, #args.learning_rate
                  eps = my_adam_epsilon  #args.adam_epsilon

                )
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 5, )
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = my_warmup, 
                                            num_training_steps = total_steps)
loss_fn = torch.nn.CrossEntropyLoss()

In [29]:

import time
import datetime
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [32]:
model.train()
for e in range(epochs):
    total_loss = 0
    print("")
    print('======== Epoch {:} / {:} ========'.format(e + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()
    for step, batch in enumerate(loader):
        if step % 50 == 0 and not step == 0:
        # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(loader), elapsed))

        optimizer.zero_grad()
        ids, atts, labels = batch['input_ids'], batch['attention_mask'], batch['labels']
        ids = torch.tensor(ids).long().cuda()
        atts = torch.tensor(atts).long().cuda()
        labels = torch.tensor(labels).long().cuda()
        pred = model(ids, attention_mask=atts)
        loss = loss_fn(pred[0], labels)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.)
        optimizer.step()
        total_loss += loss.item()
    avg_train_loss = total_loss / len(loader) 
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))        
    scheduler.step()
    print(e, total_loss)
print("")
print("Training complete!")


Training...
  Batch    50  of  1,427.    Elapsed: 0:00:15.
  Batch   100  of  1,427.    Elapsed: 0:00:29.
  Batch   150  of  1,427.    Elapsed: 0:00:44.
  Batch   200  of  1,427.    Elapsed: 0:00:58.
  Batch   250  of  1,427.    Elapsed: 0:01:13.
  Batch   300  of  1,427.    Elapsed: 0:01:27.
  Batch   350  of  1,427.    Elapsed: 0:01:42.
  Batch   400  of  1,427.    Elapsed: 0:01:56.
  Batch   450  of  1,427.    Elapsed: 0:02:11.
  Batch   500  of  1,427.    Elapsed: 0:02:25.
  Batch   550  of  1,427.    Elapsed: 0:02:40.
  Batch   600  of  1,427.    Elapsed: 0:02:54.
  Batch   650  of  1,427.    Elapsed: 0:03:09.
  Batch   700  of  1,427.    Elapsed: 0:03:23.
  Batch   750  of  1,427.    Elapsed: 0:03:38.
  Batch   800  of  1,427.    Elapsed: 0:03:52.
  Batch   850  of  1,427.    Elapsed: 0:04:07.
  Batch   900  of  1,427.    Elapsed: 0:04:21.
  Batch   950  of  1,427.    Elapsed: 0:04:36.
  Batch 1,000  of  1,427.    Elapsed: 0:04:50.
  Batch 1,050  of  1,427.    Elapsed: 0:05:05.


In [33]:
# test loader
test_ds = TestDataset(test, tokenizer)
test_loader = DataLoader(test_ds, 8)

In [34]:
preds = []
model.eval()
preds_1 = []
for b in tqdm(test_loader):
    ids, atts = b['input_ids'], b['attention_mask']
    ids = torch.tensor(ids).long().cuda()
    atts = torch.tensor(atts).long().cuda()
    pred = model(ids, attention_mask=atts)
    logits1 = pred[0]
    logits1 = logits1.detach().cpu().numpy()
    preds_1.append(logits1)
    preds += list(np.argmax(pred[0].detach().cpu().numpy(), 1))
#     break
flat_predictions_1 = [item for sublist in preds_1 for item in sublist]

100%|██████████| 1142/1142 [00:26<00:00, 43.92it/s]


## Submission

In [63]:
submission = pd.read_csv("/content/drive/MyDrive/DACON/topic_classification/sample_submission.csv",error_bad_lines=False)

In [62]:
submission

Unnamed: 0,index,topic_idx,0,1,2,3,4,5,6
0,45654,2,0.280019,0.025736,0.517772,0.144985,0.030124,0.000499,0.000866
1,45655,3,0.000040,0.000025,0.003284,0.996496,0.000024,0.000012,0.000120
2,45656,2,0.263787,0.047058,0.639353,0.005075,0.001633,0.001072,0.042021
3,45657,2,0.429295,0.002638,0.481558,0.079717,0.001769,0.000081,0.004941
4,45658,3,0.000103,0.000007,0.002290,0.997491,0.000095,0.000002,0.000013
...,...,...,...,...,...,...,...,...,...
9126,54780,3,0.000008,0.000010,0.000852,0.999039,0.000007,0.000031,0.000052
9127,54781,2,0.000623,0.000053,0.959072,0.016907,0.002792,0.000102,0.020452
9128,54782,2,0.000370,0.000026,0.877696,0.110855,0.010826,0.000128,0.000099
9129,54783,0,0.364709,0.207981,0.328404,0.032724,0.064118,0.000903,0.001161


In [58]:
submission_1 = submission

In [71]:
submission_1

Unnamed: 0,index,topic_idx,0,1,2,3,4,5,6
0,45654,2,0.280019,0.025736,0.517772,0.144985,0.030124,0.000499,0.000866
1,45655,3,0.000040,0.000025,0.003284,0.996496,0.000024,0.000012,0.000120
2,45656,2,0.263787,0.047058,0.639353,0.005075,0.001633,0.001072,0.042021
3,45657,2,0.429295,0.002638,0.481558,0.079717,0.001769,0.000081,0.004941
4,45658,3,0.000103,0.000007,0.002290,0.997491,0.000095,0.000002,0.000013
...,...,...,...,...,...,...,...,...,...
9126,54780,3,0.000008,0.000010,0.000852,0.999039,0.000007,0.000031,0.000052
9127,54781,2,0.000623,0.000053,0.959072,0.016907,0.002792,0.000102,0.020452
9128,54782,2,0.000370,0.000026,0.877696,0.110855,0.010826,0.000128,0.000099
9129,54783,0,0.364709,0.207981,0.328404,0.032724,0.064118,0.000903,0.001161


In [72]:
import torch.nn.functional as F
z = torch.FloatTensor(flat_predictions_1)

probs_1 = F.softmax(z, dim=1)
probs_1 = (probs_1).tolist()

i, j, k=  [], [], []
m, n, o = [], [], []
l = []
for row in probs_1:
  i.append(row[0])
  j.append(row[1])
  k.append(row[2])
  l.append(row[3])
  m.append(row[4])
  n.append(row[5])
  o.append(row[6])


submission_1['0'] = i
submission_1['1'] = j
submission_1['2'] = k
submission_1['3'] = l
submission_1['4'] = m
submission_1['5'] = n
submission_1['6'] = o
submission_1.topic_idx = preds

submission_1.to_csv('/content/gpt2.csv',index = False)


In [66]:
submission['topic_idx'] = preds
submission.head(20)


Unnamed: 0,index,topic_idx
0,45654,2
1,45655,3
2,45656,2
3,45657,2
4,45658,3
5,45659,3
6,45660,5
7,45661,3
8,45662,4
9,45663,4


In [40]:
submission_1

Unnamed: 0,index,topic_idx,0,1,2,3,4,5,6
0,45654,2,0.280019,0.025736,0.517772,0.144985,0.030124,0.000499,0.000866
1,45655,3,0.000040,0.000025,0.003284,0.996496,0.000024,0.000012,0.000120
2,45656,2,0.263787,0.047058,0.639353,0.005075,0.001633,0.001072,0.042021
3,45657,2,0.429295,0.002638,0.481558,0.079717,0.001769,0.000081,0.004941
4,45658,3,0.000103,0.000007,0.002290,0.997491,0.000095,0.000002,0.000013
...,...,...,...,...,...,...,...,...,...
9126,54780,3,0.000008,0.000010,0.000852,0.999039,0.000007,0.000031,0.000052
9127,54781,2,0.000623,0.000053,0.959072,0.016907,0.002792,0.000102,0.020452
9128,54782,2,0.000370,0.000026,0.877696,0.110855,0.010826,0.000128,0.000099
9129,54783,0,0.364709,0.207981,0.328404,0.032724,0.064118,0.000903,0.001161


In [67]:
submission.to_csv('/content/gpt2_baseline_1.csv',index = False)
# submission.to_csv('bert_baseline_1.csv',index = True)

## Dacon api

In [69]:
!pip install dacon_submit_api-0.0.4-py3-none-any.whl

Processing ./dacon_submit_api-0.0.4-py3-none-any.whl
Installing collected packages: dacon-submit-api
Successfully installed dacon-submit-api-0.0.4


In [70]:
from dacon_submit_api import dacon_submit_api 

result = dacon_submit_api.post_submission_file(
'/content/gpt2_baseline_1.csv', # 파일경로
'6a7de677ffcfef22c5f3456f74335cd485ed812f66ec97268457b7c7af106bc1',  # 개인토큰
'235747', # 대회 id
'Healthy Guys',  # 팀이름
'gpt2_AdamW') # 노트

{'isSubmitted': True, 'detail': 'Success'}
