# Library Download and Load

In [None]:
!pip install transformers
!pip install pytorch-lightning
!pip install sentencepiece
!pip install bertviz
!pip install plotly
!pip install pyyaml==5.4.1

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 43.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 41.2 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 39.2 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 4.6 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: p

In [None]:

import json
import pandas as pd
import numpy as np
import torch
torch.cuda.empty_cache()
from pathlib import Path # 파일시스템의 경로를 단순한 문자열이 아닌 객체로 다루게 하는 것
from torch.utils.data import Dataset, DataLoader, SequentialSampler
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from sklearn.model_selection import train_test_split
from termcolor import colored
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim



from transformers import (
    AdamW,
    RobertaForSequenceClassification,
    RobertaModel,
    RobertaTokenizer,
    RobertaForQuestionAnswering,
    AutoModel,
    AutoTokenizer,
    BertTokenizerFast, AlbertModel,
    AlbertTokenizer,
    AutoModelForQuestionAnswering
)
from tqdm.auto import tqdm
from bertviz import head_view

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
import gc


import warnings

warnings.filterwarnings(action='ignore') 
from plotly import graph_objs as go

from collections import Counter
import plotly.express as px

In [None]:
# Single source of truth for the random seed used in this notebook.
RANDOM_SEED = 42
# Seed Lightning first, then NumPy and PyTorch explicitly, all from the same constant.
pl.seed_everything(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Global seed set to 42


In [None]:
# Mount Google Drive at /content/gdrive; the data paths used below live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')


Mounted at /content/gdrive


In [None]:
import os

# Folder on the mounted drive that holds the competition files.
DIR = "/content/gdrive/MyDrive/k-digital-goorm-3-korean-mrc/"
TRAIN_SOURCE = os.path.join(DIR, "train.json")
TEST_SOURCE = os.path.join(DIR, "test.json")

# Decode explicitly as UTF-8 so the Korean text is read the same way
# regardless of the platform's default locale encoding.
with open(TRAIN_SOURCE, encoding="utf-8") as f:
    TRAIN_DATA = json.load(f)

with open(TEST_SOURCE, encoding="utf-8") as f:
    TEST_DATA = json.load(f)

In [None]:
# Flatten the nested JSON (article -> paragraph -> question) into one row per question.
# Rows are collected in a plain list and converted once: enlarging a DataFrame with
# `.loc` one cell at a time re-allocates on every new row and becomes very slow.
train_records = []
for data in TRAIN_DATA['data']:
    for paragraphs in data['paragraphs']:
        for line in paragraphs['qas']:
            first_answer = line['answers'][0]  # only the first listed answer is kept
            train_records.append({
                'guid': line['guid'],
                'title': data['title'],
                'context': paragraphs['context'],
                'question': line['question'],
                'answer_start': first_answer['answer_start'],
                'text': first_answer['text'],
            })

# 'news_category' is never filled from the JSON, so it stays empty (NaN);
# it is listed only to keep the original column layout.
train = pd.DataFrame(
    train_records,
    columns=['guid', 'title', 'news_category', 'context', 'question', 'answer_start', 'text'],
)

In [None]:
# Flatten the test JSON into one row per question (no answers are read here).
# Build all rows first and create the DataFrame once instead of growing it with `.loc`,
# which re-allocates on every new row.
test_records = []
for data in TEST_DATA['data']:
    for paragraphs in data['paragraphs']:
        for line in paragraphs['qas']:
            test_records.append({
                'guid': line['guid'],
                'title': data['title'],
                'context': paragraphs['context'],
                'question': line['question'],
            })

# 'news_category' is never filled from the JSON, so it stays empty (NaN).
test = pd.DataFrame(
    test_records,
    columns=['guid', 'title', 'news_category', 'context', 'question'],
)

In [None]:
# Extra training rows stored as a CSV next to the competition files.
train_ai_hub = pd.read_csv(os.path.join(DIR, 'ai_hub(도서).csv'))

In [None]:
# Stack the CSV rows below the JSON rows and renumber the index 0..n-1 in one step.
# NOTE: re-running this cell appends `train_ai_hub` again, because it overwrites `train`.
train = pd.concat([train, train_ai_hub], ignore_index=True)

In [None]:
# v1 kept the first 30000 rows; v2 keeps the first 50000 rows of the combined frame.
train = train.head(50000)

In [None]:

def get_answers(x):
    """Pack an (answer_start, text) pair into the dict layout used by the feature builders.

    Args:
        x: A two-element sequence or pandas row whose first value is the answer's
            character offset and whose second value is the answer text.

    Returns:
        ``{'answer_start': [start], 'text': [text]}`` with one-element lists.
    """
    # Read the values by position through a plain list: integer `x[0]` on a
    # string-labelled pandas Series is deprecated positional indexing.
    values = list(x)
    start = values[0]
    text = values[1]
    return {
        'answer_start': [start],
        'text': [text]
    }


# Build one answers dict per row from the 'answer_start' and 'text' columns.
train['answers'] = train[['answer_start', 'text']].apply(get_answers, axis=1)

In [None]:

# Hold out 10% of the rows for validation; the fixed random_state keeps the split repeatable.
train_df, val_df = train_test_split(train, test_size=0.1, random_state=42)

# Renumber both parts from 0 after the shuffle.
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

train_df.shape, val_df.shape

((45000, 8), (5000, 8))

In [None]:
# Maximum number of tokens per tokenized window; read by prepare_train_features.
max_length = 512

In [None]:
# Name of the pretrained checkpoint; the tokenizer here and the QA model later are both loaded from it.
MODEL_NAME = 'kykim/electra-kor-base'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/620 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/336k [00:00<?, ?B/s]

In [None]:
# Show one training row to check the columns and the answers dict.
train_df.iloc[10]

guid                              f90b73569bcf41e2a25a2487fdc14320
title                                 사외이사의 겸임 요청에도...김정태 회장 끝내 고사
news_category                                                  NaN
context          지난달 13일 하나·외환은행의 조기 합병이 발표된 뒤 한 달 넘게 금융권에선 누가 ...
question                                       김정태가 등기이사로 선임된 날짜는?
answer_start                                                    89
text                                                           23일
answers                    {'answer_start': [89], 'text': ['23일']}
Name: 10, dtype: object

In [None]:
# Tokenize a single row by hand, with the same tokenizer arguments prepare_train_features uses.
example = train_df.iloc[11]
tokenized_example = tokenizer(
        example["question"],
        example["context"],
        truncation="only_second",
        max_length=max_length,
        stride=50, 
        return_overflowing_tokens=True, # whether to also return the tokens that overflow max_length
        return_offsets_mapping=True,  # whether to return (char_start, char_end) for every token
        padding="max_length",
    )

In [None]:
def prepare_train_features(example, tokenizer):
    """Tokenize one QA example into training features labelled with answer token positions.

    The question/context pair is tokenized with truncation on the context only, so a long
    context yields several overlapping windows (one feature per window). The window
    length is read from the module-level ``max_length``.

    Args:
        example: Mapping or pandas row with ``"question"``, ``"context"`` and ``"answers"``
            (a dict with ``"answer_start"`` and ``"text"`` lists). It is not modified.
        tokenizer: Callable tokenizer that supports offset mappings and overflowing
            tokens, and exposes ``cls_token_id``.

    Returns:
        A list of dicts with ``input_ids``, ``attention_mask``, ``offset_mapping``,
        ``start_position`` and ``end_position``. Windows that do not fully contain the
        answer (or examples without an answer) point both positions at the [CLS] token.
    """
    # Strip into a local variable so the caller's `example` is left untouched.
    question = example["question"].lstrip()
    tokenized_example = tokenizer(
        question,
        example["context"],
        truncation="only_second",
        max_length=max_length,
        stride=50,
        return_overflowing_tokens=True,  # also return the windows that overflow max_length
        return_offsets_mapping=True,  # return (char_start, char_end) for every token
        padding="max_length",
    )

    # Only one example is tokenized per call, so the window->sample mapping is not needed.
    tokenized_example.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_example.pop("offset_mapping")
    answers = example["answers"]

    features = []
    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_example["input_ids"][i]
        feature = {
            'input_ids': input_ids,
            'attention_mask': tokenized_example["attention_mask"][i],
            'offset_mapping': offsets,
        }

        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized_example.sequence_ids(i)

        if len(answers["answer_start"]) == 0:
            start_position = end_position = cls_index
        else:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # First and last token of this window that belong to the context (sequence id 1).
            context_start = 0
            while sequence_ids[context_start] != 1:
                context_start += 1

            context_end = len(input_ids) - 1
            while sequence_ids[context_end] != 1:
                context_end -= 1

            if not (offsets[context_start][0] <= start_char and offsets[context_end][1] >= end_char):
                # The answer is not fully inside this window.
                start_position = end_position = cls_index
            else:
                # Both scans are bounded by the context span. Special and padding tokens
                # carry the offset (0, 0), so an unbounded scan would walk past the
                # context when the answer starts in its last token.
                token_start_index = context_start
                while token_start_index <= context_end and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                start_position = token_start_index - 1

                token_end_index = context_end
                while token_end_index >= context_start and offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                end_position = token_end_index + 1

        feature["start_position"] = start_position
        feature["end_position"] = end_position
        features.append(feature)
    return features

In [None]:
train_features, valid_features = [], []
# `iterrows()` is a generator without a length, so `total` is passed explicitly;
# otherwise tqdm can only show a bare counter instead of a progress bar.
for _, row in tqdm(train_df.iterrows(), total=len(train_df), desc="train features"):
    train_features.extend(prepare_train_features(row, tokenizer))
for _, row in tqdm(val_df.iterrows(), total=len(val_df), desc="valid features"):
    valid_features.extend(prepare_train_features(row, tokenizer))

0it [00:00, ?it/s]

In [None]:
class DatasetRetriever(Dataset):
    """Map-style dataset over a list of pre-tokenized feature dicts.

    With ``mode='train'`` every item is a dict of ``torch.long`` tensors including the
    answer positions. With any other mode the ids and mask are tensors, and the offset
    mapping, sequence ids, example id, context and question are passed through as stored.
    """

    def __init__(self, features, mode='train'):
        super().__init__()
        self.features = features
        self.mode = mode

    def __len__(self):
        return len(self.features)

    def __getitem__(self, item):
        feature = self.features[item]

        def as_long(key):
            # Convert one stored field into an int64 tensor.
            return torch.tensor(feature[key], dtype=torch.long)

        if self.mode != 'train':
            return {
                'input_ids': as_long('input_ids'),
                'attention_mask': as_long('attention_mask'),
                'offset_mapping': feature['offset_mapping'],
                'sequence_ids': feature['sequence_ids'],
                'id': feature['example_id'],
                'context': feature['context'],
                'question': feature['question'],
            }

        tensor_keys = ('input_ids', 'attention_mask', 'offset_mapping', 'start_position', 'end_position')
        return {key: as_long(key) for key in tensor_keys}

In [None]:
# Wrap the training features in the dataset class (default mode 'train').
train_dataset = DatasetRetriever(train_features)

In [None]:
# Sanity check: print the first item only (the loop stops after one iteration).
for data in train_dataset:
  print("Input ids: ", data['input_ids'][:10])
  print("start_positions: ", data['start_position'])
  print("end_positions: ", data['end_position'])
  break

Input ids:  tensor([    2,  7294, 31318,  6928,  4681,  4741,  2585,  7102,  4584,  9651])
start_positions:  tensor(139)
end_positions:  tensor(139)


In [None]:
class KorQuadDataModule(pl.LightningDataModule):
  """Lightning data module serving the pre-built train/validation feature lists.

  Args:
      train_features: Feature dicts used for the training dataset.
      valid_features: Feature dicts used for the validation (and test) dataset.
      tokenizer: Tokenizer object; stored on the instance but not used by this class.
      batch_size: Batch size of every DataLoader.
      source_max_token_len: Stored on the instance but not used by this class.
  """

  def __init__(
      self,
      train_features,
      valid_features,
      tokenizer,
      batch_size: int = 4,
      source_max_token_len: int = 1024
  ):
    super().__init__()
    # Keep the features on the instance so `setup` uses what was passed in,
    # instead of whatever module-level variables happen to have the same names.
    self.train_features = train_features
    self.valid_features = valid_features
    self.tokenizer = tokenizer
    self.batch_size = batch_size
    self.source_max_token_len = source_max_token_len

  def setup(self, stage=None):
    """Create the train and validation datasets from the stored features."""
    self.train_dataset = DatasetRetriever(self.train_features)
    self.valid_dataset = DatasetRetriever(self.valid_features)

  def train_dataloader(self):
    """Shuffled loader over the training dataset."""
    return DataLoader(
        self.train_dataset,
        batch_size=self.batch_size,
        shuffle=True,
        num_workers=4
    )

  def val_dataloader(self):
    """Unshuffled loader over the validation dataset."""
    return DataLoader(
        self.valid_dataset,
        batch_size=self.batch_size,
        num_workers=4
    )

  def test_dataloader(self):
    """Unshuffled loader for testing; it serves the validation dataset as well."""
    return DataLoader(
        self.valid_dataset,
        batch_size=self.batch_size,
        num_workers=4
    )

In [None]:

# Training hyper-parameters.
BATCH_SIZE =4
N_EPOCHS = 10
# Build the data module and create its datasets right away.
data_module = KorQuadDataModule(train_features, valid_features, tokenizer, batch_size=BATCH_SIZE)
data_module.setup() 


In [None]:
class KorQuadModel(pl.LightningModule):
  """Lightning wrapper around a pretrained extractive question-answering model.

  The backbone is loaded from ``MODEL_NAME``. Training, validation and test steps all
  optimise/log the mean of the start-position and end-position cross-entropy losses.
  """

  def __init__(self):
    super().__init__()
    # The attribute name is kept as-is: parameter names in saved checkpoints are
    # prefixed with it, so renaming it would break loading of existing checkpoints.
    self.xlm_roberta  = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME, output_hidden_states=True)

  def _init_weights(self, module):
    """Initialise an ``nn.Linear`` with N(0, 0.02) weights and zero bias (not called in this class)."""
    if isinstance(module, nn.Linear):
      module.weight.data.normal_(mean=0.0, std=0.02)
      if module.bias is not None:
        module.bias.data.zero_()

  def forward(self, input_ids, attention_mask):
    """Return ``(start_logits, end_logits)`` of the backbone for the given batch."""
    output = self.xlm_roberta(input_ids, attention_mask)
    start_logits = output.start_logits
    end_logits = output.end_logits
    return start_logits, end_logits

  def _qa_loss(self, batch):
    """Mean of the start and end cross-entropy losses for one batch."""
    # `self.device` is the device this module lives on; using it removes the
    # dependency on a module-level `device` variable.
    input_ids = batch['input_ids'].to(self.device)
    attention_mask = batch['attention_mask'].to(self.device)
    start_label = batch['start_position'].to(self.device)
    end_label = batch['end_position'].to(self.device)

    start_pred, end_pred = self(input_ids, attention_mask)

    criterion = nn.CrossEntropyLoss(ignore_index=-1)
    start_loss = criterion(start_pred, start_label)
    end_loss = criterion(end_pred, end_label)
    return (start_loss + end_loss) / 2

  def training_step(self, batch, batch_idx):
    """Compute and log ``train_loss``."""
    loss = self._qa_loss(batch)
    self.log("train_loss", loss, prog_bar=True, logger=True)
    return loss

  def validation_step(self, batch, batch_idx):
    """Compute and log ``val_loss``."""
    loss = self._qa_loss(batch)
    self.log("val_loss", loss, prog_bar=True, logger=True)
    return loss

  def test_step(self, batch, batch_idx):
    """Compute and log ``test_loss``."""
    loss = self._qa_loss(batch)
    self.log("test_loss", loss, prog_bar=True, logger=True)
    return loss

  def predict_step(self, batch, batch_idx, dataloader_idx=0):
    """Return ``(start_logits, end_logits)`` for a batch without computing a loss."""
    input_ids = batch['input_ids'].to(self.device)
    attention_mask = batch['attention_mask'].to(self.device)
    return self(input_ids, attention_mask)

  def configure_optimizers(self):
    """AdamW over all parameters with a fixed learning rate."""
    optimizer = AdamW(self.parameters(), lr=0.00005) # 0.0001 -> 0.00005
    return optimizer

In [None]:
# Instantiate the Lightning module; its __init__ loads the pretrained backbone for MODEL_NAME.
model = KorQuadModel()

Downloading:   0%|          | 0.00/870 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at monologg/kobigbird-bert-base were not used when initializing BigBirdForQuestionAnswering: ['cls.predictions.transform.dense.weight', 'bert.pooler.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'bert.pooler.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BigBirdForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of 

In [None]:
logger = TensorBoardLogger("lightning_logs", name='Korquad_log')
# Stop training once val_loss has not improved for 2 consecutive validation runs.
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=2)

# Keep only the checkpoint with the lowest val_loss. The folder goes in `dirpath` and the
# bare file name in `filename`; the resulting file is the same
# /content/gdrive/MyDrive/data/model.ckpt as before.
# NOTE(review): the checkpoint-loading cell reads a model.ckpt from a different folder — confirm
# which location is intended.
checkpoint_callback = ModelCheckpoint(
    dirpath="/content/gdrive/MyDrive/data",
    filename="model",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min")

In [None]:
# max_epochs comes from the N_EPOCHS constant defined with the other hyper-parameters,
# and a GPU is requested only when one is actually available.
trainer = pl.Trainer(
    logger=logger,
    callbacks=[checkpoint_callback, early_stopping_callback],
    max_epochs=N_EPOCHS,
    gpus=1 if torch.cuda.is_available() else 0,
    progress_bar_refresh_rate=30)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
# Train `model` on the loaders provided by `data_module`.
trainer.fit(model, data_module)

Missing logger folder: lightning_logs/Korquad_log
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                        | Params
------------------------------------------------------------
0 | xlm_roberta | BigBirdForQuestionAnswering | 117 M 
------------------------------------------------------------
117 M     Trainable params
0         Non-trainable params
117 M     Total params
471.555   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 0, global step 11293: 'val_loss' reached 5.71004 (best 5.71004), saving model to '/content/gdrive/MyDrive/data/model.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 1, global step 22586: 'val_loss' was not in top 1


In [None]:

# Open TensorBoard inside the notebook on the ./lightning_logs folder.
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs

In [None]:

# `load_from_checkpoint` is a classmethod that builds the module itself, so it is called on
# the class; calling it on `KorQuadModel()` first constructs a throwaway instance (loading
# the pretrained backbone a second time) and is rejected by newer Lightning releases.
# NOTE(review): this path differs from the folder the ModelCheckpoint callback writes to — confirm
# the checkpoint file is the intended one.
trained_model = KorQuadModel.load_from_checkpoint("/content/gdrive/MyDrive/k-digital-goorm-3-korean-mrc/model.ckpt")
# Switch to inference: no gradients, eval mode.
trained_model.freeze()

In [None]:
# Run a garbage-collection pass before inference; the displayed number is its return value.
gc.collect()

2743

## Inference

In [None]:

def prepare_test_features(example, tokenizer, max_length=512, stride=50):
    """Tokenize one example into inference features (no answer labels).

    Args:
        example: Mapping or pandas row with ``"guid"``, ``"question"`` and ``"context"``.
            It is not modified.
        tokenizer: Callable tokenizer that supports offset mappings and overflowing tokens.
        max_length: Maximum number of tokens per window (default keeps the previous 512).
        stride: Number of overlapping tokens between consecutive windows (default 50).

    Returns:
        One dict per window with ``example_id``, ``context``, ``question`` (left-stripped),
        ``input_ids``, ``attention_mask``, ``offset_mapping`` and ``sequence_ids``.
        In ``sequence_ids`` special tokens (``None``) are stored as 0, so only context
        tokens carry the value 1.
    """
    # Strip into a local variable so the caller's `example` is left untouched.
    question = example["question"].lstrip()

    tokenized_example = tokenizer(
        question,
        example["context"],
        truncation="only_second",
        max_length=max_length,
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    features = []
    for window in range(len(tokenized_example["input_ids"])):
        features.append({
            "example_id": example['guid'],
            'context': example['context'],
            'question': question,
            'input_ids': tokenized_example['input_ids'][window],
            'attention_mask': tokenized_example['attention_mask'][window],
            'offset_mapping': tokenized_example['offset_mapping'][window],
            'sequence_ids': [
                0 if seq_id is None else seq_id
                for seq_id in tokenized_example.sequence_ids(window)
            ],
        })
    return features

In [None]:
import collections

def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 20):
    """Convert per-feature start/end logits into one answer string per example.

    Args:
        examples: DataFrame with ``"guid"`` and ``"context"`` columns (any index).
        features: List of feature dicts as produced by ``prepare_test_features``;
            ``example_id``, ``sequence_ids`` and ``offset_mapping`` are read. Not modified.
        raw_predictions: ``(all_start_logits, all_end_logits)``, each indexable by the
            position of a feature in ``features``.
        n_best_size: Number of top start and top end positions combined per feature.
        max_answer_length: Maximum answer length in tokens.

    Returns:
        ``collections.OrderedDict`` mapping each example guid to the text of its
        highest-scoring span (score = start logit + end logit), or ``""`` when no
        valid span exists.
    """
    all_start_logits, all_end_logits = raw_predictions

    # Group feature positions by the guid of the example they were cut from. Matching on
    # the guid itself (not on a positional index compared with index labels) keeps this
    # correct for DataFrames whose index is not 0..n-1.
    features_per_example = collections.defaultdict(list)
    for feature_index, feature in enumerate(features):
        features_per_example[feature["example_id"]].append(feature_index)

    predictions = collections.OrderedDict()

    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    context_index = 1  # sequence id that marks context tokens
    for _, example in examples.iterrows():
        valid_answers = []
        context = example["context"]

        for feature_index in features_per_example.get(example["guid"], []):
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]

            feature = features[feature_index]
            sequence_ids = feature["sequence_ids"]

            # Local masked copy: offsets of non-context tokens become None so spans that
            # touch the question or special tokens are rejected. The caller's feature
            # dicts are left unchanged.
            offset_mapping = [
                (offsets if sequence_ids[k] == context_index else None)
                for k, offsets in enumerate(feature["offset_mapping"])
            ]

            # Indices of the n_best_size largest logits, best first.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        if valid_answers:
            # `max` returns the first best-scoring candidate, like a stable descending sort.
            best_answer = max(valid_answers, key=lambda answer: answer["score"])
        else:
            best_answer = {"text": "", "score": 0.0}

        predictions[example["guid"]] = best_answer["text"]

    return predictions

In [None]:

# One or more inference features per test row, concatenated in row order.
test_features = [
    feature
    for _, row in test.iterrows()
    for feature in prepare_test_features(row, tokenizer)
]

In [None]:

# Inference dataset (mode='test': no answer positions in the items).
test_dataset = DatasetRetriever(test_features, mode='test')
# Sequential, unshuffled loader that keeps every feature (drop_last=False).
test_dataloader = DataLoader(
    test_dataset,
    batch_size=4, 
    sampler=SequentialSampler(test_dataset),
    num_workers=4,
    pin_memory=True, 
    drop_last=False
)

In [None]:
def get_predictions(trained_model, dataloader=None):
    """Run the model over a dataloader and collect start/end logits for every feature.

    Args:
        trained_model: Module whose call returns ``(start_logits, end_logits)`` for
            ``(input_ids, attention_mask)``.
        dataloader: Batches with ``'input_ids'`` and ``'attention_mask'`` tensors.
            Defaults to the module-level ``test_dataloader``.

    Returns:
        ``(start_logits, end_logits)`` as float64 NumPy arrays with one row per feature,
        in dataloader order.
    """
    if dataloader is None:
        dataloader = test_dataloader

    model = trained_model.to(device)
    # Inference must run in eval mode (dropout disabled); the previous mode is restored
    # afterwards so a model that is still being trained is not left in eval mode.
    was_training = model.training
    model.eval()

    start_logits = []
    end_logits = []
    try:
        with torch.no_grad():
            for batch in tqdm(dataloader):
                # Move inputs to the same `device` as the model instead of assuming CUDA.
                outputs_start, outputs_end = model(
                    batch['input_ids'].to(device),
                    batch['attention_mask'].to(device),
                )
                start_logits.append(outputs_start.cpu().numpy())
                end_logits.append(outputs_end.cpu().numpy())
                del outputs_start, outputs_end
    finally:
        model.train(was_training)
    gc.collect()
    # Cast to float64 to keep the dtype the Python-list based version returned.
    return (
        np.vstack(start_logits).astype(np.float64),
        np.vstack(end_logits).astype(np.float64),
    )

In [None]:
# NOTE(review): this passes `model` (the module given to trainer.fit), while the validation
# cell further down passes `trained_model` (the module loaded from a checkpoint file) —
# confirm which one is meant to produce the submission.
start_logits, end_logits = get_predictions(trained_model=model)
# Turn the raw logits into one answer string per test guid.

fin_preds = postprocess_qa_predictions(test, test_features, (start_logits, end_logits))

  0%|          | 0/1532 [00:00<?, ?it/s]

Attention type 'block_sparse' is not possible if sequence_length: 512 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Post-processing 4008 example predictions split into 6127 features.


In [None]:
from string import punctuation
# Clean every predicted span: collapse runs of whitespace into single spaces,
# then trim ASCII punctuation from both ends.
submission = [
    (example_guid, " ".join(answer.split()).strip(punctuation))
    for example_guid, answer in fin_preds.items()
]

sample = pd.DataFrame(submission, columns=["id", "Predicted"])

In [None]:
# Write the submission into the `out` sub-folder of the data directory (same file path as
# before), creating the folder first so `to_csv` does not fail when it is missing.
OUT_DIR = os.path.join(DIR, 'out')
os.makedirs(OUT_DIR, exist_ok=True)
sample.to_csv(os.path.join(OUT_DIR, 'BIGBIRD_with_ai_hub.csv'), index=False)
# Rename the key column so the predictions can be joined onto the test rows by guid.
sample = sample.rename({'id':'guid'}, axis='columns')

test_data = pd.merge(left=test, right=sample, on='guid')

In [None]:
# Display the test rows merged with their predicted answers.
test_data

In [None]:
# First 15 merged rows for a quick manual check.
test_data.head(15)

## validation data 에서 성능테스트

In [None]:

# Inference-style features (no answer labels) for the validation rows, in row order.
val_features = [
    feature
    for _, row in val_df.iterrows()
    for feature in prepare_test_features(row, tokenizer)
]

In [None]:
# Validation rows wrapped as an inference dataset (mode='test').
val_test_dataset = DatasetRetriever(val_features, mode='test')
# Sequential, unshuffled loader that keeps every feature (drop_last=False).
val_test_dataloader = DataLoader(
    val_test_dataset,
    batch_size=4, 
    sampler=SequentialSampler(val_test_dataset),
    num_workers=4,
    pin_memory=True, 
    drop_last=False
)

In [None]:
def get_val_predictions(trained_model, dataloader=None):
    """Run the model over the validation dataloader and collect start/end logits.

    Args:
        trained_model: Module whose call returns ``(start_logits, end_logits)`` for
            ``(input_ids, attention_mask)``.
        dataloader: Batches with ``'input_ids'`` and ``'attention_mask'`` tensors.
            Defaults to the module-level ``val_test_dataloader``.

    Returns:
        ``(start_logits, end_logits)`` as float64 NumPy arrays with one row per feature,
        in dataloader order.
    """
    if dataloader is None:
        dataloader = val_test_dataloader

    model = trained_model.to(device)
    # Inference must run in eval mode (dropout disabled); the previous mode is restored
    # afterwards so a model that is still being trained is not left in eval mode.
    was_training = model.training
    model.eval()

    start_logits = []
    end_logits = []
    try:
        with torch.no_grad():
            for batch in tqdm(dataloader):
                # Move inputs to the same `device` as the model instead of assuming CUDA.
                outputs_start, outputs_end = model(
                    batch['input_ids'].to(device),
                    batch['attention_mask'].to(device),
                )
                start_logits.append(outputs_start.cpu().numpy())
                end_logits.append(outputs_end.cpu().numpy())
                del outputs_start, outputs_end
    finally:
        model.train(was_training)
    gc.collect()
    # Cast to float64 to keep the dtype the Python-list based version returned.
    return (
        np.vstack(start_logits).astype(np.float64),
        np.vstack(end_logits).astype(np.float64),
    )

In [None]:
# Logits of the checkpoint-loaded model on the validation features.
start_logits, end_logits = get_val_predictions(trained_model=trained_model)
# Turn the raw logits into one answer string per validation guid.

fin_preds = postprocess_qa_predictions(val_df, val_features, (start_logits, end_logits))

In [None]:
from string import punctuation
# Clean every predicted span: collapse runs of whitespace into single spaces,
# then trim ASCII punctuation from both ends.
submission = [
    (example_guid, " ".join(answer.split()).strip(punctuation))
    for example_guid, answer in fin_preds.items()
]

sample = pd.DataFrame(submission, columns=["id", "Predicted"])


In [None]:
# Attach each prediction to its row by guid rather than by index position, so the result
# does not depend on `sample` and `val_df` sharing the same index and row order.
val_df['Predicted'] = val_df['guid'].map(sample.set_index('id')['Predicted'])