In [0]:
# Install Hugging Face transformers into the runtime; it is needed by the
# run_language_modeling.py script that a later cell launches with `python`.
# NOTE(review): the version is unpinned, so the installed release may not match
# the example script from the separately cloned repository; consider pinning.
!pip install transformers

In [0]:
# Fetch the FinBERT-QA repository and make it the working directory (%cd
# persists across cells), so the relative data paths and the `src` package
# import used by the rest of the notebook resolve against the repo root.
# NOTE(review): this cell is not re-runnable in the same session: a second run
# finds the clone already present and the cwd already inside the repository.
!git clone https://github.com/yuanbit/FinBERT-QA
%cd FinBERT-QA
from src.utils import *

In [0]:
import torch
import pickle
import csv
import regex as re
import pandas as pd

# Select the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

if device.type == 'cuda':
    # torch.cuda.memory_cached was renamed to torch.cuda.memory_reserved and
    # the old name is deprecated. Prefer the new name and only fall back to the
    # old one on PyTorch builds that predate the rename.
    cuda_reserved_fn = getattr(torch.cuda, 'memory_reserved', None) or torch.cuda.memory_cached

    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    # Byte counts for device 0 are converted to GiB and rounded to one decimal.
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(cuda_reserved_fn(0)/1024**3,1), 'GB')

# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

# Set the random seed manually for reproducibility.
# The trailing semicolon keeps the returned Generator repr out of the cell output.
torch.manual_seed(1234);

Using device: cuda

Tesla P100-PCIE-16GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


<torch._C.Generator at 0x7f3be570c130>

In [0]:
# Answer collection: a headerless TSV whose first two columns are labelled
# docid (identifier) and doc (text).
collection = (
    pd.read_csv("retriever/collection_cleaned.tsv", sep="\t", header=None)
    .rename(columns={0: 'docid', 1: 'doc'})
)

# Questions: keep only the id and text columns.
query_df = pd.read_csv("data/raw/FiQA_train_question_final.tsv", sep="\t")
query_columns = ['qid', 'question']
queries = query_df[query_columns]

# Pickled lookups, loaded in this order:
#   empty_docs    - list of empty docs
#   docid_to_text - docid to text mapping
#   qid_to_text   - qid to text mapping
empty_docs, docid_to_text, qid_to_text = map(load_pickle, (
    'data/id_to_text/empty_docs.pickle',
    'data/id_to_text/docid_to_text.pickle',
    'data/id_to_text/qid_to_text.pickle',
))

In [0]:
# Load the question/document pairs, drop pairs whose document is listed in
# empty_docs, and attach the question and answer text for each remaining pair.
qa_pairs = pd.read_csv("data/raw/FiQA_train_question_doc_final.tsv", sep="\t")
has_text = ~qa_pairs['docid'].isin(empty_docs)
dataset = qa_pairs.loc[has_text, ["qid", "docid"]]
dataset = dataset.assign(
    # Direct dict indexing is kept so an unknown id still raises KeyError.
    question=dataset['qid'].apply(lambda qid: qid_to_text[qid]),
    answer=dataset['docid'].apply(lambda docid: docid_to_text[docid]),
)

In [6]:
# Preview the first five rows (head() defaults to n=5).
dataset.head()

Unnamed: 0,qid,docid,question,answer
0,0,18850,What is considered a business expense on a bus...,The IRS Guidance pertaining to the subject. I...
1,1,14255,Claiming business expenses for a business with...,Yes you can claim your business deductions if ...
2,2,308938,Transferring money from One business checking ...,You should have separate files for each of the...
3,3,296717,Having a separate bank account for business/in...,Having a separate checking account for the bus...
4,3,100764,Having a separate bank account for business/in...,"You don't specify which country you are in, so..."


In [0]:
def add_ques_token(string):
    """Return `string` with the separator text " [SEP] " appended to it."""
    separator = " [SEP] "
    return string + separator

In [0]:
# Join each question (with its trailing separator) to its answer and keep the
# result as the only column, 'seq', of `dataset`. The row index is preserved.
sequences = dataset['question'].apply(add_ques_token) + dataset['answer']
dataset = sequences.to_frame(name='seq')

# Show one concatenated example, looked up by its row label.
dataset.at[17081, "seq"]

"Is it wise to switch investment strategy frequently? [SEP] My super fund and I would say many other funds give you one free switch of strategies per year.  Some suggest you should change from high growth option to a more balance option once you are say about 10 to 15 years from retirement, and then change to a more capital guaranteed option a few years from retirement. This is a more passive approach and has benefits as well as disadvantages. The benefit is that there is not much work involved, you just change your investment option based on your life stage, 2 to 3 times during your lifetime. This allows you to take more risk when you are young to aim for higher returns, take a balanced approach with moderate risk and returns during the middle part of your working life, and take less risk with lower returns (above inflation) during the latter part of your working life. A possible disadvantage of this strategy is you may be in the higher risk/ higher growth option during a market corre

In [0]:
# Write the sequences to a text file, one row per line, with no index column,
# no header row and no quoting applied by the CSV writer.
dataset.to_csv(
    'data/data.txt',
    sep="\t",
    header=False,
    index=False,
    quoting=csv.QUOTE_NONE,
)

In [0]:
!git clone https://github.com/huggingface/transformers.git

Cloning into 'transformers'...
remote: Enumerating objects: 19, done.[K
remote: Counting objects: 100% (19/19), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 23768 (delta 0), reused 9 (delta 0), pack-reused 23749[K
Receiving objects: 100% (23768/23768), 14.22 MiB | 14.48 MiB/s, done.
Resolving deltas: 100% (16827/16827), done.


In [0]:
!mkdir model

In [0]:
# Shell command that launches the transformers language-modeling example script
# on the generated data file, starting from bert-base-uncased and writing to the
# FinBERT-QA model directory.
#
# The command is built by joining an argument list rather than using backslash
# line continuations inside a triple-quoted string. The earlier form had a
# trailing space after the backslash on the --output_dir line, which left a
# literal "\ " plus a newline in the string, so the shell saw a stray argument
# and treated every following flag as a separate command.
cmd = " ".join([
    "python /content/transformers/examples/run_language_modeling.py",
    "--train_data_file /content/FinBERT-QA/data/data.txt",
    "--output_dir /content/FinBERT-QA/model",
    "--model_type bert",
    "--mlm",
    "--model_name_or_path bert-base-uncased",
    "--do_train",
    "--line_by_line",
    "--overwrite_output_dir",
    "--num_train_epochs 1",
    "--save_total_limit 2",
    "--block_size 512",
    "--save_steps 2000",
    "--per_gpu_train_batch_size 8",
    "--seed 42",
])

In [0]:
%%time
# Run the command held in `cmd` in a subshell ({cmd} is interpolated by
# IPython). The %%time cell magic above must stay on the first line of the
# cell and reports how long the whole run took.
!{cmd}

03/27/2020 12:18:14 - INFO - filelock -   Lock 140196865260568 acquired on /root/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.8f56353af4a709bf5ff0fbc915d8f5b42bfff892cbb6ac98c3c45f481a03c685.lock
03/27/2020 12:18:14 - INFO - transformers.file_utils -   https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmpnsdmynxv
Downloading: 100% 361/361 [00:00<00:00, 393kB/s]
03/27/2020 12:18:14 - INFO - transformers.file_utils -   storing https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json in cache at /root/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.8f56353af4a709bf5ff0fbc915d8f5b42bfff892cbb6ac98c3c45f481a03c685
03/27/2020 12:18:14 - INFO - transformers.file_utils -   creating metadata file for /root/.cache/torch/transformers/4dad0251492946e18ac39290