# Data preparation

## Setup

In [35]:
## Install required packages
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116
!pip install transformers==4.21.2
!pip install sentencepiece==0.1.96
!pip install numpy requests nlpaug

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu116
Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
     -------------------------------------- 410.5/410.5 kB 2.6 MB/s eta 0:00:00
Collecting gdown>=4.0.0
  Downloading gdown-4.5.1.tar.gz (14 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: gdown
  Building wheel for gdown (pyproject.toml): started
  Building wheel for gdown (pyproject.toml): finished with status 'done'
  Created wheel for gdown: filename=gdown-4.5.1-py3-none-any.whl size=14933 sha256=e1b4b55e6c082c672c6a7d982c445e4c55706116cd89e9c14078a068a588eb14
  Stored in directory: c:\users\hung\appdata\loca

In [40]:
import os
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_fscore_support, classification_report
import torch.nn as nn
import torch.optim as optim
import seaborn as sns
import torch
import transformers
from transformers import BertForSequenceClassification, BertTokenizerFast, BertModel
from torch.utils.data import DataLoader
from utils import functions, models, dataset    

In [2]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Show the name of the active GPU, or 'cpu' when no CUDA device is available.
torch.cuda.get_device_name(device=None) if torch.cuda.is_available() else 'cpu'


'NVIDIA GeForce GTX 1060 6GB'

In [None]:
## If using on Google Colab, run this cell
# google.colab only exists inside a Colab runtime; skip the mount everywhere else so
# that "Run All" also works on a local machine.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive/')
    os.chdir('/content/drive/My Drive')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Load data and clean

In [3]:
# Load the cleaned CAP data; the path is relative to the current working directory.
cap_deu = pd.read_csv(os.path.join("shared_data", "CAP","cap_data_cleaned.csv"), encoding="utf-8")

In [4]:
cap_deu.head(5)

Unnamed: 0.1,Unnamed: 0,election_year,party_name,party_id,manifesto_id,sentence_id,cap_topic,cap_subtopic,header,junk,eu_dummy,sentence_text
0,1,1949,SPD,1,19491,1,99,99,1,0,0,Für ein freies Deutschland
1,2,1949,SPD,1,19491,2,99,99,0,0,0,Wählerinnen und Wähler!
2,3,1949,SPD,1,19491,3,99,99,0,0,0,"Am 14. August entscheidet XX darüber, wie ein ..."
3,4,1949,SPD,1,19491,4,20,2099,0,0,0,Heute ist unser land geteilt.
4,5,1949,SPD,1,19491,5,20,2099,0,0,0,Swjetrussland hat seine Besatzungszone separiert.


In [5]:
cap_deu.groupby('cap_topic').count()

Unnamed: 0_level_0,Unnamed: 0,election_year,party_name,party_id,manifesto_id,sentence_id,cap_subtopic,header,junk,eu_dummy,sentence_text
cap_topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,2,2,2,2,2,2,2,2,2,2,2
1,7127,7127,7127,7127,7127,7127,7127,7127,7127,7127,7127
2,5094,5094,5094,5094,5094,5094,5094,5094,5094,5094,5094
3,3014,3014,3014,3014,3014,3014,3014,3014,3014,3014,3014
4,1658,1658,1658,1658,1658,1658,1658,1658,1658,1658,1658
5,4673,4673,4673,4673,4673,4673,4673,4673,4673,4673,4673
6,4213,4213,4213,4213,4213,4213,4213,4213,4213,4213,4213
7,3461,3461,3461,3461,3461,3461,3461,3461,3461,3461,3461
8,2329,2329,2329,2329,2329,2329,2329,2329,2329,2329,2329
9,1621,1621,1621,1621,1621,1621,1621,1621,1621,1621,1621


In [6]:
## Remove rows with topic = 0,21,23,25,61 because of too few data points
# .copy() makes the filtered frame independent of the original, so the column
# assignments in later cells do not trigger pandas' SettingWithCopyWarning.
cap_deu = cap_deu[~cap_deu.cap_topic.isin([0,21,23,25,61])].copy()
cap_deu.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73830 entries, 0 to 75189
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Unnamed: 0     73830 non-null  int64 
 1   election_year  73830 non-null  int64 
 2   party_name     73830 non-null  object
 3   party_id       73830 non-null  int64 
 4   manifesto_id   73830 non-null  int64 
 5   sentence_id    73830 non-null  int64 
 6   cap_topic      73830 non-null  int64 
 7   cap_subtopic   73830 non-null  int64 
 8   header         73830 non-null  int64 
 9   junk           73830 non-null  int64 
 10  eu_dummy       73830 non-null  int64 
 11  sentence_text  73830 non-null  object
dtypes: int64(10), object(2)
memory usage: 7.3+ MB


In [7]:
# Derive the recoded topic label from the original topic code with the project helper
# functions.recode_topic (defined in utils; its mapping is not shown in this notebook).
cap_deu['cap_topic_new'] = cap_deu['cap_topic'].apply(functions.recode_topic)

In [8]:
cap_deu.groupby('cap_topic_new').count()

Unnamed: 0_level_0,Unnamed: 0,election_year,party_name,party_id,manifesto_id,sentence_id,cap_topic,cap_subtopic,header,junk,eu_dummy,sentence_text
cap_topic_new,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,11494,11494,11494,11494,11494,11494,11494,11494,11494,11494,11494,11494
1,5094,5094,5094,5094,5094,5094,5094,5094,5094,5094,5094,5094
2,3014,3014,3014,3014,3014,3014,3014,3014,3014,3014,3014,3014
3,1658,1658,1658,1658,1658,1658,1658,1658,1658,1658,1658,1658
4,10481,10481,10481,10481,10481,10481,10481,10481,10481,10481,10481,10481
5,4213,4213,4213,4213,4213,4213,4213,4213,4213,4213,4213,4213
6,3461,3461,3461,3461,3461,3461,3461,3461,3461,3461,3461,3461
7,2329,2329,2329,2329,2329,2329,2329,2329,2329,2329,2329,2329
8,1621,1621,1621,1621,1621,1621,1621,1621,1621,1621,1621,1621
9,2139,2139,2139,2139,2139,2139,2139,2139,2139,2139,2139,2139


## Adding metadata and speakers' background to the model

In [9]:
## Sentence length
# Number of whitespace-separated tokens in each sentence.
cap_deu.loc[:, 'length'] = cap_deu.sentence_text.str.split().str.len() ## sentence length


In [12]:
set(cap_deu.party_name)

{'CDU', 'Die Linke', 'FDP', 'Grünen', 'PDS', 'SPD'}

In [10]:
# Harmonise the raw party names into the label set used downstream
# (mapping implemented in functions.party_new, defined in utils).
cap_deu['party'] = cap_deu['party_name'].apply(functions.party_new)

In [14]:
set(cap_deu.party)

{'CDU/CSU', 'FDP', 'GRUENEN', 'PDS/DIE LINKE', 'SPD'}

In [15]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the harmonised party label: one indicator column per party.
arr_party = np.reshape(np.array(cap_deu['party']), (-1,1))
encoder = OneHotEncoder(sparse=False)
# fit_transform fits the encoder and returns the encoded matrix in a single pass;
# categories_ then gives the party name belonging to each output column.
encoded_party_data = encoder.fit_transform(arr_party)
encoded_party_name = encoder.categories_[0].tolist()

In [16]:
# Append the one-hot party columns. Both frames get a fresh 0..n-1 index first so that
# the column-wise concat aligns rows by position.
df_toappend_1 = pd.DataFrame(encoded_party_data, columns = encoded_party_name)
cap_deu =  pd.concat([cap_deu.reset_index(drop=True),df_toappend_1.reset_index(drop=True)], axis=1)

In [17]:
cap_deu.head()

Unnamed: 0.1,Unnamed: 0,election_year,party_name,party_id,manifesto_id,sentence_id,cap_topic,cap_subtopic,header,junk,eu_dummy,sentence_text,cap_topic_new,length,party,CDU/CSU,FDP,GRUENEN,PDS/DIE LINKE,SPD
0,1,1949,SPD,1,19491,1,99,99,1,0,0,Für ein freies Deutschland,16,4,SPD,0.0,0.0,0.0,0.0,1.0
1,2,1949,SPD,1,19491,2,99,99,0,0,0,Wählerinnen und Wähler!,16,3,SPD,0.0,0.0,0.0,0.0,1.0
2,3,1949,SPD,1,19491,3,99,99,0,0,0,"Am 14. August entscheidet XX darüber, wie ein ...",16,17,SPD,0.0,0.0,0.0,0.0,1.0
3,4,1949,SPD,1,19491,4,20,2099,0,0,0,Heute ist unser land geteilt.,15,5,SPD,0.0,0.0,0.0,0.0,1.0
4,5,1949,SPD,1,19491,5,20,2099,0,0,0,Swjetrussland hat seine Besatzungszone separiert.,15,5,SPD,0.0,0.0,0.0,0.0,1.0


In [18]:
# Row-wise feature computed by functions.gov_last from party and election year.
# Presumably flags whether the party was in government before that election — confirm in utils.functions.
cap_deu['gov_last'] = cap_deu.apply(lambda x: functions.gov_last(x['party'], x['election_year']), axis =1)

In [19]:
set(cap_deu.loc[(cap_deu.election_year == 1972) & (cap_deu.party == 'CDU/CSU'), 'gov_last'])

{0}

In [20]:
# Row-wise feature computed by functions.opp_last from party and election year.
# Presumably flags whether the party was in opposition before that election — confirm in utils.functions.
cap_deu['opp_last'] = cap_deu.apply(lambda x: functions.opp_last(x['party'], x['election_year']), axis =1)

In [21]:
set(cap_deu.loc[(cap_deu.election_year == 1972) & (cap_deu.party == 'CDU/CSU'), 'opp_last'])

{1}

In [22]:
from sklearn.preprocessing import OrdinalEncoder
# Replace each election year by an ordinal code, stored as 'year_recoded'.
# Note: this rebinds the name `encoder`, which previously held the party OneHotEncoder.
arr_years = np.reshape(np.array(cap_deu['election_year']), (-1,1))
encoder = OrdinalEncoder()
encoded_year_data = encoder.fit_transform(arr_years)
cap_deu['year_recoded'] = encoded_year_data

In [24]:
from sklearn.preprocessing import StandardScaler
# Standardise sentence length to zero mean / unit variance.
# NOTE(review): the scaler is fitted on the full frame, i.e. before the train/eval/test
# split further down, so held-out rows contribute to the mean and variance.
arr_senlen = np.reshape(np.array(cap_deu['length']),(-1,1))
encoder = StandardScaler()
encoded_senlen = encoder.fit_transform(arr_senlen)
cap_deu['length_recoded'] = encoded_senlen

In [25]:
len(set(cap_deu['year_recoded']))

18

## Create datasets and dataloaders

In [27]:
cap_deu.head()

Unnamed: 0.1,Unnamed: 0,election_year,party_name,party_id,manifesto_id,sentence_id,cap_topic,cap_subtopic,header,junk,...,party,CDU/CSU,FDP,GRUENEN,PDS/DIE LINKE,SPD,gov_last,opp_last,year_recoded,length_recoded
0,1,1949,SPD,1,19491,1,99,99,1,0,...,SPD,0.0,0.0,0.0,0.0,1.0,0,0,0.0,-1.311245
1,2,1949,SPD,1,19491,2,99,99,0,0,...,SPD,0.0,0.0,0.0,0.0,1.0,0,0,0.0,-1.426313
2,3,1949,SPD,1,19491,3,99,99,0,0,...,SPD,0.0,0.0,0.0,0.0,1.0,0,0,0.0,0.184644
3,4,1949,SPD,1,19491,4,20,2099,0,0,...,SPD,0.0,0.0,0.0,0.0,1.0,0,0,0.0,-1.196177
4,5,1949,SPD,1,19491,5,20,2099,0,0,...,SPD,0.0,0.0,0.0,0.0,1.0,0,0,0.0,-1.196177


In [28]:
# Checkpoint name shared by the tokenizer, the augmenter and the models below.
bert_model = 'deepset/gbert-base'
tokenizer = BertTokenizerFast.from_pretrained(bert_model)

In [30]:
# Two parallel stratified splits with the same seed: the *_new frames are stratified on the
# recoded label ('cap_topic_new'), the others on the original label ('cap_topic').
# 20% is held out for testing, then 30% of the remainder for evaluation
# (about 56% train / 24% eval / 20% test overall).
df_train_new, df_test_new = train_test_split(cap_deu, test_size=0.2, random_state=1234, stratify = cap_deu['cap_topic_new'])
df_train_new, df_eval_new = train_test_split(df_train_new, test_size=0.3, random_state=1234, stratify = df_train_new['cap_topic_new'])
df_train, df_test = train_test_split(cap_deu, test_size=0.2, random_state=1234, stratify = cap_deu['cap_topic'])
df_train, df_eval = train_test_split(df_train, test_size=0.3, random_state=1234, stratify = df_train['cap_topic'])

In [31]:
# Give every split a fresh 0..n-1 index. reset_index() without drop=True keeps the
# previous row index as an extra 'index' column.
df_train = df_train.reset_index()
df_test = df_test.reset_index()
df_eval = df_eval.reset_index()
df_train_new = df_train_new.reset_index()
df_test_new = df_test_new.reset_index()
df_eval_new = df_eval_new.reset_index()

In [32]:
df_train.groupby('cap_topic_new').count()

Unnamed: 0_level_0,index,Unnamed: 0,election_year,party_name,party_id,manifesto_id,sentence_id,cap_topic,cap_subtopic,header,...,party,CDU/CSU,FDP,GRUENEN,PDS/DIE LINKE,SPD,gov_last,opp_last,year_recoded,length_recoded
cap_topic_new,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,6437,6437,6437,6437,6437,6437,6437,6437,6437,6437,...,6437,6437,6437,6437,6437,6437,6437,6437,6437,6437
1,2852,2852,2852,2852,2852,2852,2852,2852,2852,2852,...,2852,2852,2852,2852,2852,2852,2852,2852,2852,2852
2,1688,1688,1688,1688,1688,1688,1688,1688,1688,1688,...,1688,1688,1688,1688,1688,1688,1688,1688,1688,1688
3,928,928,928,928,928,928,928,928,928,928,...,928,928,928,928,928,928,928,928,928,928
4,5869,5869,5869,5869,5869,5869,5869,5869,5869,5869,...,5869,5869,5869,5869,5869,5869,5869,5869,5869,5869
5,2360,2360,2360,2360,2360,2360,2360,2360,2360,2360,...,2360,2360,2360,2360,2360,2360,2360,2360,2360,2360
6,1938,1938,1938,1938,1938,1938,1938,1938,1938,1938,...,1938,1938,1938,1938,1938,1938,1938,1938,1938,1938
7,1304,1304,1304,1304,1304,1304,1304,1304,1304,1304,...,1304,1304,1304,1304,1304,1304,1304,1304,1304,1304
8,908,908,908,908,908,908,908,908,908,908,...,908,908,908,908,908,908,908,908,908,908
9,1198,1198,1198,1198,1198,1198,1198,1198,1198,1198,...,1198,1198,1198,1198,1198,1198,1198,1198,1198,1198


In [33]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
from nlpaug.util import Action

In [34]:
# Contextual word-embedding augmenter backed by the same BERT checkpoint as the classifier.
# Note that device='cuda' is hard-coded here.
augmenter = naw.ContextualWordEmbsAug(model_path=bert_model, model_type = 'bert', device='cuda')

In [36]:
# Columns carried over to the augmented frame built from df_train.
# NOTE(review): this list carries 'cap_topic_new', yet df_train is the split stratified on
# 'cap_topic'; the list used for df_train_new carries 'cap_topic' instead. The two label
# columns look swapped — confirm which label column the dataset classes read.
list_cols = ['cap_topic_new','SPD', 'CDU/CSU', 'FDP', 'PDS/DIE LINKE', 'GRUENEN', 'gov_last', 'opp_last', 'year_recoded']

In [None]:
# Augment the training sentences (n_aug = 3) with the project helper functions.augment_iter.
# Note: `vars` shadows the Python builtin of the same name for the rest of the session.
texts, vars = functions.augment_iter(list_cols = list_cols, dataset = df_train, text_col = 'sentence_text', n_aug = 3, augmenter=augmenter)

In [None]:
# Assemble the augmented training frame: carried-over columns plus the augmented text.
df_train_aug = pd.DataFrame(vars, columns = list_cols)
df_train_aug['sentence_text'] = texts

In [None]:
# Recompute and standardise sentence length for the augmented sentences.
# NOTE(review): a new StandardScaler is fitted here, so 'length_recoded' in this frame is
# on a different scale than in the eval/test frames, which were scaled with the scaler
# fitted on the full data set — confirm this is intended.
df_train_aug.loc[:,'length'] = df_train_aug.sentence_text.str.split().str.len()
arr_senlen = np.reshape(np.array(df_train_aug['length']),(-1,1))
encoder = StandardScaler()
encoded_senlen = encoder.fit_transform(arr_senlen)
df_train_aug['length_recoded'] = encoded_senlen

In [None]:
# Columns carried over to the augmented frame built from df_train_new.
# NOTE(review): this list carries 'cap_topic', yet df_train_new is the split stratified on
# 'cap_topic_new'; the list used for df_train carries 'cap_topic_new' instead. The two label
# columns look swapped — confirm which label column the dataset classes read.
list_cols = ['cap_topic','SPD', 'CDU/CSU', 'FDP', 'PDS/DIE LINKE', 'GRUENEN', 'gov_last', 'opp_last', 'year_recoded']

In [None]:
texts, vars = functions.augment_iter(list_cols = list_cols, dataset = df_train_new, text_col = 'sentence_text', n_aug = 3, augmenter=augmenter)

In [None]:
df_train_new_aug = pd.DataFrame(vars, columns = list_cols)
df_train_new_aug['sentence_text'] = texts

In [None]:
# Recompute and standardise sentence length for the augmented sentences.
# NOTE(review): a new StandardScaler is fitted here, so 'length_recoded' in this frame is
# on a different scale than in the eval/test frames, which were scaled with the scaler
# fitted on the full data set — confirm this is intended.
df_train_new_aug.loc[:,'length'] = df_train_new_aug.sentence_text.str.split().str.len()
arr_senlen = np.reshape(np.array(df_train_new_aug['length']),(-1,1))
encoder = StandardScaler()
encoded_senlen = encoder.fit_transform(arr_senlen)
df_train_new_aug['length_recoded'] = encoded_senlen

In [None]:
# The eight metadata columns passed to the metadata-aware datasets as extra features.
list_extra_dims = ['SPD', 'CDU/CSU', 'FDP', 'PDS/DIE LINKE', 'GRUENEN', 'gov_last', 'opp_last', 'length_recoded']

In [None]:
# Metadata-aware datasets (text plus list_extra_dims) for the splits stratified on 'cap_topic'.
train_meta_dataset = dataset.CustomTextDataset(df_train, list_extra_dims)
train_meta_dataset_aug = dataset.CustomTextDataset(df_train_aug, list_extra_dims)
test_meta_dataset = dataset.CustomTextDataset(df_test, list_extra_dims)
eval_meta_dataset = dataset.CustomTextDataset(df_eval, list_extra_dims)

In [None]:
# Datasets without extra features for the splits stratified on 'cap_topic'.
train_dataset = dataset.BareDataset(df_train)
train_dataset_aug = dataset.BareDataset(df_train_aug)
test_dataset = dataset.BareDataset(df_test)
eval_dataset = dataset.BareDataset(df_eval)

In [None]:
# Metadata-aware datasets (text plus list_extra_dims) for the splits stratified on 'cap_topic_new'.
train_meta_dataset_new = dataset.CustomTextDataset(df_train_new, list_extra_dims)
train_meta_dataset_new_aug = dataset.CustomTextDataset(df_train_new_aug, list_extra_dims)
test_meta_dataset_new = dataset.CustomTextDataset(df_test_new, list_extra_dims)
eval_meta_dataset_new = dataset.CustomTextDataset(df_eval_new, list_extra_dims)

In [None]:
# Datasets without extra features for the splits stratified on 'cap_topic_new'.
train_dataset_new = dataset.BareDataset(df_train_new)
train_dataset_new_aug = dataset.BareDataset(df_train_new_aug)
test_dataset_new = dataset.BareDataset(df_test_new)
eval_dataset_new = dataset.BareDataset(df_eval_new)

In [None]:
# Batch loaders (batch size 16) for the datasets without extra features.
# Note that the test and eval loaders are shuffled as well.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
train_dataloader_aug =  DataLoader(train_dataset_aug, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=16, shuffle=True)

In [None]:
# Batch loaders (batch size 16) for the metadata-aware datasets.
# Note that the test and eval loaders are shuffled as well.
train_meta_dataloader = DataLoader(train_meta_dataset, batch_size=16, shuffle=True)
train_meta_dataloader_aug =  DataLoader(train_meta_dataset_aug, batch_size=16, shuffle=True)
test_meta_dataloader = DataLoader(test_meta_dataset, batch_size=16, shuffle=True)
eval_meta_dataloader = DataLoader(eval_meta_dataset, batch_size=16, shuffle=True)

In [None]:
# Batch loaders (batch size 16) for the recoded-topic datasets without extra features.
# Note that the test and eval loaders are shuffled as well.
train_dataloader_new = DataLoader(train_dataset_new, batch_size=16, shuffle=True)
train_dataloader_new_aug =  DataLoader(train_dataset_new_aug, batch_size=16, shuffle=True)
test_dataloader_new = DataLoader(test_dataset_new, batch_size=16, shuffle=True)
eval_dataloader_new = DataLoader(eval_dataset_new, batch_size=16, shuffle=True)

In [None]:
# Batch loaders (batch size 16) for the recoded-topic, metadata-aware datasets.
# The training loader gets its own *_new name, matching the other three loaders in this
# cell, so it does not overwrite the loader built from train_meta_dataset.
train_meta_dataloader_new = DataLoader(train_meta_dataset_new, batch_size=16, shuffle=True)
train_meta_dataloader_new_aug =  DataLoader(train_meta_dataset_new_aug, batch_size=16, shuffle=True)
test_meta_dataloader_new = DataLoader(test_meta_dataset_new, batch_size=16, shuffle=True)
eval_meta_dataloader_new = DataLoader(eval_meta_dataset_new, batch_size=16, shuffle=True)

In [39]:
# Instantiate both project models with 17 output labels and move them to `device`.
# extra_dim=8 matches the eight columns in list_extra_dims.
model_base = models.NormalBERT(bert_model, labels_count=17).to(device)
model_meta = models.MetaBERT(bert_model, labels_count=17, extra_dim=8, hidden_dim=20).to(device)

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

# Train-test loops

In [None]:
import random
# Seed Python, NumPy and PyTorch (CPU and all CUDA devices) for the training runs.
# Note: the data splits and model construction happen in earlier cells, before this seeding.
seed_val = 1670
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


## Normal BERT, no augmentation, standard topics

In [None]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
# Training length for the baseline model.
epochs = 5
total_steps = len(train_dataloader) * epochs

# Adam optimizer over all parameters of model_base, a plateau-based learning-rate
# scheduler in 'min' mode, and cross-entropy as the classification loss.
optimizer = torch.optim.Adam(model_base.parameters(), lr=0.0000525)
scheduler = ReduceLROnPlateau(optimizer, mode = 'min')
loss_fn = nn.CrossEntropyLoss()

In [None]:
# One training pass followed by one evaluation pass per epoch, using the project helpers
# functions.train_normal / functions.test_normal (defined in utils).
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    functions.train_normal(train_dataloader, model_base, loss_fn, optimizer, scheduler)
    functions.test_normal(eval_dataloader, model_base, loss_fn)
print("Done!")

## Normal BERT, no augmentation, recoded topics

## Normal BERT, with augmentation, standard topics

## Normal BERT, with augmentation, recoded topics

## Meta BERT, no augmentation, standard topics

## Meta BERT, no augmentation, recoded topics

## Meta BERT, with augmentation, standard topics

## Meta BERT, with augmentation, recoded topics

In [None]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
# Same optimizer / scheduler / loss setup as for the baseline run, but bound to `model`.
# NOTE(review): no cell above this one defines `model` (only model_base and model_meta),
# so this cell fails on a fresh kernel — confirm which model is meant.
epochs = 5
total_steps = len(train_dataloader) * epochs

# Adam optimizer, plateau-based learning-rate scheduler ('min' mode), cross-entropy loss.
optimizer = torch.optim.Adam(model.parameters(), lr=0.0000525)
scheduler = ReduceLROnPlateau(optimizer, mode = 'min')
loss_fn = nn.CrossEntropyLoss()


In [None]:
# One training pass followed by one evaluation pass per epoch.
# NOTE(review): train_loop, eval_loop and `model` are not defined in any cell above this
# one; they presumably come from an earlier kernel session — confirm and replace with the
# corresponding helpers from utils.functions.
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer, scheduler)
    eval_loop(eval_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------

Training...
loss: 2.794420  [    0/30601]. Took 0:00:01
loss: 2.662218  [  800/30601]. Took 0:00:34
loss: 2.548109  [ 1600/30601]. Took 0:01:07
loss: 2.495279  [ 2400/30601]. Took 0:01:39
loss: 2.700026  [ 3200/30601]. Took 0:02:12
loss: 2.603777  [ 4000/30601]. Took 0:02:45
loss: 2.168311  [ 4800/30601]. Took 0:03:18
loss: 2.428246  [ 5600/30601]. Took 0:03:51
loss: 2.345435  [ 6400/30601]. Took 0:04:24
loss: 1.979844  [ 7200/30601]. Took 0:04:57
loss: 1.996744  [ 8000/30601]. Took 0:05:29
loss: 1.968703  [ 8800/30601]. Took 0:06:02
loss: 1.770010  [ 9600/30601]. Took 0:06:35
loss: 1.980413  [10400/30601]. Took 0:07:08
loss: 2.295319  [11200/30601]. Took 0:07:41
loss: 1.491683  [12000/30601]. Took 0:08:14
loss: 1.995092  [12800/30601]. Took 0:08:47
loss: 2.177581  [13600/30601]. Took 0:09:19
loss: 1.684760  [14400/30601]. Took 0:09:52
loss: 1.993751  [15200/30601]. Took 0:10:25
loss: 2.154440  [16000/30601]. Took 0:10:58
loss: 1.448552  [16800/

In [None]:
output = torch.full([10, 64], 1.5)
target = torch.ones([10, 64], dtype=torch.float32)  


In [None]:
target

In [None]:
set(cap_deu['year_recoded'])

In [None]:
torch.cuda.empty_cache()

In [None]:
# Collect predicted and true labels over the whole test loader.
ls_res = []
ls_y = []
# Put the model in evaluation mode: torch.no_grad() only disables gradient tracking and
# would otherwise leave train-only layers such as dropout active during inference.
model.eval()
with torch.no_grad():
  for item in test_dataloader:
    # Every batch key except the first three and the last two is treated as an extra
    # feature; the selected tensors are concatenated along dim 1.
    list_keys = list(item)[3:-2]
    extras =  torch.cat(tuple(item[key] for key in list_keys), dim=1).to(device)
    output = model(input_ids = item['input_ids'], attention_mask = item['attention_mask'], token_type_ids = item['token_type_ids'], extras = extras, year=item['year'])
    # Predicted class = index of the largest score along the last axis.
    argmax = output.argmax(-1).detach().cpu().numpy()
    ls_res.append(argmax)
    y = item['labels'].detach().cpu().numpy()
    ls_y.append(y)

In [None]:
# Flatten the per-batch arrays into plain lists of true labels and predictions.
ls_y_1 = np.concatenate(ls_y).tolist()
ls_res_1 = np.concatenate(ls_res).tolist()

In [None]:
 # scikit-learn expects (y_true, y_pred): the gold labels go first. With the arguments
 # in the opposite order the reported precision and recall are swapped.
 precision, recall, f1, n = precision_recall_fscore_support(ls_y_1, ls_res_1, average=None)

In [None]:
f1

In [None]:
model_path = os.path.join('MetaBERT', 'results', 'model.pth')
# torch.save does not create missing parent directories, so make sure the folder exists.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(model.state_dict(), model_path)

In [None]:
model_path = os.path.join('MetaBERT', 'results', 'model.pth')
# map_location remaps the stored tensors onto the active device, so weights saved from a
# GPU session can also be restored on a CPU-only machine.
model.load_state_dict(torch.load(model_path, map_location=device))

In [None]:
# Partition the sentences into three groups of recoded topics (see the trailing comments).
# NOTE(review): cap_deu_9418_long is not created in any cell above this one; it presumably
# comes from an earlier kernel session — confirm where it is built.
cap_deu_9418_econsoc = cap_deu_9418_long[cap_deu_9418_long.cap_topic_new.isin([0,4])] ## Economics + Labour and Social Welfare
cap_deu_9418_3000_long = cap_deu_9418_long[cap_deu_9418_long.cap_topic_new.isin([1,14])] # 2800+ to 3000+
cap_deu_9418_u2500_long = cap_deu_9418_long[cap_deu_9418_long.cap_topic_new.isin([3,5,8,10,12,11,9,13,7,2,6,16,15])] # under 2500

In [None]:
# Resample each topic in the two larger groups to a fixed size (5500 and 2800 rows per topic).
# Sampling is done with replacement and a fixed seed.
cap_deu_9418_econsoc = cap_deu_9418_econsoc.groupby('cap_topic_new', as_index = False,group_keys=False).apply(lambda s: s.sample(5500,replace=True, random_state = 1234))
cap_deu_9418_3000_long = cap_deu_9418_3000_long.groupby('cap_topic_new', as_index = False,group_keys=False).apply(lambda s: s.sample(2800,replace=True, random_state = 1234))

In [None]:
frames_long = [cap_deu_9418_econsoc, cap_deu_9418_3000_long, cap_deu_9418_u2500_long]
df_long = pd.concat(frames_long)
df_long.groupby('cap_topic_new').count()

In [None]:
len(df_long)

In [None]:
## export labels to a list
labels = df_long['cap_topic_new'].tolist()
set(labels)

In [None]:
texts = df_long['sentence_text'].tolist()

In [None]:
# Shortest and longest text, measured in whitespace-separated words.
word_counts = [len(sentence.split()) for sentence in texts]
min_length = min(word_counts)
max_length = max(word_counts)

print('Min length (word) is: {}'.format(min_length))
# This line reports the maximum, so it is labelled "Max".
print('Max length (word) is: {}'.format(max_length))


In [None]:
## train test split
# Stratified on the labels: 25% held out for testing, then 40% of the remainder for
# validation (about 45% train / 30% validation / 25% test overall).
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size = 0.25, random_state = 1234, stratify = labels)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.4, random_state=4321, stratify=train_labels)

In [None]:
## Load tokenizer and models
# Sequence-classification head with 17 labels on top of the German BERT checkpoint.
# Note: this rebinds the names `model` and `tokenizer`.
bert_version = 'deepset/gbert-base'
model = BertForSequenceClassification.from_pretrained(bert_version, num_labels = 17)
tokenizer = BertTokenizerFast.from_pretrained(bert_version)
model = model.to(device)

In [None]:
## Tokenize each split, with truncation and padding enabled.
train_encodings = tokenizer.batch_encode_plus(train_texts, truncation=True, padding=True)
val_encodings = tokenizer.batch_encode_plus(val_texts, truncation=True,  padding=True)
test_encodings = tokenizer.batch_encode_plus(test_texts, truncation=True,  padding=True)

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    ``encodings`` maps each field name to a per-example sequence and
    ``labels`` holds one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label sequence defines the number of examples.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of example ``idx`` to a tensor ...
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # ... and attach the label under the 'labels' key.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits. Note: train_dataset and test_dataset rebind names that
# earlier cells assigned to dataset.BareDataset instances.
train_dataset = Dataset(train_encodings, train_labels)
val_dataset = Dataset(val_encodings, val_labels)
test_dataset = Dataset(test_encodings, test_labels)

In [None]:
def compute_metrics(pred):
    """Compute per-class evaluation metrics from a prediction object.

    ``pred`` must expose ``label_ids`` (true labels) and ``predictions``
    (per-class scores; the arg-max over the last axis is the predicted class).
    Returns a dict with per-class 'f1', 'precision', 'recall' and support 'n',
    the raw confusion matrix 'cf' and the per-class accuracy 'acc'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    prec, rec, fscore, support = precision_recall_fscore_support(y_true, y_pred, average=None)

    cf = confusion_matrix(y_true, y_pred)
    # Divide each row by its total; the diagonal of the result is the share of each
    # true class that was predicted correctly.
    row_totals = cf.sum(axis=1)[:, np.newaxis]
    per_class_acc = (cf.astype('float') / row_totals).diagonal()

    return dict(f1=fscore, precision=prec, recall=rec, n=support, cf=cf, acc=per_class_acc)

In [None]:
## Fine tuning
# TrainingArguments is not part of the notebook's top-level imports, so import it here
# to keep this cell runnable on a fresh kernel.
from transformers import TrainingArguments

# NOTE(review): load_best_model_at_end=True is combined with save_strategy="no"; with no
# checkpoints written there is presumably nothing to reload at the end — confirm intent.
training_args = TrainingArguments(
    # output directory
    output_dir=os.path.join("Hung's paper", "results", "Deu9418_Long"),
    # total number of training epochs
    num_train_epochs= 16,
    # batch size per device during training
    per_device_train_batch_size=16,
    # batch size for evaluation
    per_device_eval_batch_size=16,
    # number of warmup steps for learning rate scheduler
    warmup_steps=500,
    weight_decay = 0.01,
    # learning rate
    learning_rate = 2e-5,
    # directory for storing logs
    logging_dir=os.path.join("Hung's paper", "logs", "Deu9418_Long"),
    logging_steps= 1000,
    load_best_model_at_end=True,
    save_strategy = "no"
)

In [None]:
# Trainer is not part of the notebook's top-level imports, so import it here to keep
# this cell runnable on a fresh kernel.
from transformers import Trainer

# Fine-tune on the training split; the validation split is used for evaluation and
# compute_metrics supplies the per-class scores.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
trainer.train()

In [None]:
eval_res = trainer.evaluate()

In [None]:
# Collect the per-class metrics returned by trainer.evaluate() into one frame,
# one row per class index.
# NOTE(review): lab_to_top is defined in a cell further down the notebook, so this cell
# fails on a fresh top-to-bottom run unless that definition is executed first.
evaluated = pd.DataFrame()
evaluated["f1"] = eval_res["eval_f1"]
evaluated["precision"] = eval_res["eval_precision"]
evaluated["recall"] = eval_res["eval_recall"]
evaluated["n"] = eval_res["eval_n"]
evaluated["accuracy"] = eval_res["eval_acc"]
# The row index doubles as the class code that lab_to_top translates into a topic name.
list_index = evaluated.index.tolist()
evaluated['pred_topics'] = list_index
evaluated['topic'] = lab_to_top(evaluated)

In [None]:
evaluated = evaluated.sort_values(by="f1", ascending=False)
evaluated

In [None]:
evaluated.to_csv("classification_res_trimmed.csv")

In [None]:
model_path = os.path.join("Hung's paper", "results", "trimmed")
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

# Predictions


In [None]:
## Load the fine-tuned model and its tokenizer from disk
# NOTE(review): this reads from the "final" folder, whereas the save cell above writes to
# "trimmed" — confirm that the "final" checkpoint is the intended one.
model_path = os.path.join("Hung's paper", "results", "final")
model = BertForSequenceClassification.from_pretrained(model_path).to(device)
tokenizer = BertTokenizerFast.from_pretrained(model_path)

In [None]:
germaparl = pd.read_csv(os.path.join("Hung's paper", "germaparl.csv"), encoding='utf-8')
germaparl = germaparl.dropna(subset=['text'])


In [None]:
germaparl.head(5)

In [None]:
pred_texts = germaparl['text'].tolist()
len(pred_texts)

In [None]:
from statistics import stdev, mean
## Before
# Word-count statistics (whitespace-separated tokens) of the texts to be classified.
seq_len = [len(i.split()) for i in pred_texts]
seq_len_mean = mean(seq_len)
seq_len_std = stdev(seq_len)
seq_len_max = max(seq_len)
seq_len_min = min(seq_len)
print('Mean length (word) is: {}'.format(seq_len_mean))
print('Std length (word) is: {}'.format(seq_len_std))
print('Min length (word) is: {}'.format(seq_len_min))
print('Max length (word) is: {}'.format(seq_len_max))


In [None]:
pd.Series(seq_len).hist(bins = 30)

In [None]:
## This step needs a lot of RAM. Without access to a high-RAM session, consider doing it in R or with iterrows instead (much slower).
# Word count (whitespace-separated tokens) of every text.
germaparl.loc[:, 'length'] = germaparl.text.str.split().str.len()


In [None]:
# Keep only texts of 10 to 256 words (both bounds inclusive).
germaparl = germaparl[germaparl.length.between(10,256)]


In [None]:
pred_texts = germaparl['text'].tolist()
len(pred_texts)

In [None]:
germaparl.head(5)

In [None]:
## After
seq_len = [len(i.split()) for i in pred_texts]
seq_len_mean = mean(seq_len)
seq_len_std = stdev(seq_len)
seq_len_max = max(seq_len)
seq_len_min = min(seq_len)
print('Mean length (word) is: {}'.format(seq_len_mean))
print('Std length (word) is: {}'.format(seq_len_std))
print('Min length (word) is: {}'.format(seq_len_min))
print('Max length (word) is: {}'.format(seq_len_max))

In [None]:
pd.Series(seq_len).hist(bins = 30)

In [None]:
def get_prediction(text):
  """Predict the arg-max class for every string in ``text``.

  Each string is tokenized and passed through the module-level ``model`` on its own,
  with gradient tracking disabled. Returns a list with one arg-max result per input.
  """
  predictions = []
  with torch.no_grad():
    for sentence in text:
      encoded = tokenizer(sentence, padding=True, truncation=True, return_tensors="pt").to(device)
      # the first element of the model output holds the class scores
      scores = model(**encoded)[0]
      # softmax over dim 1, then the index of the most probable class
      predictions.append(scores.softmax(1).argmax())
  return predictions

In [None]:
text_test = pred_texts[0:10]
res = get_prediction(text_test)
print(res)


In [None]:
pred_texts[4]

In [None]:
def chunkIt(seq, num):
    """Split ``seq`` into ``num`` consecutive chunks of near-equal length.

    Chunk boundaries are computed with integer arithmetic (``k * len(seq) // num``),
    so exactly ``num`` chunks are returned for any non-empty input. Accumulating a
    float step size instead can, through rounding error, produce ``num + 1`` chunks.

    Args:
        seq: A sliceable sequence with a length (list, tuple, str, ...).
        num: Number of chunks; must be positive.

    Returns:
        A list of slices of ``seq`` that concatenate back to ``seq``. An empty ``seq``
        gives an empty list; when ``num`` exceeds ``len(seq)`` some chunks are empty.

    Raises:
        ValueError: If ``num`` is not positive.
    """
    num = int(num)
    if num <= 0:
        raise ValueError("num must be a positive integer")
    total = len(seq)
    if total == 0:
        return []
    # Boundary k sits at floor(k * total / num); consecutive boundaries delimit a chunk.
    bounds = [k * total // num for k in range(num + 1)]
    return [seq[lo:hi] for lo, hi in zip(bounds, bounds[1:])]
# Split the texts into 1000 chunks so that prediction can be run in separate batches.
pred_texts_chunk = chunkIt(pred_texts, 1000)
len(pred_texts_chunk[0])

In [None]:
len(pred_texts_chunk[1])

In [None]:
def iter_pred(obj, pre, post):
  """Run ``get_prediction`` on the chunks ``obj[pre]`` up to ``obj[post - 1]``.

  Prints a progress line for every chunk index divisible by 10 and returns the
  per-chunk prediction lists in order.
  """
  collected = []
  for chunk_idx in range(pre, post):
    if not chunk_idx % 10:
      print(f'Starting the {chunk_idx+1}th chunk.')
    collected.append(get_prediction(obj[chunk_idx]))
  return collected


In [None]:
def concat(obj):
    """Flatten an iterable of prediction chunks into one flat list of floats."""
    flattened = [pred for chunk in obj for pred in chunk]
    return [float(pred) for pred in flattened]




In [None]:
def lab_to_top(df):
  """Translate the numeric codes in ``df['pred_topics']`` into topic names.

  Codes 0-15 map to the names listed below (by position); every other value
  becomes "Other". Returns the names as a list, in row order.
  """
  names = (
      "Economics",                 # 0
      "Civil Rights",              # 1
      "Health",                    # 2
      "Agriculture",               # 3
      "Labor and Social Welfare",  # 4
      "Education",                 # 5
      "Environment",               # 6
      "Energy",                    # 7
      "Immigration",               # 8
      "Transportation",            # 9
      "Law and Crime",             # 10
      "Housing",                   # 11
      "Defense",                   # 12
      "Technology",                # 13
      "International Affairs",     # 14
      "Government Operations",     # 15
  )
  topic = []
  for value in df['pred_topics']:
    # The first code equal to the value wins; no match falls through to "Other".
    matched = next((name for code, name in enumerate(names) if value == code), "Other")
    topic.append(matched)
  return topic


  

In [None]:
# Predict the first 500 chunks and flatten the per-chunk results into one list of floats.
pred_topics_1 = iter_pred(pred_texts_chunk, 0, 500)
pred_topics_concat_1 = concat(pred_topics_1)

In [None]:
len(pred_topics_concat_1)

In [None]:
# Rows covered by the first batch of predictions. .copy() gives an independent frame, so
# adding the prediction columns later does not trigger pandas' SettingWithCopyWarning.
germaparl_1 = germaparl[:len(pred_topics_concat_1)].copy()

In [None]:
germaparl_1['pred_topics'] = pred_topics_concat_1

In [None]:
germaparl_1.head()

In [None]:
germaparl_1['topic_name'] = lab_to_top(germaparl_1)
germaparl_1.groupby('topic_name').count()

In [None]:
germaparl_1.to_csv('germaparl_pred_1.csv', encoding='utf-8')

In [None]:
pred_topics_2 = iter_pred(pred_texts_chunk, 500, 1000)


In [None]:
pred_topics_concat_2 = concat(pred_topics_2)

In [None]:
len(germaparl)-len(pred_topics_concat_2)

In [None]:
len(pred_topics_concat_2)

In [None]:
# The second batch of predictions covers the tail of the frame: take its last
# len(pred_topics_concat_2) rows. .copy() gives an independent frame, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
germaparl_2 = germaparl[len(germaparl)-len(pred_topics_concat_2):].copy()
germaparl_2['pred_topics'] = pred_topics_concat_2

In [None]:
germaparl_2['topic_name'] = lab_to_top(germaparl_2)

In [None]:
germaparl_2.to_csv('germaparl_pred_2.csv', encoding='utf-8')

In [None]:
germaparl_1 = pd.read_csv("germaparl_pred_1.csv", encoding = "utf-8")

In [None]:
germaparl_2 = pd.read_csv("germaparl_pred_2.csv", encoding = "utf-8")

In [None]:
# Stack the two prediction batches back into one frame (each part keeps its own row index).
frames = [germaparl_1, germaparl_2]
final_df = pd.concat(frames)

In [None]:
print(len(germaparl_1))
print(len(germaparl_2))
print(len(germaparl_1)+len(germaparl_2))
print(len(germaparl)-len(pred_topics_concat_2))
print(len(final_df))
print(len(germaparl))

In [None]:
final_df.groupby('topic_name').count()

In [None]:
# Keep only the columns needed in the exported file, in this order.
final_df = final_df[["speaker", "role", "party", "position", "session", "date","bundestag", "year" ,"pred_topics", "topic_name", "length","text"]]

In [None]:
final_df.to_csv('germaparl_pred.csv', encoding='utf-8')