In [2]:
import pandas as pd
import math
from transformers import pipeline
from transformers import BertConfig
from transformers import BertModel
from transformers import *

import torch
# Ask PyTorch whether a CUDA device is usable; the returned bool is not stored
# or used by anything else in this cell.
torch.cuda.is_available()



In [None]:
class bert_model():
    """Bundle the config, tokenizer and encoder of one pretrained BERT checkpoint.

    Parameters
    ----------
    name : str
        Checkpoint identifier handed to every ``from_pretrained`` call
        (e.g. ``'bert-base-cased'``).

    Attributes
    ----------
    name : str
        The checkpoint identifier given at construction.
    config : BertConfig
        Configuration loaded from the checkpoint, with
        ``output_hidden_states`` switched on.
    tokenizer : BertTokenizer
        Tokenizer loaded from the checkpoint.
    object : BertModel
        Encoder loaded from the checkpoint using ``config``.
    """

    def __init__(self, name):
        self.name = name
        # Load the checkpoint's own configuration instead of a default-initialised
        # BertConfig, so that `config` describes the weights that are actually loaded.
        self.config = BertConfig.from_pretrained(name, output_hidden_states=True)
        # NOTE(review): max_length/truncation are passed to the loader here, not to the
        # individual tokenize calls -- confirm they have the intended effect.
        self.tokenizer = BertTokenizer.from_pretrained(name, max_length=512, truncation=True)
        # Hand the same config to the model so the two can never disagree.
        self.object = BertModel.from_pretrained(name, config=self.config)


def calculate_num_tokens(keyword, tokenizer):
    """Count the tokens `tokenizer` produces for `keyword`, excluding the two wrappers.

    The keyword is wrapped as ``"[CLS] <keyword> [SEP]"``, tokenized, mapped to
    ids, and the first and last ids are left out of the count.

    Parameters
    ----------
    keyword : str
        Text to measure.
    tokenizer
        Object exposing ``tokenize(text)`` and ``convert_tokens_to_ids(tokens)``.

    Returns
    -------
    int
        Number of ids remaining after dropping the first and the last one.
    """
    wrapped = " ".join(("[CLS]", keyword, "[SEP]"))
    token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(wrapped))
    # Same as len(token_ids[1:-1]): never negative, even for fewer than two ids.
    return max(len(token_ids) - 2, 0)

In [49]:
#Creating tokens data with bert-base-cased-tokenizer
# NOTE(review): this cell reads `un_data`, which the visible cells only assign in a
# later cell -- on a fresh kernel `un_data` (with a 'Name' column) must exist first.
bert_base_cased= bert_model('bert-base-cased')

# Build every (name, token count) record first and construct the frame once.
# Concatenating inside the loop re-copied the whole frame on each iteration,
# gave every row the index label 0, and failed on an empty input when the
# column labels were assigned afterwards.
tokens_data = pd.DataFrame(
    [
        [name, calculate_num_tokens(name, bert_base_cased.tokenizer)]
        for name in un_data['Name'].values
    ],
    columns=['Name', 'num_tokens'],
)

In [50]:
#Read in frequency data
# Each CSV is relabelled to the same two columns: the keyword and its count.
wiki_freq = pd.read_csv("counting_country_frequencies/country_counts_wiki.csv")
wiki_freq.columns = ['keyword', 'freq']

book_corpus = pd.read_csv("counting_country_frequencies/country_counts_bookcorpus.csv")
book_corpus.columns = ['keyword', 'freq']

# wiki_freq is the LEFT frame, so its 'freq' column takes lsuffix; book_corpus is
# the RIGHT frame and takes rsuffix. The suffixes were previously swapped, which
# labelled the Wikipedia counts 'freq_bc' and the BookCorpus counts 'freq_wiki'.
freq = wiki_freq.set_index("keyword").join(book_corpus.set_index("keyword"), how='outer', lsuffix="_wiki", rsuffix='_bc').reset_index()
#fill na if one corpus is missing keywords
freq = freq.fillna(0)
# Total count across both corpora.
freq['freq'] = freq['freq_wiki'] + freq['freq_bc']
# Natural log of the total; math.log raises ValueError if a total is 0.
freq['freq_logged'] = freq['freq'].apply(math.log)


In [57]:
# Merge the per-country GDP metadata with the corpus frequency table.
# Derived columns: lower-cased name, integer GDP parsed from the comma-formatted
# 'Estimate' strings, and the natural log of that GDP.
# The frame is keyed on the original-case 'Name' column and inner-joined against
# freq's 'keyword' index, so only names present in both tables are kept.
# In the recorded output the restored key column is labelled 'index'.
un_data = (
    pd.read_csv("country_metadata/un_countries.csv")
    .assign(
        name=lambda frame: frame['Name'].apply(lambda country: country.lower()),
        gdp=lambda frame: frame['Estimate'].apply(lambda estimate: int(estimate.replace(",", ""))),
        gdp_logged=lambda frame: frame['gdp'].apply(lambda value: math.log(value)),
    )
    .set_index("Name")
    .join(freq[['keyword', 'freq', 'freq_logged']].set_index('keyword'), how='inner')
    .reset_index()
)

Unnamed: 0,index,Official_Name,Same,Match,Country/Territory,Region,Estimate,Year,name,gdp,gdp_logged,freq,freq_logged
0,Cote d'Ivoire,Côte d'Ivoire,False,False,Ivory Coast,Africa,58539,2019,cote d'ivoire,58539,10.977448,264.0,5.575949
1,Democratic Republic of the Congo,Democratic Republic of the Congo,True,False,DR Congo,Africa,47319,2019,democratic republic of the congo,47319,10.764667,11229.0,9.326255
2,East Timor,Timor-Leste,False,True,East Timor,Asia,2017,2019,east timor,2017,7.609367,5834.0,8.671458
3,Micronesia,Federated States of Micronesia,False,True,Micronesia,Oceania,414,2019,micronesia,414,6.025866,3395.0,8.130059
4,Moldova,Republic of Moldova,False,True,Moldova,Europe,11955,2019,moldova,11955,9.388905,13791.0,9.531771
...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,Uzbekistan,Uzbekistan,True,True,Uzbekistan,Asia,57921,2019,uzbekistan,57921,10.966835,13244.0,9.491300
189,Vanuatu,Vanuatu,True,True,Vanuatu,Oceania,906,2019,vanuatu,906,6.809039,6379.0,8.760767
190,Yemen,Yemen,True,True,Yemen,Asia,24935,2019,yemen,24935,10.124028,19004.0,9.852405
191,Zambia,Zambia,True,True,Zambia,Africa,23085,2019,zambia,23085,10.046938,15048.0,9.619000


In [58]:
# Attach the token counts to the country table, keep only the analysis columns,
# and give them their final labels ('index' -> 'Name', 'num_tokens' -> 'subpieces').
# The outer join keeps names that appear in either table.
un_data = (
    un_data.set_index("index")
    .join(tokens_data.set_index('Name'), how='outer')
    .reset_index()
    .loc[:, ['index', 'Region', 'gdp', 'gdp_logged', 'freq', 'freq_logged', 'num_tokens']]
    .rename(columns={'index': 'Name', 'num_tokens': 'subpieces'})
    .reset_index(drop=True)
)

In [59]:
# un_data.to_csv("un_countries_meta.csv")

Unnamed: 0,Name,Region,gdp,gdp_logged,freq,freq_logged,subpieces
0,Cote d'Ivoire,Africa,58539,10.977448,264.0,5.575949,7
1,Democratic Republic of the Congo,Africa,47319,10.764667,11229.0,9.326255,5
2,East Timor,Asia,2017,7.609367,5834.0,8.671458,2
3,Micronesia,Oceania,414,6.025866,3395.0,8.130059,2
4,Moldova,Europe,11955,9.388905,13791.0,9.531771,1
...,...,...,...,...,...,...,...
188,Uzbekistan,Asia,57921,10.966835,13244.0,9.491300,1
189,Vanuatu,Oceania,906,6.809039,6379.0,8.760767,3
190,Yemen,Asia,24935,10.124028,19004.0,9.852405,1
191,Zambia,Africa,23085,10.046938,15048.0,9.619000,1
