<a href="https://colab.research.google.com/github/limestonestreams/thesis/blob/master/BERT_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

To wrap output text, applies to every cell. (from https://stackoverflow.com/questions/58890109/line-wrapping-in-collaboratory-google-results)

In [1]:
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

Connecting to google drive:

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Importing dictionary of datasets for each topic:

In [3]:
import pickle
import pandas as pd
import numpy as np

with open('/content/gdrive/My Drive/Colab files/all_datasets', 'rb') as f:
   d = pickle.load(f)

In [4]:
conflicts = ['Afghanistan', 'Albania', 'Angola', 'Bosnia-Herzegovina', 'Burundi', 'Central African Republic',
             'Colombia', 'Croatia', 'DR Congo (Zaire)', 'El Salvador', 'Georgia', 'Guatemala', 'Guinea-Bissau', 
             'Haiti', 'Honduras', 'Iraq', 'Lebanon', 'Liberia', 'Macedonia, FYR', 'Mali', 'Myanmar (Burma)', 
             'Papua New Guinea', 'Rwanda', 'Serbia (Yugoslavia)', 'Sierra Leone', 'Somalia', 'Sudan', 'Tajikistan',
             'Uganda', 'Ukraine', 'Venezuela']


for i in conflicts:
    k = str(i).lower()
    x = d[k]
    #remove from the dictionary and the conflict array countries with no events
    if 1 not in set(x['event']): 
      print(f"{i} does not have any events.")
      d.pop(k)
      conflicts.remove(i)

print(conflicts)


Albania does not have any events.
Bosnia-Herzegovina does not have any events.
El Salvador does not have any events.
Guatemala does not have any events.
Honduras does not have any events.
Papua New Guinea does not have any events.
['Afghanistan', 'Angola', 'Burundi', 'Central African Republic', 'Colombia', 'Croatia', 'DR Congo (Zaire)', 'Georgia', 'Guinea-Bissau', 'Haiti', 'Iraq', 'Lebanon', 'Liberia', 'Macedonia, FYR', 'Mali', 'Myanmar (Burma)', 'Rwanda', 'Serbia (Yugoslavia)', 'Sierra Leone', 'Somalia', 'Sudan', 'Tajikistan', 'Uganda', 'Ukraine', 'Venezuela']


In [5]:
for i in conflicts:
      k = str(i).lower()
      x = d[k]
      #print(f"{i} has {set(x['event'])} events.")
      #check dummy for whether there is a meeting on that date
      x['meeting'] = x['meeting'].fillna(0)
      print(f'For {i} the meeting numbers are:')
      print(x['meeting'].value_counts())
        #Sudan, DRC, and Angola look good to run predictions on.
        #Sudan, Iraq, Afghanistan
      d[k] = x

For Afghanistan the meeting numbers are:
0.0    34533
1.0      445
Name: meeting, dtype: int64
For Angola the meeting numbers are:
0.0    2941
1.0      35
Name: meeting, dtype: int64
For Burundi the meeting numbers are:
0.0    9099
1.0      55
Name: meeting, dtype: int64
For Central African Republic the meeting numbers are:
0.0    8631
1.0      71
Name: meeting, dtype: int64
For Colombia the meeting numbers are:
0.0    1358
1.0      18
Name: meeting, dtype: int64
For Croatia the meeting numbers are:
0.0    961
1.0      7
Name: meeting, dtype: int64
For DR Congo (Zaire) the meeting numbers are:
0.0    9978
1.0     106
Name: meeting, dtype: int64
For Georgia the meeting numbers are:
0.0    5266
1.0      18
Name: meeting, dtype: int64
For Guinea-Bissau the meeting numbers are:
0.0    7071
1.0      42
Name: meeting, dtype: int64
For Haiti the meeting numbers are:
0.0    8981
1.0      54
Name: meeting, dtype: int64
For Iraq the meeting numbers are:
0.0    10961
1.0      120
Name: meeting, d

In [6]:
for i in conflicts:
    k = str(i).lower()
    x = d[k]
    #remove meetings with lower than 250 words (likely only procedural)      
    x = x[(x['meeting'] == 0) | (x['word_count'] >= 250)]
    d[k] = x

In [7]:
print(f'For Afghanistan the meeting numbers are:')
print(d['afghanistan']['meeting'].value_counts())
#d['afghanistan']

For Afghanistan the meeting numbers are:
0.0    34533
1.0      432
Name: meeting, dtype: int64


In [8]:
x = d['afghanistan'].groupby('text')['date'].nunique()

x.describe()

count    86.0
mean      1.0
std       0.0
min       1.0
25%       1.0
50%       1.0
75%       1.0
max       1.0
Name: date, dtype: float64

In [9]:
d['angola']

Unnamed: 0,date,topic,text,word_count,meeting,event
0,1995-02-08,Angola,"i begin by\ncongratulating you, sir, on behalf...",31967.0,1.0,0.0
1,1995-02-09,,,,0.0,0.0
2,1995-02-10,,,,0.0,0.0
3,1995-02-11,,,,0.0,0.0
4,1995-02-12,,,,0.0,0.0
...,...,...,...,...,...,...
2971,2002-07-20,,,,0.0,0.0
2972,2002-07-21,,,,0.0,0.0
2973,2002-07-22,,,,0.0,1.0
2974,2002-07-23,,,,0.0,0.0


In [10]:
for i in conflicts:
    k = str(i).lower()
    x = d[k]
    #removing duplicate rows 
    #(coming from multiple events on the same day which caused text duplicates when merging docs and events dataframes)
    x.drop_duplicates(inplace = True)
    x = x.reset_index(drop = True)   
    d[k] = x  

In [11]:
%%time
for i in conflicts:
  #adding dummy for whether there is an event in the three days following each meeting
  k = str(i).lower()
  d[k]['label'] = ''
  j = 0
  while j+4 <= len(d[k].index):
        d[k].iloc[j, 6] = np.where(sum(d[k].iloc[j+1:j+4, 5]) > 0, 1, 0)
        j = j + 1

CPU times: user 1min 16s, sys: 203 ms, total: 1min 16s
Wall time: 1min 16s


In [12]:
for i in conflicts:
  print(f"For {i} the event labels are: ")
  print(d[str(i).lower()]['label'].value_counts())

For Afghanistan the event labels are: 
1    7543
0    1110
        3
Name: label, dtype: int64
For Angola the event labels are: 
0    1562
1    1156
        3
Name: label, dtype: int64
For Burundi the event labels are: 
0    6244
1    2589
        3
Name: label, dtype: int64
For Central African Republic the event labels are: 
0    6258
1    1881
        3
Name: label, dtype: int64
For Colombia the event labels are: 
0    1046
1     313
        3
Name: label, dtype: int64
For Croatia the event labels are: 
0    913
1     38
       3
Name: label, dtype: int64
For DR Congo (Zaire) the event labels are: 
1    4564
0    3872
        3
Name: label, dtype: int64
For Georgia the event labels are: 
0    5193
1      80
        3
Name: label, dtype: int64
For Guinea-Bissau the event labels are: 
0    7105
1       3
        3
Name: label, dtype: int64
For Haiti the event labels are: 
0    8952
1      75
        3
Name: label, dtype: int64
For Iraq the event labels are: 
1    5178
0    2017
       

In [13]:
for i in conflicts:
    k = str(i).lower()
    #getting rid of superflous columns
    del d[k]['meeting']
    del d[k]['date']
    del d[k]['topic']
    del d[k]['word_count']
    del d[k]['event']

In [14]:
for i in conflicts:
      k = str(i).lower()
      #dropping last 3 rows
      d[k] = d[k][:-3]

In [15]:
import re
def clean_txt(text):
  text = re.sub("'", "", text)
  #text=re.sub("(\\W)+"," ",text)
  text = re.sub(r'\n', ' ', text)    
  return text

for i in conflicts:
  print(i)
  k = str(i).lower()
  d[k]['text'] = d[k]['text'].astype(str)
  d[k]['text'] = d[k]['text'].apply(clean_txt)
  d[k]['label'] = d[k]['label'].astype(int)


Afghanistan
Angola
Burundi
Central African Republic
Colombia
Croatia
DR Congo (Zaire)
Georgia
Guinea-Bissau
Haiti
Iraq
Lebanon
Liberia
Macedonia, FYR
Mali
Myanmar (Burma)
Rwanda
Serbia (Yugoslavia)
Sierra Leone
Somalia
Sudan
Tajikistan
Uganda
Ukraine
Venezuela


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [16]:
d['afghanistan']['text'].describe()

count     8653
unique      87
top        nan
freq      8567
Name: text, dtype: object

Removing empty text rows:

In [17]:
for i in conflicts:
  k = str(i).lower()
  x = d[k]
  x = x[x.text != 'nan']
  x.reset_index(drop = True, inplace = True)
  d[k] = x

From https://github.com/ArmandDS/bert_for_long_text/blob/master/final_bert_long_docs.ipynb:

In [18]:
conflicts

['Afghanistan',
 'Angola',
 'Burundi',
 'Central African Republic',
 'Colombia',
 'Croatia',
 'DR Congo (Zaire)',
 'Georgia',
 'Guinea-Bissau',
 'Haiti',
 'Iraq',
 'Lebanon',
 'Liberia',
 'Macedonia, FYR',
 'Mali',
 'Myanmar (Burma)',
 'Rwanda',
 'Serbia (Yugoslavia)',
 'Sierra Leone',
 'Somalia',
 'Sudan',
 'Tajikistan',
 'Uganda',
 'Ukraine',
 'Venezuela']

Clean the text columns:

Splitting into train and test set:

In [24]:
from sklearn.model_selection import train_test_split

for i in conflicts:
    print(i)
    k = str(i).lower()
    train = d[k]
    if train.shape[0] >= 5:
      train = train.reindex(np.random.permutation(train.index))
      
      train, val = train_test_split(train, test_size = 0.2, random_state = 35)
      new_key_train = k + '_train'
      new_key_val = k + '_val'
      d[new_key_train] = train
      d[new_key_val] = val

      d[new_key_train].reset_index(drop = True, inplace = True)
      d[new_key_val].reset_index(drop = True, inplace = True)  

      print(f"Training Set Shape for {i}:", d[new_key_train].shape)
      print(f"Validation Set Shape for {i}:", d[new_key_val].shape)
      print('\n')
    else:
      #d.pop(k)
      print(f'{i} is removed from the dictionary.')
      print('\n')
      conflicts.remove(i)

Afghanistan
Training Set Shape for Afghanistan: (68, 2)
Validation Set Shape for Afghanistan: (18, 2)


Angola
Training Set Shape for Angola: (24, 2)
Validation Set Shape for Angola: (7, 2)


Burundi
Training Set Shape for Burundi: (41, 2)
Validation Set Shape for Burundi: (11, 2)


Central African Republic
Training Set Shape for Central African Republic: (44, 2)
Validation Set Shape for Central African Republic: (11, 2)


Colombia
Training Set Shape for Colombia: (13, 2)
Validation Set Shape for Colombia: (4, 2)


Croatia
Training Set Shape for Croatia: (4, 2)
Validation Set Shape for Croatia: (2, 2)


DR Congo (Zaire)
Training Set Shape for DR Congo (Zaire): (68, 2)
Validation Set Shape for DR Congo (Zaire): (18, 2)


Georgia
Training Set Shape for Georgia: (12, 2)
Validation Set Shape for Georgia: (3, 2)


Guinea-Bissau
Training Set Shape for Guinea-Bissau: (32, 2)
Validation Set Shape for Guinea-Bissau: (9, 2)


Haiti
Training Set Shape for Haiti: (41, 2)
Validation Set Shape for H

In [20]:
d['afghanistan_val']

Unnamed: 0,text,label
0,i would like to thank france for its leadershi...,1
1,"i wish, first of all, to warmly thank mr. gueh...",1
2,i was expecting to listen to mr. de mistura fi...,1
3,i would first like to begin by expressing my a...,1
4,"i would like to thank you, mr. president, for ...",1
5,i congratulate angola on its successful leader...,1
6,"at the outset, i would like to express the dee...",1
7,let me begin by congratulating the netherlands...,1
8,it is truly a pleasure to be among the members...,1
9,togo welcomes the adoption of resolution 2069 ...,1


In [21]:
DATA_COLUMN = 'text'
LABEL_COLUMN = 'label'
# The list containing all the classes (train['SECTION'].unique())
#label_list = [x for x in np.unique(train.label)]
label_list = [0, 1]
label_list

[0, 1]

Concatenating all the _train and _val dictionary entries into 2 full tables:

In [22]:
#creating arrays of dataframe names
keys_train = []
keys_val = []
for i in conflicts:
  s_train = str(i).lower() + '_train'
  keys_train = np.append(keys_train, s_train)
  s_val = str(i).lower() + '_val'
  keys_val = np.append(keys_val, s_val)

print(keys_train)
print(keys_val)

['afghanistan_train' 'angola_train' 'burundi_train'
 'central african republic_train' 'colombia_train' 'croatia_train'
 'dr congo (zaire)_train' 'georgia_train' 'guinea-bissau_train'
 'haiti_train' 'iraq_train' 'liberia_train' 'macedonia, fyr_train'
 'mali_train' 'myanmar (burma)_train' 'rwanda_train' 'sierra leone_train'
 'somalia_train' 'sudan_train' 'tajikistan_train' 'ukraine_train']
['afghanistan_val' 'angola_val' 'burundi_val'
 'central african republic_val' 'colombia_val' 'croatia_val'
 'dr congo (zaire)_val' 'georgia_val' 'guinea-bissau_val' 'haiti_val'
 'iraq_val' 'liberia_val' 'macedonia, fyr_val' 'mali_val'
 'myanmar (burma)_val' 'rwanda_val' 'sierra leone_val' 'somalia_val'
 'sudan_val' 'tajikistan_val' 'ukraine_val']


In [25]:
frames_train = [d[k] for k in keys_train]
frames_val = [d[k] for k in keys_val]
train = pd.concat(frames_train)
val = pd.concat(frames_val)

In [31]:
print(train['text'].describe())
print(val['text'].describe())

count                                                   768
unique                                                  768
top       i should like to begin by thanking under-secre...
freq                                                      1
Name: text, dtype: object
count                                                   206
unique                                                  206
top        mr. president, allow me at the outset, on beh...
freq                                                      1
Name: text, dtype: object


Splitting the data into smaller chunks:

In [32]:
def get_split(text1):
  l_total = []
  l_parcial = []
  if len(text1.split())//150 >0:
    n = len(text1.split())//150
  else: 
    n = 1
  for w in range(n):
    if w == 0:
      l_parcial = text1.split()[:200]
      l_total.append(" ".join(l_parcial))
    else:
      l_parcial = text1.split()[w*150:w*150 + 200]
      l_total.append(" ".join(l_parcial))
  return l_total

In [33]:
%%time
train['text_split'] = train[DATA_COLUMN].apply(get_split)
train.head()

Unnamed: 0,text,label,text_split
0,i would like to thank under- secretary-general...,1,[i would like to thank under- secretary-genera...
1,i thank special representative yamamoto and th...,1,[i thank special representative yamamoto and t...
2,"i would like to thank you, mr. president, for ...",1,"[i would like to thank you, mr. president, for..."
3,"i would like to thank you, sir, for your leade...",1,"[i would like to thank you, sir, for your lead..."
4,france has just voted for the draft resolutio...,1,[france has just voted for the draft resolutio...


In [34]:
%%time
val['text_split'] = val[DATA_COLUMN].apply(get_split)
val.head(2)

Unnamed: 0,text,label,text_split
0,i have the honour to speak on behalf of the eu...,1,[i have the honour to speak on behalf of the e...
1,i have the honour to speak on behalf of the eu...,0,[i have the honour to speak on behalf of the e...


In [35]:
train_l = []
label_l = []
index_l =[]
for idx,row in train.iterrows():
  for l in row['text_split']:
    train_l.append(l)
    label_l.append(row['label'])
    index_l.append(idx)
len(train_l), len(label_l), len(index_l)

(38313, 38313, 38313)

In [36]:
val_l = []
val_label_l = []
val_index_l = []
for idx,row in val.iterrows():
  for l in row['text_split']:
    val_l.append(l)
    val_label_l.append(row['label'])
    val_index_l.append(idx)
len(val_l), len(val_label_l), len(val_index_l)

(9355, 9355, 9355)

The final dataset for training:

In [37]:
train_df = pd.DataFrame({DATA_COLUMN:train_l, LABEL_COLUMN:label_l})
train_df.head()

Unnamed: 0,text,label
0,i would like to thank under- secretary-general...,1
1,renewed commitment by the international commun...,1
2,"not only on the holding of the elections, but ...",1
3,"new prts will soon be operative, providing hel...",1
4,with tighter border controls and improved info...,1


And the final dataset for validation:

In [38]:
val_df = pd.DataFrame({DATA_COLUMN:val_l, LABEL_COLUMN:val_label_l})
val_df.head()

Unnamed: 0,text,label
0,i have the honour to speak on behalf of the eu...,1
1,"afghan people and adds to their misery, while ...",1
2,italian national and left a french national in...,1
3,on diplomatic relations and on consular relati...,1
4,"to afghan factions, including involvement of f...",1


Saving pre-processed datasets for training and validation:

In [40]:
import pickle

#with open('/content/gdrive/My Drive/Colab files/bert_training_df_sudan', 'wb') as f:
   #pickle.dump(train_df, f)

#with open('/content/gdrive/My Drive/Colab files/bert_validation_df_sudan', 'wb') as f:
   #pickle.dump(val_df, f)  

with open('/content/gdrive/My Drive/Colab files/bert_training_df', 'wb') as f:
   pickle.dump(train_df, f)

with open('/content/gdrive/My Drive/Colab files/bert_validation_df', 'wb') as f:
   pickle.dump(val_df, f)  

with open('/content/gdrive/My Drive/Colab files/bert_train', 'wb') as f:
   pickle.dump(train, f)

with open('/content/gdrive/My Drive/Colab files/bert_val', 'wb') as f:
   pickle.dump(val, f)  
  