# Understanding Japleen's Preprocessing

In [1]:
import csv
import datetime
import functools
import pickle
import re
from csv import reader

import click
import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
import tqdm
from joblib import Parallel, delayed
from nltk.stem.snowball import SnowballStemmer
from scipy import spatial
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer


## Parameters

In [2]:
# Cut-off date, written day-month-year: 1 January 2010.
# NOTE(review): presumably the boundary between the two time periods being
# compared — confirm against the cells that read split_date.
split_date = datetime.datetime.strptime('01-01-2010', '%d-%m-%Y')

## Tokenizing ParlSpeech

In [3]:
# Load the ParlSpeech V2 House of Commons CSV into a DataFrame.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
x = pd.read_csv('/home/hubert/semantic-change-hansard/data/parlspeechv2/Corp_HouseOfCommons_V2.csv')

In [4]:
x.columns

Index(['date', 'agenda', 'speechnumber', 'speaker', 'party', 'party.facts.id',
       'chair', 'terms', 'text', 'parliament', 'iso3country'],
      dtype='object')

In [5]:
x.shape

(1956223, 11)

In [6]:
x.head()

Unnamed: 0,date,agenda,speechnumber,speaker,party,party.facts.id,chair,terms,text,parliament,iso3country
0,1988-11-22,Queen's Speech,1,CHAIR,,,True,74,I have to acquaint the House that this House h...,UK-HouseOfCommons,GBR
1,1988-11-22,First Day [Debate On The Address],2,CHAIR,,,True,55,It may be for the convenience of the House if ...,UK-HouseOfCommons,GBR
2,1988-11-22,First Day [Debate On The Address],3,Giles Shaw,Con,1567.0,False,2511,"I beg to move,. That an humble Address be pres...",UK-HouseOfCommons,GBR
3,1988-11-22,First Day [Debate On The Address],4,John Maples,Con,1567.0,False,1470,I am delighted to second the motion. When I ha...,UK-HouseOfCommons,GBR
4,1988-11-22,First Day [Debate On The Address],5,Neil Kinnock,Lab,1516.0,False,2768,I am sure that I speak for the majority of hon...,UK-HouseOfCommons,GBR


In [7]:
# English language object; only its vocab is used, to build a bare spaCy Tokenizer.
# NOTE(review): no prefix/suffix/infix rules are passed, so this tokenizer
# presumably splits on whitespace only — confirm against the spaCy Tokenizer docs.
nlp = English()
tk = Tokenizer(nlp.vocab)

In [8]:
# Commented-out attempt: apply the spaCy tokenizer directly to the text column.
# x.text_tokenized = x.text.apply(tk, batch_size=50)
# NOTE(review): tokenized_texts is never filled in the visible cells — confirm it is still needed.
tokenized_texts = []

In [11]:
from concurrent.futures import ProcessPoolExecutor
import multiprocessing

# CPU count as returned by multiprocessing.cpu_count(), printed for reference.
# NOTE(review): n_cpus is only printed in the visible cells; no worker pool is created with it.
n_cpus = multiprocessing.cpu_count()
print(f'Available CPUs: {n_cpus}')

# NOTE(review): this module-level value is not read by tokenize_batch, which
# sizes its batch from len(batch) — confirm whether it is used elsewhere.
batch_size=50

def divide_chunks(texts, n):
    """Yield consecutive slices of ``texts`` holding at most ``n`` items each.

    Slices are produced in order; the last one is shorter when ``len(texts)``
    is not a multiple of ``n``. An empty ``texts`` yields nothing.
    """
    total = len(texts)
    starts = range(0, total, n)
    for start in starts:
        stop = start + n
        yield texts[start:stop]

def tokenize_batch(batch):
    """Run the notebook-level tokenizer ``tk`` over ``batch`` as one pipe batch.

    The batch size handed to ``tk.pipe`` is the number of texts in ``batch``.
    Returns whatever ``tk.pipe`` returns; the caller wraps it in ``list`` to
    materialise the tokenized texts.
    """
    n_texts = len(batch)
    return tk.pipe(batch, batch_size=n_texts)

# test one batch
list(tokenize_batch(x.text[:10]))[0]



Available CPUs: 20


I have to acquaint the House that this House has this day attended Her Majesty in the House of Peers, and that Her Majesty was pleased to make a Most Gracious Speech from the Throne to both Houses of Parliament, of which I have, for greater accuracy, obtained a copy. I shall direct that the terms of the Gracious Speech be printed in the Votes and Proceedings. Copies are available in the Vote Office.

In [13]:
import nltk
from nltk.tokenize import RegexpTokenizer

# Word pattern compiled once, up front, instead of building a tokenizer on every call.
_WORD_RE = re.compile(r'\w+')


def tki(sentence):
    """Lower-case ``sentence`` and return its runs of word characters.

    Args:
        sentence: The text to tokenize (a ``str``).

    Returns:
        A list of lower-cased tokens in order of appearance. Anything that is
        not a word character (whitespace, punctuation) separates tokens and is
        dropped, so ``"3.45 pm."`` becomes ``['3', '45', 'pm']``.

    Raises:
        AttributeError: If ``sentence`` has no ``lower`` method, e.g. a NaN
            float coming from an empty cell.
    """
    return _WORD_RE.findall(sentence.lower())

# Tokenize every speech in the text column; Series.apply calls tki once per row.
tokenised_text = x.text.apply(tki)

In [15]:
# Store the token lists as a new 'tokenized' column (adds the column to x in place).
x['tokenized'] = tokenised_text

In [16]:
x

Unnamed: 0,date,agenda,speechnumber,speaker,party,party.facts.id,chair,terms,text,parliament,iso3country,tokenized
0,1988-11-22,Queen's Speech,1,CHAIR,,,True,74,I have to acquaint the House that this House h...,UK-HouseOfCommons,GBR,"[i, have, to, acquaint, the, house, that, this..."
1,1988-11-22,First Day [Debate On The Address],2,CHAIR,,,True,55,It may be for the convenience of the House if ...,UK-HouseOfCommons,GBR,"[it, may, be, for, the, convenience, of, the, ..."
2,1988-11-22,First Day [Debate On The Address],3,Giles Shaw,Con,1567.0,False,2511,"I beg to move,. That an humble Address be pres...",UK-HouseOfCommons,GBR,"[i, beg, to, move, that, an, humble, address, ..."
3,1988-11-22,First Day [Debate On The Address],4,John Maples,Con,1567.0,False,1470,I am delighted to second the motion. When I ha...,UK-HouseOfCommons,GBR,"[i, am, delighted, to, second, the, motion, wh..."
4,1988-11-22,First Day [Debate On The Address],5,Neil Kinnock,Lab,1516.0,False,2768,I am sure that I speak for the majority of hon...,UK-HouseOfCommons,GBR,"[i, am, sure, that, i, speak, for, the, majori..."
...,...,...,...,...,...,...,...,...,...,...,...,...
1956218,2019-12-17,Election of Speaker,14,Iain Stewart,Con,1567.0,False,34,I have to signify to the House the pleasure of...,UK-HouseOfCommons,GBR,"[i, have, to, signify, to, the, house, the, pl..."
1956219,2019-12-17,Election of Speaker,15,CHAIR,,,True,7,The House is suspended until 3.45 pm.,UK-HouseOfCommons,GBR,"[the, house, is, suspended, until, 3, 45, pm]"
1956220,2019-12-17,MESSAGE TO ATTEND THE LORDS COMMISSIONERS,16,CHAIR,,,True,82,I have to report that this House has been in t...,UK-HouseOfCommons,GBR,"[i, have, to, report, that, this, house, has, ..."
1956221,2019-12-17,Members Sworn,17,CHAIR,,,True,54,"I must now call on the Members of the House, a...",UK-HouseOfCommons,GBR,"[i, must, now, call, on, the, members, of, the..."


## Experiments with previous Hansard data

In [17]:
import pickle
# Load the pickled Hansard speeches object into `hansard`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only
# load this file if its origin is trusted.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
with open('/home/hubert/japleen/semantic-change-hansard-dev/Resources/hansard-speeches-post2010.pkl', 'rb') as f:
    hansard = pickle.load(f)

In [18]:
# Read the pre-tokenized speeches: every CSV row becomes one list of string tokens.
# newline='' is how the csv module recommends opening files it reads, and the
# encoding is pinned to UTF-8 (as the file name indicates) instead of relying on
# the platform default.
with open('/home/hubert/japleen/semantic-change-hansard-dev/Resources/TokenizedSpeeches_utf-8.csv', 'r', newline='', encoding='utf-8') as read_obj:
    # pass the file object to reader() to get the reader object
    csv_reader = reader(read_obj)
    # The reader iterates over rows, so list() collects them in file order.
    lemmasList = list(csv_reader)
    print(len(lemmasList), 'Rows read')

630368 Rows read


In [19]:
# Wrap the token lists in a single-column frame named 'Lemmas';
# row order follows lemmasList.
dictOfLemmas = {'Lemmas': lemmasList}
lemmasDf = pd.DataFrame(data=dictOfLemmas)
lemmasDf

Unnamed: 0,Lemmas
0,"[acquaint, house, house, day, attended, majest..."
1,"[call, mover, seconder, want, announce, propos..."
2,"[beg, move, humble, address, presented, majest..."
3,"[delighted, follow, right, member, hitchin, ha..."
4,"[sure, whole, house, join, paying, tribute, kn..."
...,...
630363,"[congratulate, gentleman, securing, debate, ma..."
630364,"[thank, gentleman, agree, underlying, lack, re..."
630365,"[congratulations, new, position, mr, speaker, ..."
630366,"[friend, right, open, clear, challenges, face,..."


In [20]:
# hansard's index has missing values and so does not line up with lemmasDf's
# index. Renumber it first; join() then matches rows on the fresh index.
df = hansard.reset_index(drop=True).join(lemmasDf)
df['Lemmas']

0         [acquaint, house, house, day, attended, majest...
1         [call, mover, seconder, want, announce, propos...
2         [beg, move, humble, address, presented, majest...
3         [delighted, follow, right, member, hitchin, ha...
4         [sure, whole, house, join, paying, tribute, kn...
                                ...                        
630363    [congratulate, gentleman, securing, debate, ma...
630364    [thank, gentleman, agree, underlying, lack, re...
630365    [congratulations, new, position, mr, speaker, ...
630366    [friend, right, open, clear, challenges, face,...
630367    [thank, friend, assiduous, campaigner, issue, ...
Name: Lemmas, Length: 630368, dtype: object

In [21]:
df.iloc[0]['speech']

"I have to acquaint the House that this House has this day attended Her Majesty in the House of Peers, and that Her Majesty was pleased to make a Most Gracious Speech from the Throne to both Houses of Parliament, of which I have, for greater accuracy, obtained a copy.I shall direct that the terms of the Gracious Speech be printed in the Votes and Proceedings. Copies are available in the Vote Office.The Gracious Speech was as follows:My Lords and Members of the House of Commons My Government's legislative programme will be based upon the principles of freedom, fairness and responsibility. The first priority is to reduce the deficit and restore economic growth. Action will be taken to accelerate the reduction of the structural budget deficit. A new Office for Budget Responsibility will provide confidence in the management of the public finances. The tax and benefits system will be made fairer and simpler. Changes to National Insurance will safeguard jobs and support the economy. People w

In [24]:
df.iloc[0]['Lemmas']

['acquaint',
 'house',
 'house',
 'day',
 'attended',
 'majesty',
 'house',
 'peers',
 'majesty',
 'pleased',
 'make',
 'gracious',
 'speech',
 'throne',
 'houses',
 'parliament',
 'greater',
 'accuracy',
 'obtained',
 'shall',
 'direct',
 'terms',
 'gracious',
 'speech',
 'printed',
 'votes',
 'proceedings',
 'copies',
 'available',
 'vote',
 'gracious',
 'speech',
 'follows',
 'lords',
 'members',
 'house',
 'commons',
 'government',
 'legislative',
 'programme',
 'based',
 'upon',
 'principles',
 'freedom',
 'fairness',
 'responsibility',
 'first',
 'priority',
 'reduce',
 'deficit',
 'restore',
 'economic',
 'growth',
 'action',
 'taken',
 'accelerate',
 'reduction',
 'structural',
 'budget',
 'deficit',
 'new',
 'office',
 'budget',
 'responsibility',
 'provide',
 'confidence',
 'management',
 'public',
 'finances',
 'tax',
 'benefits',
 'system',
 'made',
 'fairer',
 'simpler',
 'changes',
 'national',
 'insurance',
 'safeguard',
 'jobs',
 'support',
 'economy',
 'people',
 'supp