# Network approach
Reference: <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6051742/>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import networkx as nx
import re

In [2]:
# Location of the speeches CSV read by the loading cell below (relative path,
# resolved against the notebook's working directory).
DATASET_PATH = "parliament_13.csv"

In [3]:
# Load the speeches table; the first CSV column becomes the row index.
df = pd.read_csv(DATASET_PATH, index_col=0)
# Preview the first rows to check the columns (section, date, topic, name, speech, length).
df.head()

Unnamed: 0,section,date,topic,name,speech,length
0,answers to questions,2020-06-05,consequences for and enforcement against emplo...,josephine teo,the vast majority of employers are now paying ...,142
1,budget,2020-06-05,second supplementary estimates of expenditure ...,anthea ong,"thank you, mr chairman. i thank the senior min...",79
2,budget,2020-06-05,second supplementary estimates of expenditure ...,indranee rajah,"mr chairman, if i may now address ms ong's que...",261
3,budget,2020-06-05,second supplementary estimates of expenditure ...,anthea ong,"mr chairman, i beg leave to withdraw the amend...",36
4,budget,2020-06-05,second supplementary estimates of expenditure ...,anthea ong,"chairman, i beg to move, ""that the total sum t...",440


In [5]:
# Turn the speech stored at row label 0 into a list of cleaned sentences:
#   1. split the text on "." to get sentence fragments,
#   2. trim surrounding whitespace and skip fragments that are empty afterwards,
#   3. delete every character that is neither a word character nor whitespace.
documents = [
    re.sub(r"[^\w\s]", "", trimmed)
    for trimmed in map(str.strip, df.loc[0, "speech"].split("."))
    if trimmed
]
documents

['the vast majority of employers are now paying salaries electronically',
 'in the initial month of implementation a very small number of employers were unable to do so due to technical or administrative issues',
 'for example their workers bank account applications were still pending approval by the bank',
 'in these cases the ministry of manpower mom allowed the employers to continue to pay outstanding salaries in cash',
 'due to its urgent necessity this new requirement was introduced at very short notice',
 'nevertheless the vast majority of employers have complied',
 'mom will continue to follow up and remind the remaining employers to make the switch if they have not already done so especially now that circuit breaker measures have been lifted',
 'mom will in due course take enforcement action against noncompliance as it would for other conditions under the employment of foreign manpower act']

In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
# English stop-word list shipped with NLTK.
stop_words = stopwords.words('english')

# Step 1 - tokenize: one list of tokens per cleaned sentence.
tokens = list(map(wordpunct_tokenize, documents))

# Step 2 - filter: keep only the tokens that are not English stop words.
tokens_nostops = [
    [token for token in sentence_tokens if token not in stop_words]
    for sentence_tokens in tokens
]

# Step 3 - lemmatize: map every remaining token through the WordNet lemmatizer
# (called with its default part-of-speech setting).
lemmatizer = WordNetLemmatizer()
tokens_nostops_lemmatized = [
    list(map(lemmatizer.lemmatize, sentence_tokens))
    for sentence_tokens in tokens_nostops
]

# Step 4 - rebuild one space-separated string per sentence; this is the corpus
# that the vectorizer consumes.
corpus = list(map(' '.join, tokens_nostops_lemmatized))
corpus

['vast majority employer paying salary electronically',
 'initial month implementation small number employer unable due technical administrative issue',
 'example worker bank account application still pending approval bank',
 'case ministry manpower mom allowed employer continue pay outstanding salary cash',
 'due urgent necessity new requirement introduced short notice',
 'nevertheless vast majority employer complied',
 'mom continue follow remind remaining employer make switch already done especially circuit breaker measure lifted',
 'mom due course take enforcement action noncompliance would condition employment foreign manpower act']

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

# Count-based bag-of-words model with scikit-learn's default settings.
vectorizer = CountVectorizer()
# Learn the vocabulary from the corpus and build the document-term count matrix.
X = vectorizer.fit_transform(corpus)
# Shape is (number of sentences, vocabulary size).
X.shape

(8, 64)

In [19]:
# Show the vocabulary learned by the vectorizer (the terms behind the columns of X).
vectorizer.get_feature_names_out()

array(['account', 'act', 'action', 'administrative', 'allowed', 'already',
       'application', 'approval', 'bank', 'breaker', 'case', 'cash',
       'circuit', 'complied', 'condition', 'continue', 'course', 'done',
       'due', 'electronically', 'employer', 'employment', 'enforcement',
       'especially', 'example', 'follow', 'foreign', 'implementation',
       'initial', 'introduced', 'issue', 'lifted', 'majority', 'make',
       'manpower', 'measure', 'ministry', 'mom', 'month', 'necessity',
       'nevertheless', 'new', 'noncompliance', 'notice', 'number',
       'outstanding', 'pay', 'paying', 'pending', 'remaining', 'remind',
       'requirement', 'salary', 'short', 'small', 'still', 'switch',
       'take', 'technical', 'unable', 'urgent', 'vast', 'worker', 'would'],
      dtype=object)

In [22]:
# Build an undirected graph with one node per sentence. Node labels are the row
# indices 0 .. X.shape[0] - 1 of the document-term matrix X.
Gx = nx.Graph()
# NOTE: add_nodes_from() requires an iterable of nodes; calling it with no
# argument raises TypeError and stops the cell before any node is added.
# TODO(review): if a second node set is intended (e.g. one node per vocabulary
# term), add it here with an explicit iterable.
Gx.add_nodes_from(np.arange(X.shape[0]))