In [1]:
import csv
import json
import sqlite3

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import scipy as sp
import seaborn as sns
from gensim.models import Word2Vec
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)
from scipy import stats
from wordcloud import WordCloud

from utils import run_sql, read_archive, find_dynamic_topic

# PAP

In [None]:
# Load the PAP archive, report its (rows, columns) shape and preview the first rows.
df_pap = read_archive("pap/")
pap_n_rows, pap_n_cols = df_pap.shape
print("PAP", (pap_n_rows, pap_n_cols))
df_pap.head(5)

### Window topics

Number of quarters

In [None]:
# Distinct quarters present in the PAP frame; displayed as a 1-tuple (n_quarters,).
pap_quarter_values = df_pap["quarter"].unique()
pap_quarter_values.shape

Number of window topics

In [None]:
# Total number of window topics: distinct (quarter, window_topic_id) pairs are
# counted per quarter and the per-quarter counts are then summed.
pap_window_topic_pairs = df_pap[["quarter", "window_topic_id"]].drop_duplicates()
pap_topics_per_quarter = (
    pap_window_topic_pairs
    .sort_values(by="window_topic_id")
    .groupby("quarter")[["window_topic_id"]]
    .size()
    .reset_index(name="Num of topics")
)
pap_topics_per_quarter["Num of topics"].sum()

Number of speeches vs. number of window topics per quarter

In [None]:
# Per-quarter counts: speeches (rows of df_pap) and distinct window topics.
df_pap_quarterly_dist_speech = (
    df_pap.groupby("quarter")[["quarter"]]
    .size()
    .reset_index(name="Num of speeches")
)
df_pap_quarterly_dist_topics = (
    df_pap[["quarter", "window_topic_id"]]
    .drop_duplicates()
    .sort_values(by="window_topic_id")
    .groupby("quarter")[["window_topic_id"]]
    .size()
    .reset_index(name="Num of topics")
)

# Speeches on the left y-axis, topics on a twin right y-axis sharing the quarter axis.
fig, ax = plt.subplots(figsize=(18, 4))
line1 = ax.plot(
    df_pap_quarterly_dist_speech["quarter"],
    df_pap_quarterly_dist_speech["Num of speeches"],
    alpha=0.5,
    label="Num of speeches",
)
ax2 = ax.twinx()
line2 = ax2.plot(
    df_pap_quarterly_dist_topics["quarter"],
    df_pap_quarterly_dist_topics["Num of topics"],
    c="g",
    label="Num of topics",
)

# Combine the handles of both axes into one legend drawn on the twin axis.
speech_handles, speech_labels = ax.get_legend_handles_labels()
topic_handles, topic_labels = ax2.get_legend_handles_labels()
ax2.legend(speech_handles + topic_handles, speech_labels + topic_labels, loc="upper center")
ax2.set_ylabel("Num of topics")

ax.grid(axis="y")
ax.set_xlabel("Quarter")
ax.set_ylabel("Num of speeches")
plt.setp(ax.get_xticklabels(), rotation=90)
# Place a major tick at every fifth position along the quarter axis.
ax.xaxis.set_major_locator(MultipleLocator(5))
plt.show()

Correlation between the number of window topics and the number of speeches per quarter

In [None]:
# Pearson correlation between the number of window topics and the number of
# speeches per quarter. The two per-quarter tables are joined on "quarter" first so
# each pair of counts provably belongs to the same quarter, instead of relying on
# both frames sharing the same row order; validate= raises if a quarter repeats.
df_pap_quarterly_counts = df_pap_quarterly_dist_topics.merge(
    df_pap_quarterly_dist_speech,
    on="quarter",
    how="inner",
    validate="one_to_one",
)
stats.pearsonr(
    df_pap_quarterly_counts["Num of topics"],
    df_pap_quarterly_counts["Num of speeches"],
)

Coherence scores of window topics

In [None]:
# Mean window-topic coherence per quarter, computed after de-duplicating the
# (quarter, window_topic_id, window_topic_coherence) rows.
df_pap_quarterly_dist_topic_coherence = (
    df_pap[["quarter", "window_topic_id", "window_topic_coherence"]]
    .drop_duplicates()
    .groupby(["quarter"])[["window_topic_coherence"]]
    .mean()
    .reset_index()
)

fig, ax = plt.subplots(figsize=(18, 3))
ax.plot(
    df_pap_quarterly_dist_topic_coherence["quarter"],
    df_pap_quarterly_dist_topic_coherence["window_topic_coherence"],
)
ax.set_ylim([0, 0.5])
ax.set_xlabel("Quarter")
ax.set_ylabel("Coherence")
plt.setp(ax.get_xticklabels(), rotation=90)
# Place a major tick at every fifth position along the quarter axis.
ax.xaxis.set_major_locator(MultipleLocator(5))
plt.show()

In [None]:
# Summary statistics of the per-quarter mean window-topic coherence.
pap_quarterly_mean_coherence = df_pap_quarterly_dist_topic_coherence["window_topic_coherence"]
pap_quarterly_mean_coherence.describe()

Second-layer (dynamic) topics sorted by coherence score

In [None]:
# One row per (dynamic_topic_id, terms, coherence) group; "Freq" is the number of
# distinct quarters the dynamic topic occurs in. Rows are ordered from the highest
# to the lowest coherence and the table is also written to pap_dt.csv.
df_second_layer = (
    df_pap[["dynamic_topic_id", "dynamic_topic_terms", "dynamic_topic_coherence", "quarter"]]
    .drop_duplicates(["dynamic_topic_id", "quarter"])
    .groupby(["dynamic_topic_id", "dynamic_topic_terms", "dynamic_topic_coherence"])[["quarter"]]
    .size()
    .reset_index(name="Freq")
    .sort_values(by="dynamic_topic_coherence", ascending=False)
    .reset_index(drop=True)
)
df_second_layer.to_csv("pap_dt.csv")
df_second_layer

Mean coherence scores of dynamic topics

In [None]:
# Average of the dynamic_topic_coherence column of df_second_layer.
pap_dynamic_coherence = df_second_layer["dynamic_topic_coherence"]
pap_dynamic_coherence.mean()

Window topics of a particular dynamic topic

In [None]:
# Print the terms of PAP dynamic topic 11, then take the frame returned by
# find_dynamic_topic, put each whitespace-separated term of window_topic_terms on
# its own line, transpose it, save it to table1.csv and preview it.
pap_topic_rows = df_pap[df_pap["dynamic_topic_id"] == 11]
pap_topic_terms = pap_topic_rows[["dynamic_topic_id", "dynamic_topic_terms"]].drop_duplicates()
print("topic:", pap_topic_terms["dynamic_topic_terms"].values[0])

df_dyn_topic = find_dynamic_topic(df_pap, 11)
df_dyn_topic["window_topic_terms"] = df_dyn_topic["window_topic_terms"].map(
    lambda terms: "\n".join(terms.split())
)
df_dyn_topic = df_dyn_topic.T
df_dyn_topic.to_csv("table1.csv")
df_dyn_topic.head()

In [None]:
# Number of PAP speeches per quarter that are assigned to dynamic topic 11.
df_table = (
    df_pap[df_pap["dynamic_topic_id"] == 11]
    .groupby("quarter")
    .size()
    .reset_index(name="Number of speeches")
)

fig, ax = plt.subplots(figsize=(18, 6))
ax.bar(df_table["quarter"], df_table["Number of speeches"])
# Style the x tick labels through tick_params after the bars are drawn: this sets
# the axis-level tick style instead of mutating the label objects that exist on
# the still-empty axes before any data has been plotted.
ax.tick_params(axis="x", labelrotation=90, labelsize=8)
# Label both axes so the figure can be read on its own.
ax.set_xlabel("Quarter")
ax.set_ylabel("Number of speeches")
plt.show()

Priorities of topics

In [None]:
# Number of PAP speeches (rows) per dynamic topic, largest first.
df_by_speeches = (
    df_pap.groupby(["dynamic_topic_id", "dynamic_topic_terms"])[["dynamic_topic_id"]]
    .size()
    .reset_index(name="Num of speeches")
    .sort_values(by="Num of speeches", ascending=False)
    .reset_index(drop=True)
)
df_by_speeches.head()

In [None]:
# Number of distinct window topics per PAP dynamic topic, largest first.
df_by_topics = (
    df_pap[["dynamic_topic_id", "dynamic_topic_terms", "window_topic_id"]]
    .drop_duplicates()
    .groupby(["dynamic_topic_id", "dynamic_topic_terms"])[["dynamic_topic_id"]]
    .size()
    .reset_index(name="Num of window topics")
    .sort_values(by="Num of window topics", ascending=False)
    .reset_index(drop=True)
)
df_by_topics.head()

In [None]:
# Side-by-side horizontal bar charts of the PAP dynamic topics: speeches per topic
# (left) and window topics per topic (right). Each table is drawn in reverse row
# order, and every bar is annotated with its count.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 8))

# (axes, source table, count column, vertical offset of the count annotation)
pap_panels = [
    (ax1, df_by_speeches, "Num of speeches", 0),
    (ax2, df_by_topics, "Num of window topics", -0.2),
]
for panel_ax, panel_df, count_col, text_dy in pap_panels:
    counts = panel_df[count_col][::-1]
    topic_ids = panel_df["dynamic_topic_id"].astype(str)[::-1]
    panel_ax.barh(topic_ids, counts)
    for bar_pos, count in enumerate(counts):
        panel_ax.text(count + 3, bar_pos + text_dy, str(count), fontsize="small")
    panel_ax.grid(axis="x", alpha=0.5)
    panel_ax.set_xlabel(count_col)

ax1.set_ylabel("Dynamic topic")
plt.show()

In [None]:
# Pearson correlation between the number of speeches and the number of window
# topics of each dynamic topic.
# df_by_speeches and df_by_topics are each sorted by their OWN count column and
# re-indexed, so row i of one table is generally a different dynamic topic than
# row i of the other. Correlating them positionally would compare two
# independently sorted sequences. Join on the topic keys first so every pair of
# counts belongs to the same dynamic topic; validate= raises if a topic repeats.
df_topic_priorities = df_by_speeches.merge(
    df_by_topics,
    on=["dynamic_topic_id", "dynamic_topic_terms"],
    how="inner",
    validate="one_to_one",
)
stats.pearsonr(
    df_topic_priorities["Num of speeches"],
    df_topic_priorities["Num of window topics"],
)

# Opposition

In [None]:
# Load the Opposition archive, report its (rows, columns) shape and preview the first rows.
df_opposition = read_archive("opposition/")
opposition_n_rows, opposition_n_cols = df_opposition.shape
print("Opposition", (opposition_n_rows, opposition_n_cols))
df_opposition.head(5)

In [None]:
# Distinct quarters present in the Opposition frame; displayed as a 1-tuple (n_quarters,).
opposition_quarter_values = df_opposition["quarter"].unique()
opposition_quarter_values.shape

In [None]:
# Total number of window topics: distinct (quarter, window_topic_id) pairs are
# counted per quarter and the per-quarter counts are then summed.
opposition_window_topic_pairs = df_opposition[["quarter", "window_topic_id"]].drop_duplicates()
opposition_topics_per_quarter = (
    opposition_window_topic_pairs
    .sort_values(by="window_topic_id")
    .groupby("quarter")[["window_topic_id"]]
    .size()
    .reset_index(name="Num of topics")
)
opposition_topics_per_quarter["Num of topics"].sum()

In [None]:
# Per-quarter counts: speeches (rows of df_opposition) and distinct window topics.
df_opposition_quarterly_dist_speech = (
    df_opposition.groupby("quarter")[["quarter"]]
    .size()
    .reset_index(name="Num of speeches")
)
df_opposition_quarterly_dist_topics = (
    df_opposition[["quarter", "window_topic_id"]]
    .drop_duplicates()
    .sort_values(by="window_topic_id")
    .groupby("quarter")[["window_topic_id"]]
    .size()
    .reset_index(name="Num of topics")
)

# Speeches on the left y-axis, topics on a twin right y-axis sharing the quarter axis.
fig, ax = plt.subplots(figsize=(18, 4))
line1 = ax.plot(
    df_opposition_quarterly_dist_speech["quarter"],
    df_opposition_quarterly_dist_speech["Num of speeches"],
    alpha=0.5,
    label="Num of speeches",
)
ax2 = ax.twinx()
line2 = ax2.plot(
    df_opposition_quarterly_dist_topics["quarter"],
    df_opposition_quarterly_dist_topics["Num of topics"],
    c="g",
    label="Num of topics",
)

# Combine the handles of both axes into one legend drawn on the twin axis.
speech_handles, speech_labels = ax.get_legend_handles_labels()
topic_handles, topic_labels = ax2.get_legend_handles_labels()
ax2.legend(speech_handles + topic_handles, speech_labels + topic_labels, loc="upper center")
ax2.set_ylabel("Num of topics")

ax.grid(axis="y")
ax.set_xlabel("Quarter")
ax.set_ylabel("Num of speeches")
plt.setp(ax.get_xticklabels(), rotation=90)
# Place a major tick at every fifth position along the quarter axis.
ax.xaxis.set_major_locator(MultipleLocator(5))
plt.show()

In [None]:
# Pearson correlation between the number of window topics and the number of
# speeches per quarter. The two per-quarter tables are joined on "quarter" first so
# each pair of counts provably belongs to the same quarter, instead of relying on
# both frames sharing the same row order; validate= raises if a quarter repeats.
df_opposition_quarterly_counts = df_opposition_quarterly_dist_topics.merge(
    df_opposition_quarterly_dist_speech,
    on="quarter",
    how="inner",
    validate="one_to_one",
)
stats.pearsonr(
    df_opposition_quarterly_counts["Num of topics"],
    df_opposition_quarterly_counts["Num of speeches"],
)

In [None]:
# Mean window-topic coherence per quarter, computed after de-duplicating the
# (quarter, window_topic_id, window_topic_coherence) rows.
df_opposition_quarterly_dist_topic_coherence = (
    df_opposition[["quarter", "window_topic_id", "window_topic_coherence"]]
    .drop_duplicates()
    .groupby(["quarter"])[["window_topic_coherence"]]
    .mean()
    .reset_index()
)

fig, ax = plt.subplots(figsize=(18, 3))
ax.plot(
    df_opposition_quarterly_dist_topic_coherence["quarter"],
    df_opposition_quarterly_dist_topic_coherence["window_topic_coherence"],
)
ax.set_ylim([0, 0.6])
ax.set_xlabel("Quarter")
ax.set_ylabel("Coherence")
plt.setp(ax.get_xticklabels(), rotation=90)
# Place a major tick at every fifth position along the quarter axis.
ax.xaxis.set_major_locator(MultipleLocator(5))
plt.show()

In [None]:
# Summary statistics of the per-quarter mean window-topic coherence.
opposition_quarterly_mean_coherence = df_opposition_quarterly_dist_topic_coherence["window_topic_coherence"]
opposition_quarterly_mean_coherence.describe()

In [None]:
# One row per (dynamic_topic_id, terms, coherence) group; "Freq" is the number of
# distinct quarters the dynamic topic occurs in. Rows are ordered from the highest
# to the lowest coherence and the table is also written to opposition_dt.csv.
df_opposition_second_layer = (
    df_opposition[["dynamic_topic_id", "dynamic_topic_terms", "dynamic_topic_coherence", "quarter"]]
    .drop_duplicates(["dynamic_topic_id", "quarter"])
    .groupby(["dynamic_topic_id", "dynamic_topic_terms", "dynamic_topic_coherence"])[["quarter"]]
    .size()
    .reset_index(name="Freq")
    .sort_values(by="dynamic_topic_coherence", ascending=False)
    .reset_index(drop=True)
)
df_opposition_second_layer.to_csv("opposition_dt.csv")
df_opposition_second_layer

In [None]:
# Average of the dynamic_topic_coherence column of df_opposition_second_layer.
opposition_dynamic_coherence = df_opposition_second_layer["dynamic_topic_coherence"]
opposition_dynamic_coherence.mean()

In [None]:
# Print the terms of Opposition dynamic topic 9, then take the frame returned by
# find_dynamic_topic, put each whitespace-separated term of window_topic_terms on
# its own line, transpose it and preview it.
opposition_topic_rows = df_opposition[df_opposition["dynamic_topic_id"] == 9]
opposition_topic_terms = opposition_topic_rows[["dynamic_topic_id", "dynamic_topic_terms"]].drop_duplicates()
print("topic:", opposition_topic_terms["dynamic_topic_terms"].values[0])

df_dyn_topic = find_dynamic_topic(df_opposition, 9)
df_dyn_topic["window_topic_terms"] = df_dyn_topic["window_topic_terms"].map(
    lambda terms: "\n".join(terms.split())
)
df_dyn_topic = df_dyn_topic.T
df_dyn_topic.head()

In [None]:
# Number of Opposition speeches per quarter that are assigned to dynamic topic 9.
df_table = (
    df_opposition[df_opposition["dynamic_topic_id"] == 9]
    .groupby("quarter")
    .size()
    .reset_index(name="Number of speeches")
)

fig, ax = plt.subplots(figsize=(18, 6))
ax.bar(df_table["quarter"], df_table["Number of speeches"])
# Style the x tick labels through tick_params after the bars are drawn: this sets
# the axis-level tick style instead of mutating the label objects that exist on
# the still-empty axes before any data has been plotted.
ax.tick_params(axis="x", labelrotation=90, labelsize=8)
# Label both axes so the figure can be read on its own.
ax.set_xlabel("Quarter")
ax.set_ylabel("Number of speeches")
plt.show()

In [None]:
# Number of Opposition speeches (rows) per dynamic topic, largest first.
df_by_speeches = (
    df_opposition.groupby(["dynamic_topic_id", "dynamic_topic_terms"])[["dynamic_topic_id"]]
    .size()
    .reset_index(name="Num of speeches")
    .sort_values(by="Num of speeches", ascending=False)
    .reset_index(drop=True)
)
df_by_speeches.head()

In [None]:
# Number of distinct window topics per Opposition dynamic topic, largest first.
df_by_topics = (
    df_opposition[["dynamic_topic_id", "dynamic_topic_terms", "window_topic_id"]]
    .drop_duplicates()
    .groupby(["dynamic_topic_id", "dynamic_topic_terms"])[["dynamic_topic_id"]]
    .size()
    .reset_index(name="Num of window topics")
    .sort_values(by="Num of window topics", ascending=False)
    .reset_index(drop=True)
)
df_by_topics.head()

In [None]:
# Side-by-side horizontal bar charts of the Opposition dynamic topics: speeches per
# topic (left) and window topics per topic (right). Each table is drawn in reverse
# row order, and every bar is annotated with its count.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 8))

# (axes, source table, count column, vertical offset of the count annotation)
opposition_panels = [
    (ax1, df_by_speeches, "Num of speeches", 0),
    (ax2, df_by_topics, "Num of window topics", -0.2),
]
for panel_ax, panel_df, count_col, text_dy in opposition_panels:
    counts = panel_df[count_col][::-1]
    topic_ids = panel_df["dynamic_topic_id"].astype(str)[::-1]
    panel_ax.barh(topic_ids, counts)
    for bar_pos, count in enumerate(counts):
        panel_ax.text(count + 3, bar_pos + text_dy, str(count), fontsize="small")
    panel_ax.grid(axis="x", alpha=0.5)
    panel_ax.set_xlabel(count_col)

ax1.set_ylabel("Dynamic topic")
plt.show()

# Framing Identification

* PAP: dynamic topic 3, "Legislative", which relates to elections
* Opposition: dynamic topic 9, "Election"

In [None]:
# Print the terms of PAP dynamic topic 3, then take the frame returned by
# find_dynamic_topic, put each whitespace-separated term of window_topic_terms on
# its own line, transpose it, save it to pap_3.csv and preview it.
pap_topic_rows = df_pap[df_pap["dynamic_topic_id"] == 3]
pap_topic_terms = pap_topic_rows[["dynamic_topic_id", "dynamic_topic_terms"]].drop_duplicates()
print("topic:", pap_topic_terms["dynamic_topic_terms"].values[0])

df_dyn_topic = find_dynamic_topic(df_pap, 3)
df_dyn_topic["window_topic_terms"] = df_dyn_topic["window_topic_terms"].map(
    lambda terms: "\n".join(terms.split())
)
df_dyn_topic = df_dyn_topic.T
df_dyn_topic.to_csv("pap_3.csv")
df_dyn_topic.head()

In [None]:
# Rank (quarter, title) pairs by the number of distinct window topics they span,
# drop every title that contains one of the excluded keywords, and save the
# remaining ranking to a.csv.
excluded_title_keywords = ["budget", "head", "bill", "president's address", "supplementary"]

df_popular_subject_pap = (
    df_pap[["quarter", "title", "window_topic_id", "window_topic_terms"]]
    .drop_duplicates(["quarter", "title", "window_topic_id"])
    .groupby(["quarter", "title"])["window_topic_id"]
    .size()
    .reset_index(name="num of topics")
    .sort_values("num of topics", ascending=False)
)
for keyword in excluded_title_keywords:
    # na=True counts a missing title as a match, so it is excluded as well.
    title_has_keyword = df_popular_subject_pap["title"].str.contains(keyword, na=True)
    df_popular_subject_pap = df_popular_subject_pap[~title_has_keyword]
df_popular_subject_pap.to_csv("a.csv")

In [None]:
# Debate title used by the following cells to select rows of df_opposition and df_pap
# (compared with == against the "title" column, so it must match exactly).
title = "a sustainable population for a dynamic singapore"

In [None]:
# Distinct window_topic_terms values of the Opposition rows whose title equals `title`.
opposition_title_rows = df_opposition.loc[
    df_opposition["title"] == title, ["title", "window_topic_terms"]
]
opposition_title_rows.drop_duplicates()["window_topic_terms"].values

In [None]:
# Distinct window_topic_terms values of the PAP rows whose title equals `title`.
pap_title_rows = df_pap.loc[
    df_pap["title"] == title, ["title", "window_topic_terms"]
]
pap_title_rows.drop_duplicates()["window_topic_terms"].values