In [None]:
%reload_ext autoreload
%autoreload 2

import os
import sys

# Work from the directory that holds config.json. The existence check keeps this cell
# re-runnable: we only step up to the parent when config.json is not in the current folder.
config_candidate = os.path.join(os.getcwd(), 'config.json')
if not os.path.exists(config_candidate):
  os.chdir("..")

# Make the (possibly changed) working directory importable, adding it at most once.
working_dir = os.getcwd()
if working_dir not in sys.path:
  sys.path.append(working_dir)

In [None]:
# Main config
import json
import csv

# Load the project configuration and point it at the "wikidump" database.
config = None
with open('config.json') as config_file:
  config = json.load(config_file)
config["database"]["database"] = "wikidump"

# Per-topic records keyed by "<Mitigator> vs <Disinformer>", plus the list of those keys.
datavoids_per_topic = {}
available_topics = []

# open the contrasting_arguments_labeled_stats_selected.csv
contrasting_arguments_labeled_stats_selected = []
print('Selected topics')
with open('data/contrasting_arguments_labeled_stats_selected.csv', 'r') as selection_file:
  for row in csv.DictReader(selection_file):
    # Only rows flagged with Selected == 1 take part in the analysis.
    if int(row['Selected']) != 1:
      continue
    mitigator = row['Mitigator']
    disinformer = row['Disinformer']
    topic_title = mitigator + ' vs ' + disinformer
    available_topics.append(topic_title)
    print('  - ', mitigator, 'vs', disinformer)
    datavoids_per_topic[topic_title] = {
      'mitigator': mitigator,
      'disinformer': disinformer,
      'mitigator_id': row['Mitigator_ID'],
      'disinformer_id': row['Disinformer_ID'],
      'labeled_mitigator': row['Labeled_Mitigator'],
      'labeled_disinformer': row['Labeled_Disinformer'],
      # Starts empty; the datavoid search cell stores its result here.
      'datavoids': []
    }

In [None]:
# RUN THIS CELL ONLY TO REDO THE DATAVOID SEARCH.
# THIS CODE TAKES DAYS.
# RESULTS ARE SAVED IN data/datavoids_per_topic.json

import json
import os
import time

import pandas as pd

from datavoids.find_datavoids import find_datavoids
from analysis.analysis_commons import clone_config_with_target
from labeler.database_labeler import label_database

DATAVOIDS_RESULTS_PATH = 'data/datavoids_per_topic.json'

# The loop below rewrites the whole results file after each topic. Topics that are not
# searched in this run still carry the empty 'datavoids' list they were created with, so
# without this step their earlier (multi-day) results on disk would be overwritten by [].
# Carry over whatever is already stored for topics that have nothing in memory yet.
if os.path.exists(DATAVOIDS_RESULTS_PATH):
  with open(DATAVOIDS_RESULTS_PATH, 'r') as f:
    previous_results = json.load(f)
  for prev_title, prev_info in datavoids_per_topic.items():
    stored = previous_results.get(prev_title)
    if stored and not prev_info.get('datavoids'):
      prev_info['datavoids'] = stored.get('datavoids') or []

for dv_title, dv_info in datavoids_per_topic.items():
  # Restrict this run to a single topic; edit the condition to search other topics.
  if dv_title != 'Direct Democracy vs Representative Democracy':
    continue
  # if dv_title == 'Declarative Language vs Procedural Language' \
  #   or dv_title == 'Bayesian Statistics vs Frequentist Statistics' \
  #   or dv_title == 'Optimism vs Pessimism':
  #   continue
  print(dv_info)
  print('Finding datavoids for', dv_title)
  # Topic-specific config with the fixed group labels 'mit' / 'dis'.
  config_temp = clone_config_with_target(config, 'mit', 'dis', dv_info['mitigator_id'], dv_info['disinformer_id'])
  label_database(
      config_temp, 
      filter_by_topk_words=3, 
      skip_labeling=False
    )

  res = find_datavoids(config_temp, 
        min_freq_a=0.01,
        min_freq_b=0.01, 
        max_freq_ungrp=0.001,
        ratio_k=4, # penalty of freq_A(w) be different than freq_B(w).
        ratio_t=2, # influence of the ungrouped frequency
        rewrite_datavoids=True
      )
  dv_info['datavoids'] = res

  # Preview the first 100 rows of the result for this topic.
  df = pd.DataFrame(res, columns=['Gram', 'freq_A', 'freq_B', 'freq_U', 'ratio'])
  pd.set_option('display.max_rows', 100)
  print(df.head(100))

  # Checkpoint after every topic so a crash does not lose finished searches.
  with open(DATAVOIDS_RESULTS_PATH, 'w') as f:
    json.dump(datavoids_per_topic, f, indent=2)

  time.sleep(5)

In [None]:
import pandas as pd

def filter_datavoids(topic, df, min_freq_grouped, max_freq_ungrouped, max_num=10, datavoids_source=None):
  """Select the best-ranked datavoid grams for one topic.

  Args:
    topic: key of the topic whose rows are read from ``datavoids_source``.
    df: ignored. Kept only so existing positional calls keep working; the table is
      always rebuilt from the stored rows, never taken from this argument.
    min_freq_grouped: inclusive lower bound applied to both ``freq_A`` and ``freq_B``.
    max_freq_ungrouped: inclusive upper bound applied to ``freq_U``.
    max_num: maximum number of rows returned.
    datavoids_source: mapping ``topic -> {'datavoids': rows}`` where each row is
      ``[Gram, freq_A, freq_B, freq_U, ratio]``. Defaults to the notebook-level
      ``datavoids_per_topic``.

  Returns:
    A DataFrame with columns Gram, freq_A, freq_B, freq_U, ratio holding the rows that
    pass all three thresholds, sorted by ``ratio`` descending and cut to ``max_num`` rows.
  """
  if datavoids_source is None:
    # Fall back to the notebook global, which is what existing calls rely on.
    datavoids_source = datavoids_per_topic

  candidates = pd.DataFrame(
      datavoids_source[topic]['datavoids'],
      columns=['Gram', 'freq_A', 'freq_B', 'freq_U', 'ratio']
  )
  passes_thresholds = (
      (candidates['freq_A'] >= min_freq_grouped) &
      (candidates['freq_B'] >= min_freq_grouped) &
      (candidates['freq_U'] <= max_freq_ungrouped)
  )
  ranked = candidates[passes_thresholds].sort_values(by='ratio', ascending=False)
  return ranked.head(max_num)

In [None]:
import json
from analysis.analysis_commons import clone_config_with_target, label_for_topic

# Show floats with ten decimals so the very small frequencies stay readable.
pd.set_option('display.float_format', '{:.10f}'.format)

# Replace the in-memory topic records with the search results stored on disk.
with open('data/datavoids_per_topic.json', 'r') as results_file:
  datavoids_per_topic = json.load(results_file)

# A JSON round-trip yields an independent deep copy that the filter cells can modify.
filtered_datavoids_per_topic = json.loads(json.dumps(datavoids_per_topic))

for topic, topic_info in datavoids_per_topic.items():
  # Topics that are not among the selected ones are removed from the filtered copy.
  if topic not in available_topics:
    del filtered_datavoids_per_topic[topic]
    continue
  print(topic)
  mitigator_id = int(topic_info['mitigator_id'])
  disinformer_id = int(topic_info['disinformer_id'])
  mitigator_label = label_for_topic(topic_info['mitigator'])
  disinformer_label = label_for_topic(topic_info['disinformer'])
  datavoids_data = topic_info['datavoids']
  # `df` stays defined at notebook level: the per-topic filter cells pass it along.
  df = pd.DataFrame(datavoids_data, columns=['Gram', 'freq_A', 'freq_B', 'freq_U', 'ratio'])

  if datavoids_data is not None:
    print("  Datavoids available:", len(datavoids_data))

In [None]:
topic = "Declarative Language vs Procedural Language"
print(topic)

# Thresholds for this topic: min_freq_grouped=0.25 (lower bound on freq_A and freq_B),
# max_freq_ungrouped=0.001 (upper bound on freq_U).
datavoids_data = datavoids_per_topic[topic]['datavoids']
df = filter_datavoids(topic, df, 0.25, 0.001)
print("After filtering:", len(datavoids_data), "->", len(df))

# Keep the surviving grams under 'datavoids' and store each statistic column next to them.
topic_entry = filtered_datavoids_per_topic[topic]
topic_entry['datavoids'] = list(df['Gram'])
for stat_column in ('freq_A', 'freq_B', 'freq_U', 'ratio'):
  topic_entry[stat_column] = list(df[stat_column])
df

In [None]:
topic = "Optimism vs Pessimism"
print(topic)

# Thresholds for this topic: min_freq_grouped=0.058 (lower bound on freq_A and freq_B),
# max_freq_ungrouped=0.001 (upper bound on freq_U).
datavoids_data = datavoids_per_topic[topic]['datavoids']
df = filter_datavoids(topic, df, 0.058, 0.001)
print("After filtering:", len(datavoids_data), "->", len(df))

# Keep the surviving grams under 'datavoids' and store each statistic column next to them.
topic_entry = filtered_datavoids_per_topic[topic]
topic_entry['datavoids'] = list(df['Gram'])
for stat_column in ('freq_A', 'freq_B', 'freq_U', 'ratio'):
  topic_entry[stat_column] = list(df[stat_column])
df

In [None]:
topic = "Rationalism vs Empiricism"
print(topic)

# Thresholds for this topic: min_freq_grouped=0.17 (lower bound on freq_A and freq_B),
# max_freq_ungrouped=0.001 (upper bound on freq_U).
datavoids_data = datavoids_per_topic[topic]['datavoids']
df = filter_datavoids(topic, df, 0.17, 0.001)
print("After filtering:", len(datavoids_data), "->", len(df))

# Keep the surviving grams under 'datavoids' and store each statistic column next to them.
topic_entry = filtered_datavoids_per_topic[topic]
topic_entry['datavoids'] = list(df['Gram'])
for stat_column in ('freq_A', 'freq_B', 'freq_U', 'ratio'):
  topic_entry[stat_column] = list(df[stat_column])
df

In [None]:
topic = "Classical Economics vs Keynesian Economics"
print(topic)

# Thresholds for this topic: min_freq_grouped=0.15 (lower bound on freq_A and freq_B),
# max_freq_ungrouped=0.001 (upper bound on freq_U).
datavoids_data = datavoids_per_topic[topic]['datavoids']
df = filter_datavoids(topic, df, 0.15, 0.001)
print("After filtering:", len(datavoids_data), "->", len(df))

# Keep the surviving grams under 'datavoids' and store each statistic column next to them.
topic_entry = filtered_datavoids_per_topic[topic]
topic_entry['datavoids'] = list(df['Gram'])
for stat_column in ('freq_A', 'freq_B', 'freq_U', 'ratio'):
  topic_entry[stat_column] = list(df[stat_column])
df

In [None]:
# NOTE(review): this cell repeats the preceding "Classical Economics vs Keynesian Economics"
# cell verbatim (same topic, same 0.15 / 0.001 thresholds), so running it stores the same
# values again. Confirm whether another topic was intended here or the cell can be removed.
topic = "Classical Economics vs Keynesian Economics"
print(topic)

# Raw rows for the topic; used below only to report the size before filtering.
datavoids_data = datavoids_per_topic[topic]['datavoids']
df = filter_datavoids(topic, df, 0.15, 0.001)
print("After filtering:", len(datavoids_data) , "->", len(df)) 
# Store the kept grams and their statistic columns on the filtered topic record.
filtered_datavoids_per_topic[topic]['datavoids'] = list(df['Gram'])
filtered_datavoids_per_topic[topic]['freq_A'] = list(df['freq_A'])
filtered_datavoids_per_topic[topic]['freq_B'] = list(df['freq_B'])
filtered_datavoids_per_topic[topic]['freq_U'] = list(df['freq_U'])
filtered_datavoids_per_topic[topic]['ratio'] = list(df['ratio'])
df

In [None]:
# Write filtered_datavoids_per_topic to disk as indented JSON.
with open('data/datavoids_per_topic_filtered.json', 'w') as filtered_file:
  json.dump(filtered_datavoids_per_topic, filtered_file, indent=2)

In [None]:
from analysis.analysis_commons import *

# One summary record per topic that has filtered datavoids: the keyword list, labeled
# group sizes, mean keyword frequencies and the share of labeled nodes matching no keyword.
datavoids_summary = []

# Statistic lists that only exist on a topic record after its filter cell has run.
FILTERED_STAT_KEYS = ('freq_A', 'freq_B', 'freq_U', 'ratio')

for topic in filtered_datavoids_per_topic:
  print(topic)
  topic_filtered = filtered_datavoids_per_topic[topic]
  datavoids = topic_filtered.get('datavoids') or []
  num_datavoids = len(datavoids)

  # Topics that never went through a filter cell still hold the raw result rows and have
  # no statistic lists; topics filtered down to nothing would divide by zero below.
  if num_datavoids == 0 or any(key not in topic_filtered for key in FILTERED_STAT_KEYS):
    print("  Skipping: no filtered datavoids stored for this topic")
    continue

  mitigator_id = int(datavoids_per_topic[topic]['mitigator_id'])
  disinformer_id = int(datavoids_per_topic[topic]['disinformer_id'])
  mitigator_label = label_for_topic(datavoids_per_topic[topic]['mitigator'])
  disinformer_label = label_for_topic(datavoids_per_topic[topic]['disinformer'])

  # Mean of each statistic over the kept grams.
  freq_keywords_in_A = sum(topic_filtered['freq_A']) / num_datavoids
  freq_keywords_in_B = sum(topic_filtered['freq_B']) / num_datavoids
  freq_keywords_in_ungrouped = sum(topic_filtered['freq_U']) / num_datavoids
  ratio = sum(topic_filtered['ratio']) / num_datavoids

  curr_config = clone_config_with_target(config, mitigator_label, disinformer_label, mitigator_id, disinformer_id)

  # The config stores datavoids as a list of keyword lists; this notebook uses one list.
  curr_config['datavoids'] = [ datavoids ]
  print("Datavoids:", curr_config['datavoids'])
  # '|' joins the keywords into an OR expression for to_tsquery.
  keyword = "|".join(curr_config["datavoids"][0])

  groups_count = label_database(curr_config, remove_multilabeled_nodes=False)
  # NOTE(review): labeled_A is read from the disinformer label and labeled_B from the
  # mitigator label, while the LaTeX table prints labeled_A under "Labeled M" and
  # freq_keywords_in_A under "Keywords Freq. in M". Confirm which group "A" denotes.
  labeled_A = groups_count[disinformer_label]
  labeled_B = groups_count[mitigator_label]

  # NOTE(review): the connection is opened with the base `config` (not `curr_config`) and
  # is not closed in this loop. Confirm both are intended.
  graph = Graph(config)
  graph.connect()
  # Labeled nodes whose content matches none of the keywords.
  graph.cursor.execute("""
      select count(*) 
      from nodes natural join nodes_info 
      where grp is not null 
        and not content_vector @@ to_tsquery('english', %s);
  """, (keyword,))
  nodes_with_no_datavoids = graph.cursor.fetchone()[0]
  # All labeled nodes.
  graph.cursor.execute("""
      select count(*) 
      from nodes 
      where grp is not null;
  """)
  total_labeled = graph.cursor.fetchone()[0]
  # Stored as a fraction of the labeled nodes; undefined (NaN) when nothing is labeled.
  if total_labeled:
    nodes_with_no_datavoids = nodes_with_no_datavoids / total_labeled
  else:
    nodes_with_no_datavoids = float('nan')

  datavoids_summary.append({
    "topic": topic,
    "keywords": ", ".join(curr_config["datavoids"][0]),
    "labeled_A": labeled_A,
    "labeled_B": labeled_B,
    "freq_keywords_in_A": freq_keywords_in_A,
    "freq_keywords_in_B": freq_keywords_in_B,
    "freq_keywords_in_ungrouped": freq_keywords_in_ungrouped,
    "ratio": ratio,
    "nodes_with_no_datavoids": nodes_with_no_datavoids
  })


In [None]:
# Display the list of per-topic summary dicts as the cell output.
datavoids_summary

In [None]:
from commons import *

# Render datavoids_summary as a LaTeX table* and print it for copy-pasting.

# Characters with a special meaning in LaTeX, mapped to their escaped text form.
_LATEX_ESCAPES = str.maketrans({
    '\\': r'\textbackslash{}',
    '&': r'\&',
    '%': r'\%',
    '$': r'\$',
    '#': r'\#',
    '_': r'\_',
    '{': r'\{',
    '}': r'\}',
    '~': r'\textasciitilde{}',
    '^': r'\textasciicircum{}',
})


def _latex_escape(text):
    """Escape LaTeX special characters in free text placed inside a table cell."""
    return str(text).translate(_LATEX_ESCAPES)


table_lines = [
    "\\begin{table*}[ht!]",
    "\\centering",
    "\\begin{tabular}{|p{0.1\\textwidth}|p{0.3\\textwidth}|p{0.1\\textwidth}|p{0.1\\textwidth}|p{0.1\\textwidth}|p{0.1\\textwidth}|p{0.1\\textwidth}|}",
    "\\hline",
    # The sixth column header used to read "in B"; every other column uses M / D / U.
    "Topic Name & Keywords & Labeled M & Labeled D & Keywords Freq. in M & Keywords Freq. in D & Keywords Freq. in U \\\\ \\hline",
]

for s in datavoids_summary:
    # NOTE(review): labeled_A / labeled_B are printed under "Labeled M" / "Labeled D" in
    # that order; confirm this matches how the summary cell assigns them.
    formatted_topic = [
      _latex_escape(s["topic"]),
      _latex_escape(s["keywords"]),
      str(s["labeled_A"]),
      str(s["labeled_B"]),
      float_to_string(s["freq_keywords_in_A"], precision=3),
      float_to_string(s["freq_keywords_in_B"], precision=3),
      float_to_string(s["freq_keywords_in_ungrouped"], precision=10)
    ]
    table_lines.append(" & ".join(formatted_topic) + " \\\\")

table_lines += [
    "\\hline",
    "\\end{tabular}",
    "\\label{tab:webpagedataset}",
    "\\end{table*}",
]
latex_table = "\n".join(table_lines)

print(latex_table)


\begin{table*}[ht!]
\centering
\begin{tabular}{|p{0.1\textwidth}|p{0.3\textwidth}|p{0.1\textwidth}|p{0.1\textwidth}|p{0.1\textwidth}|p{0.1\textwidth}|p{0.1\textwidth}|}
\hline
Topic Name & Keywords & Labeled M & Labeled D & Keywords Freq. in M & Keywords Freq. in B & Keywords Freq. in U \\ \hline
Declarative vs Procedural Language & lisp, semantics, javascript, xml, syntaxhighlight, runtime, syntax, implementations, compiler, imperative & 32 & 39 & 0.336 & 0.355 & 0.000568099 \\
Optimism vs Pessimism & nihilism, affective, pessimism, depressive, adolescents, innate, pessimistic, coping, psychologists, optimism & 119 & 133 & 0.161 & 0.201 & 0.0005016894 \\
Rationalism vs Empiricism & rationalism, descartes, leibniz, gottfried, philosophies, metaphysics, truths, metaphysical & 58 & 117 & 0.302 & 0.376 & 0.0004273569 \\
Classical vs Keynesian Economics & macroeconomics, keynesian, macroeconomic, laissez, maynard, keynes, faire, liberalism, economists, friedman & 240 & 90 & 0.319 & 0.33 & 0.0004937472 \\
\hline
\end{tabular}
\label{tab:webpagedataset}
\end{table*}

In [None]:
# Print, per topic, the fraction of labeled nodes that match none of the datavoid keywords.
# print() has no `precision` keyword (passing one raises TypeError), so the ten-decimal
# formatting is applied to the value itself.
for s in datavoids_summary:
  print(s["topic"], f'{s["nodes_with_no_datavoids"]:.10f}')