<a href="https://colab.research.google.com/github/jacomyma/mapping-controversies/blob/main/notebooks/Words_and_documents_with_text_to_network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🍇 Words and documents with text to network

**Inputs:**
* a list of documents with their text content (CSV)
* a list of words (CSV)

**Outputs:**
* a bipartite network of documents and words (GEXF)
* a table of the documents, with one column per word giving how many times that word occurs in each document (CSV)

This script tells you which words are in which documents. Words can be expressions (e.g., named entities).

## How to use

1. Put your input files in the same folder as the notebook
1. Edit the settings if needed. CHECK THE COLUMN NAMES!
1. Run all the cells
1. Take the output files from the notebook folder

# SETTINGS

In [None]:
# Input file 1: documents (CSV file name, as passed to pandas.read_csv)
input_file_documents = "documents.csv"
# Name of the column that contains the text in which the words are searched
documents_text_column = "Text"
# Name of the column that contains the document name or ID.
# This value is used as the node id of the document in the network.
documents_id_column = "Article"

# Input file 2: list of words or expressions to look for (CSV)
input_file_words = "words.csv"
# Name of the column that contains the words.
# Matching is done on lower-cased text, so it is case-insensitive.
words_text_column = "text"

# If True, documents that contain none of the words are left out of the network.
# If False, every document becomes a node, even when it has no edge.
discard_unrelated_documents = True

# Output file: the document-word network (GEXF).
# NOTE: the occurrences table is written separately to 'word_occurences.csv';
# that file name is hard-coded in the last cell.
output_file_network = "terms-document-network.gexf"

# SCRIPT

### Install and import libraries
This notebook draws on existing code.
You can ignore the output.

In [None]:
# Install (if needed)
!pip install pandas
!pip install spacy
!pip install networkx

# Import
import csv
import pandas as pd
import networkx as nx

print("Done.")

### Read the input file 1 (documents)

In [None]:
# Load the documents table; all columns are requested as dtype=object.
doc_df = pd.read_csv(
  input_file_documents,
  encoding='utf8',
  dtype=object,
  quoting=csv.QUOTE_NONNUMERIC,
  quotechar='"',
  doublequote=True,
)
print("Preview of the document list:")
doc_df

### Read the input file 2 (words)

In [None]:
# Load the word list; all columns are requested as dtype=object.
word_df = pd.read_csv(
  input_file_words,
  encoding='utf8',
  dtype=object,
  quoting=csv.QUOTE_NONNUMERIC,
  quotechar='"',
  doublequote=True,
)
print("Preview of the word list:")
word_df

### Wrangle the data

In [None]:
# Get a set of the words.
# Blank cells of the word list are skipped: a missing value (NaN/None) has no
# .lower() method, and an empty string would be "found" in every document
# because str.count("") returns len(text) + 1.
words = set()
for index, row in word_df.iterrows():
  word = row[words_text_column]
  if pd.isna(word) or not str(word).strip():
    continue
  words.add(word)

# Lower-case every word once here rather than once per document.
# Keys are the original words (used as node ids), values are the search strings.
needle_per_word = {word: str(word).lower() for word in words}

# Init data for output
network_doc_set = set()    # ids of the documents kept in the network
network_word_set = set()   # words found in at least one kept document
network_edge_list = []     # (doc_id, word, {"count": n}) tuples

# Search words in documents (case-insensitive substring count)
for index, row in doc_df.iterrows():
  raw_text = row[documents_text_column]
  # A document whose text cell is empty is treated as having no text,
  # instead of crashing on NaN.lower()
  text = "" if pd.isna(raw_text) else str(raw_text).lower()

  # Only the words that actually occur in this document are recorded
  count_per_word = {}
  for word, needle in needle_per_word.items():
    count = text.count(needle)
    if count > 0:
      count_per_word[word] = count

  # Keep the document if it matched something, or if unrelated documents
  # must be kept as well
  if count_per_word or not discard_unrelated_documents:
    doc_id = row[documents_id_column]
    network_doc_set.add(doc_id)
    for word, count in count_per_word.items():
      network_word_set.add(word)
      network_edge_list.append((doc_id, word, {"count": count}))



### Make and save network

In [None]:
# Document attributes: every column of the documents table except the text
doc_df_no_text = doc_df.drop(columns=[documents_text_column])

# Document nodes: one per document kept during the search. All remaining
# columns become node attributes, plus a 'label' and a 'type'.
nodes = [
  (record[documents_id_column], dict(record, label=record[documents_id_column], type='document'))
  for _, record in doc_df_no_text.iterrows()
  if record[documents_id_column] in network_doc_set
]

# Term nodes: one per word found in at least one kept document
nodes.extend(
  (record[words_text_column], dict(record, label=record[words_text_column], type='term'))
  for _, record in word_df.iterrows()
  if record[words_text_column] in network_word_set
)

# Edges were already assembled as (doc_id, word, {"count": n}) tuples
edges = network_edge_list

# Assemble the undirected graph and export it as GEXF
G = nx.Graph()
G.add_nodes_from(nodes)
G.add_edges_from(edges)
nx.write_gexf(G, output_file_network)
print("Done.")

### Make and save occurrences list

In [None]:
# Columns of the output table: the document id, then one column per word
column = ['Id'] + list(words)

# Build one row per document, keyed by document id. Every word count starts
# at 0 and is overwritten by the count carried on the matching edge.
# NOTE(review): rows are derived from the edges only, so a document without
# any matching word never appears in this table, even when
# discard_unrelated_documents is False - confirm this is intended.
rows = dict()
for doc_id, word, attributes in network_edge_list:
  if doc_id not in rows:
    row = dict.fromkeys(column, 0)
    row['Id'] = doc_id
    rows[doc_id] = row
  # Direct key assignment replaces the former scan over all words
  rows[doc_id][word] = attributes['count']

# Create the table in one go from the row dicts. DataFrame.append, used
# previously, was removed in pandas 2.0 and raises AttributeError there.
word_occurences = pd.DataFrame(list(rows.values()), columns=column)
word_occurences.to_csv('word_occurences.csv', index=False)