<a href="https://colab.research.google.com/github/jacomyma/mapping-controversies/blob/main/notebooks/Words_and_documents_with_text_to_document_list_with_words.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🍒 Words and documents with text to document list with words

**Inputs:**
* a list of documents with their text content (CSV)
* a small list of words, like a dozen (CSV)

**Outputs:**
* a list of documents with words as columns (CSV)
* a list of document-word pairs (CSV)
* a bipartite network of documents and words (GEXF)

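This script tells you which words are in which documents. Each word becomes a column, which is why you should only use a few of them. You may have many documents, though. Words can be expressions (e.g., named entities). Matching is case-insensitive and counts every occurrence of a word in the text, including occurrences inside longer words.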

If you have many words and just want the network, check [this notebook](https://colab.research.google.com/github/jacomyma/mapping-controversies/blob/main/notebooks/Words_and_documents_with_text_to_network.ipynb).

## How to use

1. Put your input files in the same folder as the notebook
1. Edit the settings if needed. CHECK THE COLUMN NAMES!
1. Run all the cells
1. Take ALL the output files from the notebook folder

# SETTINGS

In [None]:
# --- Input file 1: the documents (CSV) ---
input_file_documents = "documents.csv"
# Name of the column that holds the text of each document
documents_text_column = "Text"
# Name of the column that holds the document name or ID
# (it is used as the node ID of the document in the network)
documents_id_column = "Article"

# --- Input file 2: the small list of words (CSV) ---
input_file_words = "words-small-list.csv"
# Name of the column that holds the words
words_text_column = "text"

# If True, documents that contain none of the words are left out of all outputs
discard_unrelated_documents = True

# --- Output files ---
# Documents with one count column per word (CSV)
output_file_documents = "documents-with-terms.csv"
# One row per document-word pair that actually occurs (CSV)
output_file_pairs = "terms-and-documents.csv"
# Network of documents and words (GEXF)
output_file_network = "terms-document-network.gexf"


# SCRIPT

### Install and import libraries
This notebook relies on existing libraries, which are installed (if needed) and imported here.
You can ignore the output of this cell.

In [None]:
# Install (if needed)
!pip install pandas
!pip install spacy
!pip install networkx

# Import
import csv
import pandas as pd
import networkx as nx

print("Done.")

### Read the input file 1 (documents)

In [None]:
# Load the documents table. dtype=object turns off type inference and
# keep_default_na=False leaves empty cells as empty strings instead of NaN.
doc_df = pd.read_csv(
  input_file_documents,
  encoding='utf8',
  quotechar='"',
  doublequote=True,
  quoting=csv.QUOTE_NONNUMERIC,
  dtype=object,
  keep_default_na=False,
)
print("Preview of the document list:")
doc_df

### Read the input file 2 (words)

In [None]:
# Load the word list. dtype=object turns off type inference and
# keep_default_na=False leaves empty cells as empty strings instead of NaN.
word_df = pd.read_csv(
  input_file_words,
  encoding='utf8',
  quotechar='"',
  doublequote=True,
  quoting=csv.QUOTE_NONNUMERIC,
  dtype=object,
  keep_default_na=False,
)
print("Preview of the word list:")
word_df

### Wrangle the data

In [None]:
# Get the words in file order, without duplicates and without blank entries.
# Blank entries must be skipped: counting an empty string in a text returns
# len(text) + 1, so every document would "contain" it with a bogus count.
# Keeping the file order (instead of iterating a set) makes the order of the
# word columns and of the pair rows the same on every run.
word_list = [
  word for word in dict.fromkeys(word_df[words_text_column])
  if word.strip()
]
words = set(word_list)
# Lowercase each word once, instead of once per document
lowered_words = {word: word.lower() for word in word_list}

# Init data for output
document_list = []
pair_list = []
network_doc_set = set()
network_word_set = set()
network_edge_list = []

# Search words in documents
for index, row in doc_df.iterrows():
  text = row[documents_text_column].lower()
  # Case-insensitive count of the (non-overlapping) occurrences of each word
  count_per_word = {
    word: text.count(lowered) for word, lowered in lowered_words.items()
  }
  # Only the words that actually occur in this document
  found = {word: count for word, count in count_per_word.items() if count > 0}

  # Keep the document if it contains a word, or if unrelated documents are kept
  if found or not discard_unrelated_documents:
    # output 1: the document row plus one count column per word
    document_list.append({**row, **count_per_word})
    # output 2: one row per word found in the document
    for word, count in found.items():
      pair_list.append({**row, 'term': word, 'term-count': count})
    # output 3: the document node, and one weighted edge per word found
    doc_id = row[documents_id_column]
    network_doc_set.add(doc_id)
    for word, count in found.items():
      network_word_set.add(word)
      network_edge_list.append((doc_id, word, {"count": count}))



### Make output 1 (documents with words as columns)

In [None]:
# Build the documents table: the input columns plus one count column per word
output_doc_df = pd.DataFrame(document_list)
# The text itself is left out of the output. errors='ignore' prevents a
# KeyError when no document was kept: the table is then empty and has no
# columns at all, so there is no text column to drop.
output_doc_df = output_doc_df.drop(columns=[documents_text_column], errors='ignore')
print("Done.")
print("Preview of the document list:")
output_doc_df

### Make output 2 (document-word pairs)

In [None]:
# Build the pairs table: one row per document-word pair that occurs
output_pair_df = pd.DataFrame(pair_list)
# The text itself is left out of the output. errors='ignore' prevents a
# KeyError when no word was found in any document: the table is then empty
# and has no columns at all, so there is no text column to drop.
output_pair_df = output_pair_df.drop(columns=[documents_text_column], errors='ignore')
print("Done.")
print("Preview of the pair list:")
output_pair_df

### Save the CSVs

In [None]:
# Write both tables as UTF-8 CSV files, without the pandas index column.
# A failed write is reported but does not stop the notebook.
# Note: the backslash in the messages is written "\\" because "\ " is an
# invalid escape sequence (a SyntaxWarning in recent Python versions).
# The printed text is unchanged.
try:
  output_doc_df.to_csv(output_file_documents, index = False, encoding='utf-8')
except IOError:
  print("/!\\ Error while writing the documents output file")

try:
  output_pair_df.to_csv(output_file_pairs, index = False, encoding='utf-8')
except IOError:
  print("/!\\ Error while writing the pairs output file")
print("Done.")

### Make and save output 3 (network)

In [None]:
# Documents kept during the word search become 'document' nodes. They carry
# every column of the documents file except the text itself.
doc_df_no_text = doc_df.drop(columns=[documents_text_column])
nodes = [
  (row[documents_id_column], {**row, 'label': row[documents_id_column], 'type': 'document'})
  for _, row in doc_df_no_text.iterrows()
  if row[documents_id_column] in network_doc_set
]

# Words found in at least one kept document become 'term' nodes. They carry
# every column of the words file.
nodes.extend(
  (row[words_text_column], {**row, 'label': row[words_text_column], 'type': 'term'})
  for _, row in word_df.iterrows()
  if row[words_text_column] in network_word_set
)

# The edges were already collected during the word search,
# as (document, word, {"count": n}) tuples
edges = network_edge_list

# Assemble the graph and save it as a GEXF file
G = nx.Graph()
G.add_nodes_from(nodes)
G.add_edges_from(edges)
nx.write_gexf(G, output_file_network)
print("Done.")