# Document Term Matrix

This sample script will create a document-term matrix from a sample of your dataset (if it exceeds 10000 documents) using scikit-learn's matrix factorization funcationality. The resulting matrix is then turned into a dataframe which can be exported to a .csv file.

In [1]:
# Importing our required libraries
import os
import pandas as pd
from lxml import etree
from bs4 import BeautifulSoup
import random

# scikit-learn library will generate our document-term matrix
import sklearn
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# importing csv file
org = pd.read_csv("/home/ec2-user/SageMaker/Halfmann_Abortion/organizations.csv")
org_list = org["Unnamed: 0"].tolist()

In [5]:
# Replace the path below with the dataset which you would like to use as input for the script
# dataset_directory = 'directory_of_your_dataset'
dataset_directory = '/home/ec2-user/SageMaker/data/Halfmann_Abortion'
input_files = os.listdir(dataset_directory)

In [6]:
# Topic modeling, in this case matrix factorization can require a lot of memory, and the memory usage of both processing and the resulting 
# dataframe can negatively impact your TDM Studio experience. For this reason, we take a sample of the 
# documents if there are too many documents in the dataset.
# try:
#    sample_input_files = random.sample(input_files, 10000)
    
# except ValueError:
#    sample_input_files = input_files
sample_input_files = input_files

In [7]:
# We define a function to get the text content that we need from the XML articles available in our dataset
def getxmlcontent(root):
    if root.find('.//HiddenText') is not None:
                return(root.find('.//HiddenText').text)zz
        return None

In [8]:
tree = etree.parse(dataset_directory+"/"+sample_input_files[0])
root = tree.getroot()

In [14]:
article_text = {}

article_id = root.find("GOID").text
article_text[article_id] = {}
article_text[article_id]["text"] = BeautifulSoup(getxmlcontent(root)).get_text()

In [19]:
article_text[article_id]["text"]

str

In [52]:
root[1][4].text

'1968-04-29'

In [53]:
root.findall("./Obj/NumericDate")
for elem in root.iter("NumericDate"):
    print(elem.text)

1968-04-29


In [28]:
# to check the content of the article
for elem in root.iter():
    print(elem.text)


   
118200528

      

         
article
Feature
Article
Articles - All Types

         
TV's Quiet Revolution: Censors Giving In

         
Once - Taboo Topics Are Common in Today's Fare Industry Says Shift Reflects a More Liberal Society
1968-04-29
1968-04-29
1968-04-29
Apr 29, 1968
1
       
         
English
       
         
         
            
ENG
English

         
Copyright New York Times Company Apr 29, 1968
       
         
86
       
         

            
89132240

            
1517381

            
43896
       
         

            

               
By ROBERT E. DALLOS

                  
By ROBERT E. DALLOS

         

            

          <html>
            <head>
              <meta name="ValidationSchema" content="http://www.w3.org/2002/08/xhtml/xhtml1-strict.xsd"/>
              <title/>
            </head>
            <body>
              <p>
These lines from recent television programs would not have passed the censors a few years back. Today such risque 

## Full Title: "Title" + "SubTitle"

In [69]:
root.findall("./Title")
for elem in root.iter("Title"):
    print(elem.text)

TV's Quiet Revolution: Censors Giving In
New York Times  (1923-Current file)
New York Times  (1923-Current file)


In [71]:
# Creating a empty dictionary in which to store key/value pairs, we can convert this to a dataframe with ease later
article_text = {}

# Parse files and create dictionary of text
for file in sample_input_files:
    try:
        tree = etree.parse(dataset_directory + "/" + file)
        root = tree.getroot()
        
    except:
        print(file)
        
    try:
        # We are finding just text from each article
        article_id = root.find('GOID').text
        article_text[article_id] = {}
        article_text[article_id]['text'] = BeautifulSoup(getxmlcontent(root)).get_text()
        # get the date
        for elem in root.iter("NumericDate"):
            article_text[article_id]["DATE"] = elem.text
        
    except AttributeError:
        print('Attribute Error' + file)
        continue
    except TypeError:
        print('Type Error' + file)
        continue

Type Error848064441.xml
Type Error120040807.xml


In [72]:
# Converting dictionary to dataframe, and dropping any rows with no text value
df_text = pd.DataFrame.from_dict(article_text, orient='index')
df_text.dropna()

Unnamed: 0,text,DATE
118200528,\n\n\n\n\n\n\nTV's Quiet Revolution: Censors G...,1968-04-29
109085647,"\n\n\n\n\n\n\nHouse, Like Senate, Votes To Ban...",1993-03-12
2463412533,\n\n\n \n\n\n\nCecil Andrus Preservationist In...,2017-08-26
110844154,\n\n\n\n\n\n\nO'Connor Says He'll Drop Some Fo...,1987-01-24
108962491,\n\n\n\n\n\n\nCoalition Government Falls Apart...,1992-11-05
...,...,...
118177575,\n\n\n\n\n\n\nTelevision This Week\n\n\n. . . ...,1967-02-26
108510026,\n\n\n\n\n\n\nPUBLIC & PRIVATE\n\n\nPUBLIC & P...,1990-09-16
118956856,\n\n\n\n\n\n\nABORTION REFORM DEBATED BY A.M.A...,1970-06-23
1621335386,\n\n\n \n\n\n\nFor years doctors 'pregnancy ca...,2011-08-14


In [73]:
# Getting document-term matrix using scikit-learn's CountVectorizer
vectorizer = CountVectorizer(min_df = .03, stop_words = 'english', vocabulary = org_list)

# Feeding in list of all text
document_term_matrix = vectorizer.fit_transform(df_text['text'].tolist())

# Getting words 
features = vectorizer.get_feature_names()

# Converting to dataframe
df_document_term = pd.DataFrame(document_term_matrix.todense(), columns=features)

In [74]:
df_document_term

Unnamed: 0,Senate,Congress,House,The New York Times,Court,the Supreme Court,the White House,Supreme Court,State,Medicaid,...,the House of Represent,gon,SUSAN F. RASKY Special,Pace University,Christian Broadcasting Network,Maugham,La Stampa,Environmental Protection Agency,Condoms,Pacific Fertility
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28242,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28243,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28244,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28245,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [75]:
# Getting a list of goids
GOIDs = df_text.index.tolist()

# Adding goids to document-term matrix
df_document_term['GOIDs'] = GOIDs

# Setting goids as our index
df_document_term.set_index('GOIDs', drop=True, inplace=True)

# Getting a list of dates
dates = df_text["DATE"].tolist()

df_document_term["DATE"] = dates

In [76]:
# Taking a look at the first 20 rows in dataframe
df_document_term.head(20)

Unnamed: 0_level_0,Senate,Congress,House,The New York Times,Court,the Supreme Court,the White House,Supreme Court,State,Medicaid,...,gon,SUSAN F. RASKY Special,Pace University,Christian Broadcasting Network,Maugham,La Stampa,Environmental Protection Agency,Condoms,Pacific Fertility,DATE
GOIDs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
118200528,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1968-04-29
109085647,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1993-03-12
2463412533,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,2017-08-26
110844154,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1987-01-24
108962491,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1992-11-05
118731251,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1970-04-09
109868892,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1998-08-05
109470667,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995-12-24
117734580,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1967-08-12
108601588,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1990-02-11


In [77]:
# Run this cell to convert your document-term matrix into a csv file
# WARNING: This operation may take some time
df_document_term.to_csv('../document_term_matrix_ner_WITHDATES.csv')

In [78]:
import numpy as np
sections = np.array_split(df_document_term,20)
for i in range(20):
    sections[i].to_csv('../' + "withdates" + str(i) + '.csv')