# Document Term Matrix

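This sample script creates a document-term matrix from your dataset using scikit-learn's text feature extraction functionality (`CountVectorizer`). For datasets that exceed 10,000 documents, the script includes an optional step (commented out by default) that works on a random sample of 10,000 documents instead. The resulting matrix is then turned into a dataframe which can be exported to a .csv file.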

In [1]:
# Importing our required libraries
import os
import pandas as pd
from lxml import etree
from bs4 import BeautifulSoup
import random

# scikit-learn library will generate our document-term matrix
import sklearn
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Replace the path below with the dataset which you would like to use as input for the script
# dataset_directory = 'directory_of_your_dataset'
dataset_directory = '/home/ec2-user/SageMaker/data/Halfmann_Abortion'

# os.listdir returns entries in arbitrary order; sorting makes the order in which
# documents are processed (and therefore the row order of the results) reproducible.
input_files = sorted(os.listdir(dataset_directory))

In [4]:
# Building a document-term matrix can require a lot of memory, and the memory usage of both
# the processing and the resulting dataframe can negatively impact your TDM Studio experience.
# If the dataset is too large, you can work on a random sample of 10000 documents instead by
# uncommenting the block below and removing the final assignment. random.sample raises
# ValueError when the dataset holds fewer than 10000 files; in that case every file is used.
# try:
#    sample_input_files = random.sample(input_files, 10000)
#
# except ValueError:
#    sample_input_files = input_files
#
# Sampling is currently disabled: every file in the dataset is processed.
sample_input_files = input_files

In [5]:
# We define a function to get the text content that we need from the XML articles available in our dataset
def getxmlcontent(root):
    """Return the text of the article body element found under ``root``.

    The first ``HiddenText`` descendant is preferred; if there is none, the
    first ``Text`` descendant is used instead.

    Args:
        root: An XML element exposing ``find`` (here, the root element
            returned by ``lxml.etree``).

    Returns:
        The ``.text`` of the matching element (which may itself be ``None``
        when the element has no leading text), or ``None`` when neither
        element exists.
    """
    for path in ('.//HiddenText', './/Text'):
        node = root.find(path)
        if node is not None:
            return node.text
    return None

In [7]:
# Creating an empty dictionary that maps each article's GOID to {'text': <article text>};
# we can convert this to a dataframe with ease later
article_text = {}

# Parse files and create dictionary of text
for file in sample_input_files:
    try:
        tree = etree.parse(os.path.join(dataset_directory, file))
        root = tree.getroot()

    except (etree.LxmlError, OSError):
        # The file could not be read or is not well-formed XML: report it and move on.
        # Without this `continue`, the code below would silently reuse the root
        # element left over from the previous file.
        print(file)
        continue

    try:
        # We are finding just text from each article.
        # root.find('GOID') is None when the record has no GOID -> AttributeError.
        article_id = root.find('GOID').text
        # getxmlcontent returns None when the record has no text -> TypeError.
        # 'lxml' is named explicitly; it is the parser Beautiful Soup picks by
        # default when lxml is installed, so this only removes the ambiguity.
        text = BeautifulSoup(getxmlcontent(root), 'lxml').get_text()

    except AttributeError:
        print('Attribute Error' + file)
        continue
    except TypeError:
        print('Type Error' + file)
        continue

    # Store the entry only once the text was extracted successfully, so that
    # failed articles do not leave empty rows behind in the dataframe.
    article_text[article_id] = {'text': text}

Type Error848064441.xml
Type Error120040807.xml


In [8]:
# Converting dictionary to dataframe (one row per GOID, with a single 'text' column)
df_text = pd.DataFrame.from_dict(article_text, orient='index')

# Dropping any rows with no text value. dropna() returns a new dataframe rather than
# modifying df_text in place, so the result has to be assigned back.
df_text = df_text.dropna()

# Display the cleaned dataframe as the cell output
df_text

Unnamed: 0,text
1029866795,\n\n\n\n\n\n\nOpposition From Right and Left\n...
1029867267,\n\n\n\n\n\n\nPrescriptions\n\n\nPrescriptions...
1029867287,\n\n\n\n\n\n\nPriest's Son Who Told All Dies a...
1029867639,\n\n\n\n\n\n\nPrescriptions\n\n\nPrescriptions...
1029867641,\n\n\n\n\n\n\nLouisiana Republican Breaks Rank...
...,...
93282529,\n\n\n\n\n\n\nPulling Rank\n\n\nBy John R Lott...
93283115,"\n\n\n\n\n\n\n,Defving Party Leaders, Suozzi S..."
93285300,\n\n\n\n\n\n\nDAVID CARR\n\n\nDAVID CARR\n\n\n...
93285771,"\n\n\n\n\n\n\nA ConservativeEvolves,\n\n\nAndL..."


In [15]:
# Getting document-term matrix using scikit-learn's CountVectorizer.
# min_df = .03 ignores terms that appear in fewer than 3% of the documents, and
# stop_words = 'english' removes scikit-learn's built-in English stop word list.
vectorizer = CountVectorizer(min_df = .03, stop_words = 'english')

# Feeding in list of all text
document_term_matrix = vectorizer.fit_transform(df_text['text'].tolist())

# Getting words (one per column of the matrix).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement, so prefer it when it is available.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()

# Converting to dataframe. toarray() produces a regular numpy array, whereas
# todense() produces the legacy numpy.matrix type.
df_document_term = pd.DataFrame(document_term_matrix.toarray(), columns=features)

In [17]:
# The rows of the document-term matrix are in the same order as the rows of
# df_text, so the index of df_text provides the GOID for each row
GOIDs = df_text.index.tolist()

# Attaching the goids as a column and promoting that column to the index
df_document_term = df_document_term.assign(GOIDs=GOIDs).set_index('GOIDs', drop=True)

In [18]:
# Previewing the first 20 rows of the document-term dataframe
df_document_term.iloc[:20]

Unnamed: 0_level_0,000,10,100,11,12,13,14,15,150,16,...,wrong,wrote,year,years,yes,yesterday,york,young,younger,youth
GOIDs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1029866795,1,1,0,1,1,0,0,0,0,0,...,0,0,5,1,0,0,2,0,0,0
1029867267,0,1,0,0,0,0,0,0,0,0,...,0,0,2,0,1,0,2,0,0,0
1029867287,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,2,0,0,0
1029867639,2,0,0,0,0,0,0,0,0,0,...,0,0,3,0,0,0,0,0,0,0
1029867641,0,0,0,0,0,0,0,1,0,0,...,0,1,5,0,0,0,1,0,0,0
1029867708,0,0,0,0,0,0,0,0,0,0,...,0,3,1,1,0,0,1,1,0,0
1029867767,2,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,1,0,0
1029868200,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1029868298,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1029877006,0,0,0,0,0,0,1,0,0,0,...,0,0,3,1,0,0,2,1,0,0


In [19]:
# Run this cell to write your document-term matrix to a csv file
# (the path is relative to the current working directory)
# WARNING: This operation may take some time
output_path = '../document_term_matrix.csv'
df_document_term.to_csv(output_path)