This notebook generates tf-idf scores more efficiently by using scikit-learn.

In [1]:
import requests
# import json
# import math
import re
# import nltk
import bz2
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# nltk.download('stopwords')
# from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Windows\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load and Clean Data

In [2]:
# Sample workset

# workset = {
#     "osu.32435051242188": "Truman",
#     "osu.32437000791067": "Hoover",
#     "uc1.31210014062952": "Clinton",
#     "osu.32435057459778": "Kennedy",
#     "osu.32435030026934": "Nixon",
# }

In [3]:
def post_workset(payload, timeout=None):
  '''Post a workset to the HTRC Extracted Features API and return the workset ID.

  Args:
    payload: Request body. A str or bytes value is sent unchanged. Any other
      iterable (for example the keys of a {volume_id: label} dict) is joined
      into a single newline-separated string before sending.
      NOTE(review): one volume ID per line is assumed to be the plain-text
      format the endpoint expects; confirm against the API documentation.
    timeout: Seconds to wait for the server, forwarded to requests.post.
      None (the default) waits indefinitely.

  Returns:
    The value stored under ['data']['id'] in the JSON response.

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
    KeyError: If the JSON response has no ['data']['id'] entry.
  '''
  url = "https://data.htrc.illinois.edu/ef-api/worksets"

  headers = {
      "Content-Type": "text/plain",
      "Accept": "application/json"
  }

  # Collapse list-like input (list, tuple, dict keys, ...) into one text body
  if not isinstance(payload, (str, bytes, bytearray)):
    payload = "\n".join(str(volume_id) for volume_id in payload)

  response = requests.post(url, data=payload, headers=headers, timeout=timeout)
  # Surface HTTP errors here rather than as a confusing KeyError further down
  response.raise_for_status()
  response_json = response.json()

  workset_id = response_json['data']['id']

  return workset_id

In [4]:
def get_workset(workset_id, save_data=False, file_name='USA Presidential Papers', timeout=None):
  '''Retrieve the aggregated volume data for a workset ID from the HTRC Extracted Features API.

  Args:
    workset_id: ID inserted into the request URL.
    save_data: When True and the request succeeded, the parsed response is
      pickled into a bz2-compressed file named file_name + '.pbz2'.
    file_name: Output path without the '.pbz2' extension.
    timeout: Seconds to wait for the server, forwarded to requests.get.
      None (the default) waits indefinitely.

  Returns:
    The parsed JSON response when its "code" field equals 200, otherwise None.
  '''
  api_get_volume = f"https://data.htrc.illinois.edu/ef-api/worksets/{workset_id}/volumes/aggregated"

  response = requests.get(api_get_volume, timeout=timeout)
  workset_data = response.json()

  # A body without a "code" field is treated as a failure instead of raising KeyError
  status_code = workset_data.get("code") if isinstance(workset_data, dict) else None
  if status_code != 200:
    print(f'Failed to retrieve data: {status_code}')
    return None

  if save_data:
    with bz2.BZ2File(file_name + '.pbz2', 'wb') as f:
      pickle.dump(workset_data, f)

  print("Successfully retrieved data from API")
  return workset_data

In [5]:
def get_workset_volumes_aggregated(workset_data):
  '''Map every volume ID in the workset data to that volume's body features.

  Args:
    workset_data: Dict with a 'data' list; every entry must provide
      ['features']['body'] and a volume identifier.

  Returns:
    Dict of the form {volume_id: volume['features']['body']}, in the order
    the volumes appear in workset_data['data'].

  Raises:
    KeyError: If an entry has no identifier or no ['features']['body'].
  '''
  workset_total_wc = {} # {volume_id:{word:count}}
  for volume in workset_data['data']:
    # NOTE(review): the identifier is assumed to be stored under 'htid'
    # (falling back to 'id'); confirm against an actual API response.
    volume_id = volume['htid'] if 'htid' in volume else volume['id']
    workset_total_wc[str(volume_id)] = volume['features']['body']

  return workset_total_wc

In [6]:
def load_workset_data(file_name='USA Presidential Papers.pbz2'):
  '''Read a bz2-compressed pickle file and return the object stored in it.'''
  # Unpickling can execute arbitrary code: only load files you created yourself
  with bz2.open(file_name, 'rb') as compressed_file:
    return pickle.load(compressed_file)

In [7]:
# To fetch the workset from the API instead of reading the saved file:
# workset_id = post_workset(workset.keys())
# workset_data = get_workset('664e9a5938000014012eeda9', save_data=False)

# Read the previously saved workset, then map each volume ID to its body word counts
workset_data = load_workset_data(file_name='USA Presidential Papers.pbz2')
workset_wc = get_workset_volumes_aggregated(workset_data=workset_data)

In [8]:
# Combine word counts for total workset and clean data
MIN_TERM_COUNT = 5  # rows (one term in one volume) with a smaller count are dropped below
non_alpha = re.compile(r'[^a-z]')  # any character outside a-z, e.g. the hyphen in "zero-sum"

wc_dict = {}  # {volume_id: {casefolded_term: count}}
for volume, raw_counts in workset_wc.items():
  merged_counts = {}
  for term, count in raw_counts.items():
    lower_term = term.casefold()  # case-insensitive comparison; more aggressive than .lower()
    if non_alpha.search(lower_term):
      continue  # skip terms containing digits, hyphens, apostrophes, etc.
    # Case variants of the same word ("The", "the") are summed into one entry
    merged_counts[lower_term] = merged_counts.get(lower_term, 0) + count
  wc_dict[volume] = merged_counts

# Flatten the nested dict into three parallel lists, one entry per (volume, term) pair
volume_id_list = []
term_list = []
count_list = []
for volume, term_counts in wc_dict.items():
  volume_id_list.extend([volume] * len(term_counts))
  term_list.extend(term_counts.keys())
  count_list.extend(term_counts.values())
corpus_df = pd.DataFrame({'term': term_list, 'volume_id': volume_id_list, 'count': count_list})

# Drop terms with less than 5 occurrences. These are likely OCR errors.
# .copy() makes the result an independent frame, so columns can be added to it
# later without pandas raising SettingWithCopyWarning.
corpus_df = corpus_df[corpus_df['count'] >= MIN_TERM_COUNT].copy()

In [9]:
# Display the cleaned corpus with terms in reverse alphabetical order
corpus_df.sort_values('term', ascending=False)

Unnamed: 0,term,volume_id,count
8864296,zz,uiug.30112005184087,9
5008145,zyuganov,uc1.31210011533856,5
2427113,zyuganov,osu.32435057271629,5
9741321,zyuganov,uiug.30112031999763,5
4398413,zyuganov,osu.32437010507941,5
...,...,...,...
6085188,a,uc1.31210013217334,12469
6597607,a,uc1.31210021238876,11504
1652341,a,osu.32435030026611,10012
4173500,a,osu.32437000790994,9126


# Scikit-learn TF-IDF

In [10]:
# Produce a string of all text in each volume and store in list of strings workset_wc_concat.
# Every term is repeated `count` times so the vectorizer can recover term frequencies.
# assign() returns a new frame, so no column is written into a possibly filtered slice.
corpus_df = corpus_df.assign(strings=(corpus_df['term'] + ' ')*corpus_df['count'])

# A single groupby pass replaces one boolean filter over the whole frame per volume.
# sort=False keeps the volumes in order of first appearance, which is the same order
# as corpus_df['volume_id'].unique(), so positions in the list still line up with it.
workset_wc_concat = (
    corpus_df.groupby('volume_id', sort=False)['strings']
    .agg(' '.join)
    .tolist()
)

In [11]:
# TfidfVectorizer reference:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
vectorizer = TfidfVectorizer(
    sublinear_tf=True,  # scale term frequency logarithmically: tf becomes 1 + log(tf)
    min_df=.02,         # a float is a proportion: ignore terms found in fewer than 2% of the documents
    max_df=0.95,        # ignore terms found in more than 95% of the documents
)

`denselist` is a list of lists, with one sub-list per volume. Each sub-list is aligned with `feature_names`: the value at position *i* is the tf-idf score of the word `feature_names[i]` in that volume. If a word in `feature_names` does not appear in the volume, its score is 0.

In [12]:
# Learn the vocabulary from the per-volume strings and compute their tf-idf matrix
vectors = vectorizer.fit_transform(raw_documents=workset_wc_concat)
# Vocabulary terms, in the same order as the columns of `vectors`
feature_names = vectorizer.get_feature_names_out()

# Expand the sparse result into nested Python lists: one inner list per volume
dense = vectors.todense()
denselist = dense.tolist()

In [13]:
# Validation
first_volume_id = corpus_df['volume_id'].unique()[0]
print(len(denselist)) # This should be the same as the number of volumes in the workset
print(first_volume_id) # This should be a volume_id in the workset

# Enforce the two expectations above instead of relying on a visual check
assert len(denselist) == corpus_df['volume_id'].nunique(), "expected one tf-idf row per volume"
assert first_volume_id in workset_wc, "volume_id not found in the workset"

653
ien.35556003701398


In [74]:
# Top 15 terms by tf-idf score for the volume at position 1 of denselist
volume_scores = pd.DataFrame({"term": feature_names, "tf-idf": denselist[1]})
volume_scores.sort_values(by='tf-idf', ascending=False).head(15)

Unnamed: 0,term,tf-idf
9614,reconversion,0.086422
5809,ianuary,0.072668
6359,iuly,0.072035
6360,iune,0.072018
8359,opa,0.071682
12370,unrra,0.07113
6296,iohn,0.069717
3119,demobilization,0.068821
8664,pauley,0.068446
4357,factfinding,0.067811


In [14]:
# Turn denselist into a table: one column per volume, one row per vocabulary term

# Key each volume's score list by its position so it becomes a column
dict_from_denselist = dict(enumerate(denselist))
tfidf_df = pd.DataFrame(dict_from_denselist)
# Label the rows with the terms and the columns with the volume IDs
tfidf_df.index = feature_names
tfidf_df.columns = corpus_df['volume_id'].unique()

In [15]:
# (rows, columns): rows are indexed by feature_names, columns are labelled with volume IDs
tfidf_df.shape

(16719, 653)

In [16]:
# Write the tf-idf table to disk as a zip-compressed CSV
tfidf_output_path = 'USA Presidential Papers tf-idf.csv.zip'
tfidf_df.to_csv(tfidf_output_path, compression='zip')