# Misspelling Count Feature Engineering for the CER Prediction Model

Goals:
1) Identify the 150 most frequent misspellings.
2) Count misspelling by document.
3) Add to CSV.

In [11]:
# Imports
import spacy
import pandas as pd
import os
from spellchecker import SpellChecker

In [17]:
nlp = spacy.load('en_core_web_sm')

In [16]:
ddo_path = 'C:/Users/larak/OneDrive/Documents/History-Lab/ddo/OCR paper/ddo'
readable_csv_path = 'C:/Users/larak/OneDrive/Documents/History-Lab/ddo/OCR paper/readable_v3.csv'
save_path = 'C:/Users/larak/OneDrive/Documents/History-Lab/ddo/OCR paper/readable_v4.csv'

In [18]:
def preprocess_ocr(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    ocr_text = "".join(lines[2:])
    return ocr_text

In [123]:
# Get counts of misspellings for both the overall corpus and a document

def calculate_count_misspelled(text: str, overall_counts_dict: dict):

    doc = nlp(text)
    spell = SpellChecker()

    misspelled_counts_dict = {}

    # for efficiency: spell-check each word as we iterate through doc
    # add any misspellings to the count for the overall dictionary representing the corpus
    # add any misspellings to the count for the document's dictionary
    # return both

    for token in doc:
        tok = token.text.lower()
        if spell[token.text] == False and tok.isalpha():
            if tok not in misspelled_counts_dict.keys():
                misspelled_counts_dict[tok] = 1
            else:
                misspelled_counts_dict[tok] += 1
            
            if tok not in overall_counts_dict.keys():
                overall_counts_dict[tok] = 1
            else:
                overall_counts_dict[tok] += 1
    
    overall_counts_dict = dict(sorted(overall_counts_dict.items(), key = lambda item: item[1], reverse=True))
    misspelled_counts_dict = dict(sorted(misspelled_counts_dict.items(), key = lambda item: item[1], reverse=True))

    return (misspelled_counts_dict, overall_counts_dict)

In [111]:
# Sample Usage
filename = "C:/Users/larak/OneDrive/Documents/History-Lab/ddo/OCR paper/ddo/GALE_CK2349005650.txt"
calculate_count_misspelled(preprocess_ocr(filename), {})[0]

{'atcmic': 4,
 'nov': 2,
 'june': 1,
 'determizuation': 1,
 'wa': 1,
 'ummvering': 1,
 'veapons': 1,
 'governmant': 1,
 'detision': 1,
 'governmnt': 1,
 'thes': 1,
 'opsition': 1,
 'testse': 1,
 'pvoposes': 1,
 'immediat': 1,
 'vspons': 1,
 'azma': 1,
 'ents': 1,
 'azw': 1,
 'pmoposed': 1,
 'uspemsion': 1,
 'mould': 1,
 'atanic': 1,
 'aaments': 1,
 'comu': 1,
 'oleg': 1,
 'pveparations': 1,
 'tjme': 1,
 'atietof': 1,
 'igomn': 1,
 'vespows': 1,
 'fo': 1,
 've': 1,
 'estabment': 1,
 'intraoa': 1,
 'comission': 1,
 'fulfilmn': 1,
 'afease': 1,
 'hydrosen': 1,
 'camilsion': 1,
 'mhy': 1,
 'emd': 1,
 'oeneral': 1,
 'govermmut': 1,
 'establishmwb': 1,
 'vniant': 1,
 'mdted': 1,
 'kuzgdom': 1,
 'fulfilimmt': 1,
 'vespons': 1,
 'xrmote': 1,
 'arement': 1,
 'teking': 1,
 'towftd': 1,
 'subc': 1,
 'swaeral': 1,
 'atamic': 1,
 'imdertake': 1,
 'vith': 1,
 'governmentso': 1}

In [122]:
def get_dicts(readable_csv_path, ddo_path):
    df = pd.read_csv(readable_csv_path)
    overall_misspelled_dict = dict() # keys: words, values: counts. Represents misspellings across the entire corpus.
    bydocument_misspelled_dict = dict() # keys: filenames, values: dicts of word counts. Stores misspellings by document.
    
    for filename in df['file']:
        file_path = os.path.join(ddo_path, filename)
        if os.path.exists(file_path):
            ocr_text = preprocess_ocr(file_path)
            misspelled_counts_dict, overall_misspelled_dict = calculate_count_misspelled(ocr_text, overall_counts_dict=overall_misspelled_dict)
            bydocument_misspelled_dict[filename] = misspelled_counts_dict

        else:
            print(f"File {filename} not found in {ddo_path}.")
    
    return overall_misspelled_dict, bydocument_misspelled_dict

overall_misspelled_dict, bydocument_misspelled_dict = get_dicts(readable_csv_path, ddo_path)
print(overall_misspelled_dict)
print(bydocument_misspelled_dict)



In [140]:
bydocument_misspelled_dict['GALE_CK2349346194.txt']['te']

2

In [114]:
# Save the dict of misspellngs to a .pickle

import pickle

with open('corpus_misspelled_counts.pickle', 'wb') as handle:
    pickle.dump(overall_misspelled_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [134]:
# Get the top 150 most common misspellings
top150 = list(overall_misspelled_dict.keys())[:150]
print(top150)

['thes', 'th', 'te', 'ft', 'chinese', 'mr', 'co', 'al', 'wa', 'british', 'oe', 'ot', 'tbe', 'cc', 'ar', 'aw', 'ia', 'sec', 'nato', 'lbj', 'ed', 'nsc', 'tha', 'nd', 'mm', 'viet', 'st', 'ii', 'ussr', 've', 'gvn', 'ing', 'tion', 'september', 'se', 'mw', 'lao', 'vith', 'ae', 'un', 'sm', 'nt', 'il', 'vietnamese', 'amd', 'ith', 'april', 'tm', 'ca', 'bo', 'md', 'hanoi', 'tim', 'june', 'american', 'thi', 'ir', 'washington', 'â', 'korean', 'eo', 'july', 'im', 'wilson', 'ea', 'sa', 'aa', 'tt', 'ws', 'europe', 'mt', 'ou', 'le', 'ie', 'february', 'ee', 'fo', 'cm', 'november', 'ba', 'ri', 'ol', 'latin', 'pr', 'tw', 'na', 'october', 'ent', 'tb', 'br', 'nam', 'mo', 'os', 'io', 'vere', 'january', 'ao', 'ml', 'af', 'japanese', 'ment', 'ly', 'tr', 'bas', 'cr', 'wi', 'sw', 'koreans', 'ci', 'ww', 'ih', 'wt', 'ac', 'ap', 'wil', 'di', 'acheson', 'ther', 'au', 'december', 'ame', 'ht', 'pm', 'dw', 'tht', 'ro', 'vi', 'lt', 'ur', 'ths', 'diem', 'lu', 'mao', 'ec', 'vould', 'mrs', 'israeli', 'ns', 'ew', 'ayub', '

We've determined the top 150 most frequent misspellings across the corpus, and gotten the by-document distributions. Now, add their counts by document to the CSV.

In [151]:
def add_feature_to_csv(readable_csv_path, ddo_path):

    df = pd.read_csv(readable_csv_path)
    bydocument_misspelled_dict = get_dicts(readable_csv_path, ddo_path)[1]
    existing_files = {filename: os.path.join(ddo_path, filename) 
                      for filename in df['file'] if os.path.exists(os.path.join(ddo_path, filename))}

    for misspelling in top150:
        counts = []
        for filename, file_path in existing_files.items():
            
            if os.path.exists(file_path):                
                try:
                    count = bydocument_misspelled_dict[filename][misspelling]
                except:
                    count = 0
            else:
                print(f"File {filename} not found in {ddo_path}.")
            
            counts.append(count)
        
        df["count_" + misspelling] = counts
    
    return df

In [152]:
def main():
    updated_df = add_feature_to_csv(readable_csv_path, ddo_path)
    print(updated_df)
    updated_df.to_csv(save_path, index=False)
    print("Updated CSV saved to:", save_path)

In [153]:
main()

  df["count_" + misspelling] = counts
  df["count_" + misspelling] = counts
  df["count_" + misspelling] = counts
  df["count_" + misspelling] = counts
  df["count_" + misspelling] = counts
  df["count_" + misspelling] = counts
  df["count_" + misspelling] = counts
  df["count_" + misspelling] = counts
  df["count_" + misspelling] = counts
  df["count_" + misspelling] = counts
  df["count_" + misspelling] = counts
  df["count_" + misspelling] = counts
  df["count_" + misspelling] = counts
  df["count_" + misspelling] = counts
  df["count_" + misspelling] = counts
  df["count_" + misspelling] = counts
  df["count_" + misspelling] = counts
  df["count_" + misspelling] = counts
  df["count_" + misspelling] = counts
  df["count_" + misspelling] = counts
  df["count_" + misspelling] = counts
  df["count_" + misspelling] = counts
  df["count_" + misspelling] = counts
  df["count_" + misspelling] = counts
  df["count_" + misspelling] = counts
  df["count_" + misspelling] = counts
  df["count_

     Unnamed: 0                   file   lex_ocr     fk_ocr  flesch_ocr  \
0             0  GALE_CK2349346194.txt  0.561769   4.628641   82.944254   
1             1  GALE_CK2349347998.txt  0.482604   8.833292   64.280919   
2             2  GALE_CK2349354090.txt  0.395423  12.930099   38.573800   
3             3  GALE_CK2349354764.txt  0.497585   5.614244   82.566375   
4             4  GALE_CK2349355800.txt  0.708661   4.309904   76.668466   
..          ...                    ...       ...        ...         ...   
182         182  GALE_CK2349543544.txt  0.635616   8.894206   58.788655   
183         183  GALE_CK2349549783.txt  0.599455  12.626183   44.283024   
184         184  GALE_CK2349566695.txt  0.577465  12.138952   48.940389   
185         185  GALE_CK2349567370.txt  0.692857   8.586829   62.009890   
186         186  GALE_CK2349574679.txt  0.674121   5.783210   76.676944   

     lex_gold    fk_gold  flesch_gold       CER  percent_misspelled  ...  \
0    0.446995  14.91771

  df["count_" + misspelling] = counts
