# Developing Letter Deviation-Based Features for the CER Prediction Model

This notebook creates several new features, based on deviations of observed letter frequencies from their expected values, for use in the CER prediction model.

In [1]:
import pandas as pd
import os
from collections import Counter
import numpy as np

In [34]:
# Load document data: the per-document table that the new feature columns are added to.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
df = pd.read_csv("C:/Users/larak/OneDrive/Documents/History-Lab/ddo/OCR paper/readable_v5.csv")

In [5]:
df.head()

Unnamed: 0,file,lex_ocr,fk_ocr,flesch_ocr,lex_gold,fk_gold,flesch_gold,CER,percent_misspelled,count_thes,...,count_fr,count_nle,count_ef,count_drv,count_va,count_buddhist,count_su,percent_alphabetic,percent_numeric,percent_punctuation
0,GALE_CK2349346194.txt,0.561769,4.628641,82.944254,0.446995,14.917718,36.174668,0.486949,25.955414,1,...,0,0,1,0,0,0,1,66.065496,3.607024,9.871856
1,GALE_CK2349347998.txt,0.482604,8.833292,64.280919,0.357855,15.745466,33.331225,0.212453,16.981132,1,...,0,0,0,0,0,0,0,73.832162,1.012815,7.895825
2,GALE_CK2349354090.txt,0.395423,12.930099,38.5738,0.341304,14.649109,29.873484,0.062577,5.730028,1,...,0,0,0,0,0,0,0,79.619317,0.698196,3.640595
3,GALE_CK2349354764.txt,0.497585,5.614244,82.566375,0.247114,8.993146,57.297364,0.362739,35.954344,1,...,1,0,0,0,0,0,1,65.915521,6.56831,6.901778
4,GALE_CK2349355800.txt,0.708661,4.309904,76.668466,0.578313,13.401156,33.573502,0.230804,27.987421,1,...,0,0,0,0,0,0,0,71.580289,1.869159,8.708581


In [6]:
# Directory containing documents
# (presumably the text files named in df['file'] — verify; the path is machine-specific)
doc_dir = "C:/Users/larak/OneDrive/Documents/History-Lab/ddo/OCR paper/ddo"

In [30]:
# Read the reference letter distribution and turn it into a {letter: frequency} lookup.
# Although the column is called 'percent_freq', the displayed values are fractions
# (e.g. 'e' -> 0.1116), not percentages.
expected_frequencies_df = pd.read_csv('C:\\Users\\larak\\OneDrive\\Documents\\GitHub\\HL-Spring-24\\analyses\\expected_letter_distribution.csv')
expected_frequencies = {
    letter: frequency
    for letter, frequency in zip(
        expected_frequencies_df['letter'],
        expected_frequencies_df['percent_freq'],
    )
}

In [14]:
expected_frequencies

{'e': 0.1116,
 'a': 0.08496,
 'r': 0.0758,
 'i': 0.07544,
 'o': 0.07163,
 't': 0.0695,
 'n': 0.06654,
 's': 0.05735,
 'l': 0.05489,
 'c': 0.04538,
 'u': 0.0363,
 'd': 0.03384,
 'p': 0.03167,
 'm': 0.03012,
 'h': 0.03003,
 'g': 0.0247,
 'b': 0.0207,
 'f': 0.01812,
 'y': 0.01777,
 'w': 0.01289,
 'k': 0.01101,
 'v': 0.01007,
 'x': 0.0029,
 'z': 0.00272,
 'j': 0.00196,
 'q': 0.00196}

In [12]:
def calculate_character_frequencies(text):
    """Return the relative frequency of each letter in ``text``.

    Letters are tallied case-insensitively (the text is lower-cased before
    counting). Anything for which ``str.isalpha`` is true counts as a letter,
    so non-ASCII letters are included in both the tallies and the denominator.

    Parameters
    ----------
    text : str
        The document text to analyse.

    Returns
    -------
    dict
        Maps each lower-cased letter that occurs in ``text`` to its count
        divided by the total number of alphabetic characters. Empty when the
        text contains no letters.
    """
    # Denominator: number of alphabetic characters in the text as given.
    letter_total = 0
    for ch in text:
        if ch.isalpha():
            letter_total += 1

    # Numerators: per-letter tallies on the lower-cased text.
    tallies = Counter(filter(str.isalpha, text.lower()))

    if not letter_total:
        # Avoid dividing by zero when the text has no letters.
        return {letter: 0 for letter in tallies}
    return {letter: tally / letter_total for letter, tally in tallies.items()}

In [15]:
# Letter pairs scored together in the paired deviation analysis
# (presumably letters that are easily substituted for one another — TODO confirm).
# Each two-character string below becomes one (letter1, letter2) tuple.
letter_pairs = [tuple(pair) for pair in ("ce", "il", "oa", "vu", "sz",
                                         "gq", "bh", "nm", "rn", "db")]

In [22]:
def calculate_squared_deviations(observed_frequencies, expected=None):
    """Return the sum of squared deviations of observed from expected letter frequencies.

    Parameters
    ----------
    observed_frequencies : dict
        Maps a letter to its observed relative frequency. Letters that are
        missing are treated as having frequency 0.
    expected : dict, optional
        Maps a letter to its expected relative frequency. When omitted, the
        notebook-level ``expected_frequencies`` table is used, so existing
        one-argument calls behave exactly as before.

    Returns
    -------
    number
        Sum of ``(observed - expected) ** 2`` over the letters of ``expected``.
        Letters that appear only in ``observed_frequencies`` do not contribute.
    """
    if expected is None:
        # Fall back to the table loaded elsewhere in the notebook.
        expected = expected_frequencies
    return sum(
        (observed_frequencies.get(letter, 0) - reference) ** 2
        for letter, reference in expected.items()
    )

def calculate_absolute_deviations(observed_frequencies, expected=None):
    """Return the sum of absolute deviations of observed from expected letter frequencies.

    Parameters
    ----------
    observed_frequencies : dict
        Maps a letter to its observed relative frequency. Letters that are
        missing are treated as having frequency 0.
    expected : dict, optional
        Maps a letter to its expected relative frequency. When omitted, the
        notebook-level ``expected_frequencies`` table is used, so existing
        one-argument calls behave exactly as before.

    Returns
    -------
    number
        Sum of ``abs(observed - expected)`` over the letters of ``expected``.
        Letters that appear only in ``observed_frequencies`` do not contribute.
    """
    if expected is None:
        # Fall back to the table loaded elsewhere in the notebook.
        expected = expected_frequencies
    return sum(
        abs(observed_frequencies.get(letter, 0) - reference)
        for letter, reference in expected.items()
    )

def calculate_paired_analysis(observed_frequencies, paired_letters, expected=None):
    """Score how far each letter pair jointly deviates from its expected frequencies.

    For every ``(letter1, letter2)`` pair, the absolute deviations of the two
    letters from their expected frequencies are added together, the sum is
    squared, and the squared values are accumulated over all pairs.

    Parameters
    ----------
    observed_frequencies : dict
        Maps a letter to its observed relative frequency. Letters that are
        missing are treated as having frequency 0.
    paired_letters : iterable of (str, str)
        The letter pairs to score.
    expected : dict, optional
        Maps a letter to its expected relative frequency; letters that are
        missing are treated as 0. When omitted, the notebook-level
        ``expected_frequencies`` table is used, so existing two-argument calls
        behave exactly as before.

    Returns
    -------
    number
        Sum over pairs of ``(|obs1 - exp1| + |obs2 - exp2|) ** 2``; 0 when
        ``paired_letters`` is empty.
    """
    if expected is None:
        # Fall back to the table loaded elsewhere in the notebook.
        expected = expected_frequencies

    def absolute_deviation(letter):
        # Distance between a letter's observed and expected frequency.
        return abs(observed_frequencies.get(letter, 0) - expected.get(letter, 0))

    return sum(
        (absolute_deviation(letter1) + absolute_deviation(letter2)) ** 2
        for letter1, letter2 in paired_letters
    )

# Build one (squared_dev, abs_dev, paired_score) tuple per document, in the same
# order as the rows of df so the results can be attached as columns afterwards.
features = []
for filename in df['file']:
    # Reuse the notebook's doc_dir setting instead of repeating the literal path.
    filepath = os.path.join(doc_dir, filename)
    with open(filepath, 'r', encoding='utf-8') as file:
        text = file.read()

    # The file is closed at this point; the feature computation only needs the text.
    observed_frequencies = calculate_character_frequencies(text)
    squared_dev = calculate_squared_deviations(observed_frequencies)
    abs_dev = calculate_absolute_deviations(observed_frequencies)
    paired_score = calculate_paired_analysis(observed_frequencies, letter_pairs)
    features.append((squared_dev, abs_dev, paired_score))

In [35]:
# Add features to DataFrame: transpose the per-document tuples into one sequence
# per feature, then attach each sequence as its own column.
squared_column, absolute_column, paired_column = zip(*features)
df['squared_letter_devs'] = squared_column
df['absolute_letter_devs'] = absolute_column
df['substitution_hhi'] = paired_column

In [36]:
df

Unnamed: 0,file,lex_ocr,fk_ocr,flesch_ocr,lex_gold,fk_gold,flesch_gold,CER,percent_misspelled,count_thes,...,count_drv,count_va,count_buddhist,count_su,percent_alphabetic,percent_numeric,percent_punctuation,squared_letter_devs,absolute_letter_devs,substitution_hhi
0,GALE_CK2349346194.txt,0.561769,4.628641,82.944254,0.446995,14.917718,36.174668,0.486949,25.955414,1,...,0,0,0,1,66.065496,3.607024,9.871856,0.002616,0.184889,0.001836
1,GALE_CK2349347998.txt,0.482604,8.833292,64.280919,0.357855,15.745466,33.331225,0.212453,16.981132,1,...,0,0,0,0,73.832162,1.012815,7.895825,0.004004,0.209771,0.002886
2,GALE_CK2349354090.txt,0.395423,12.930099,38.573800,0.341304,14.649109,29.873484,0.062577,5.730028,1,...,0,0,0,0,79.619317,0.698196,3.640595,0.003797,0.231380,0.003848
3,GALE_CK2349354764.txt,0.497585,5.614244,82.566375,0.247114,8.993146,57.297364,0.362739,35.954344,1,...,0,0,0,1,65.915521,6.568310,6.901778,0.004556,0.270927,0.006674
4,GALE_CK2349355800.txt,0.708661,4.309904,76.668466,0.578313,13.401156,33.573502,0.230804,27.987421,1,...,0,0,0,0,71.580289,1.869159,8.708581,0.002436,0.182133,0.003126
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182,GALE_CK2349543544.txt,0.635616,8.894206,58.788655,0.587500,9.489297,59.974125,0.215506,8.206687,1,...,0,0,0,0,73.627845,2.588130,7.987506,0.001736,0.166182,0.002337
183,GALE_CK2349549783.txt,0.599455,12.626183,44.283024,0.576441,12.090331,46.437895,0.144635,4.829545,1,...,0,0,0,0,79.442971,0.486295,3.404067,0.002076,0.186388,0.002666
184,GALE_CK2349566695.txt,0.577465,12.138952,48.940389,0.370395,10.607638,48.356076,0.312703,27.250608,2,...,0,0,0,0,78.170732,1.512195,4.097561,0.002699,0.179589,0.002500
185,GALE_CK2349567370.txt,0.692857,8.586829,62.009890,0.505800,17.256996,23.874506,0.379322,31.908832,1,...,0,0,0,0,70.191182,1.794772,10.846664,0.002347,0.159251,0.001480


In [37]:
# Save the table, including the new feature columns, to CSV.
# NOTE(review): this is the same path the data was loaded from at the top of the
# notebook, so the input file is overwritten in place rather than written to a new file.
# index=False keeps the DataFrame's row index out of the written file.
df.to_csv("C:/Users/larak/OneDrive/Documents/History-Lab/ddo/OCR paper/readable_v5.csv", index=False)