# Predicting Character Error Rate via ML Models

Data Preparation

In [63]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from itertools import combinations

In [53]:
readable_csv_path = 'C:/Users/larak/OneDrive/Documents/History-Lab/ddo/OCR paper/readable_v3.csv'

In [54]:
df = pd.read_csv(readable_csv_path)

In [55]:
df.head()

Unnamed: 0.1,Unnamed: 0,file,lex_ocr,fk_ocr,flesch_ocr,lex_gold,fk_gold,flesch_gold,CER,percent_misspelled
0,0,GALE_CK2349346194.txt,0.561769,4.628641,82.944254,0.446995,14.917718,36.174668,0.486949,25.955414
1,1,GALE_CK2349347998.txt,0.482604,8.833292,64.280919,0.357855,15.745466,33.331225,0.212453,16.981132
2,2,GALE_CK2349354090.txt,0.395423,12.930099,38.5738,0.341304,14.649109,29.873484,0.062577,5.730028
3,3,GALE_CK2349354764.txt,0.497585,5.614244,82.566375,0.247114,8.993146,57.297364,0.362739,35.954344
4,4,GALE_CK2349355800.txt,0.708661,4.309904,76.668466,0.578313,13.401156,33.573502,0.230804,27.987421


In [10]:
X = df[['file', 'lex_ocr', 'fk_ocr', 'percent_misspelled']]
y = df['CER']

In [12]:
X.head()

Unnamed: 0,file,lex_ocr,fk_ocr,percent_misspelled
0,GALE_CK2349346194.txt,0.561769,4.628641,25.955414
1,GALE_CK2349347998.txt,0.482604,8.833292,16.981132
2,GALE_CK2349354090.txt,0.395423,12.930099,5.730028
3,GALE_CK2349354764.txt,0.497585,5.614244,35.954344
4,GALE_CK2349355800.txt,0.708661,4.309904,27.987421


In [64]:
def linear_regression_cv(df):
    
    # Define the features (X) and target (y)
    features = ['lex_ocr', 'fk_ocr', 'percent_misspelled', 'lex_diversity_misspelled_interaction']
    X = df[list(features)]
    y = df['CER']
    
    # Define a pipeline for scaling and linear regression
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('linear_regression', LinearRegression())
    ])
    
    # Define the cross-validation strategy
    cv = KFold(n_splits=5, shuffle=True, random_state=1)
    
    # Perform cross-validation
    scores = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
    
    # Calculate RMSE scores from the cross-validation scores
    rmse_scores = np.sqrt(-scores)
    
    print(f"Mean RMSE: {np.mean(rmse_scores):.4f} +/- {np.std(rmse_scores):.4f}")

In [65]:
linear_regression_cv(df)

Mean RMSE: 0.1217 +/- 0.0299


Testing Linear Regression, Decision Tree Regression

In [56]:
def create_interaction_term(df):
    df['lex_diversity_misspelled_interaction'] = df['fk_ocr'] * df['percent_misspelled']
    return df


In [57]:
def get_feature_combinations(df):
    """
    Generates all possible non-empty combinations of the given features.
    """
    features = ['lex_ocr', 'fk_ocr', 'percent_misspelled', 'lex_diversity_misspelled_interaction']
    combo_list = []
    for r in range(1, len(features) + 1):
        combo_list.extend(combinations(features, r))
    return combo_list

In [66]:
def model_evaluation(df, model_type, params, feature_combo):
    X = df[list(feature_combo)]
    y = df['CER']

    preprocessor = ColumnTransformer(transformers=[('num', StandardScaler(), list(feature_combo))])
    
    cv = KFold(n_splits=5, random_state=1, shuffle=True)
    
    if model_type == 'linear':
        pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', LinearRegression())])
        scores = cross_val_score(pipeline, X, y, cv=cv, scoring='neg_mean_squared_error')
        best_score = np.sqrt(-scores.mean())
        best_params = {}

    elif model_type == 'decision_tree':
        pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', DecisionTreeRegressor(random_state=1))])
        grid_search = GridSearchCV(pipeline, param_grid=params, cv=cv, scoring='neg_mean_squared_error')
        grid_search.fit(X, y)
        best_score = np.sqrt(-grid_search.best_score_)
        best_params = grid_search.best_params_
    
    return best_score, best_params, feature_combo, model_type

In [76]:
def top_three(model_evaluations):
    top_models = sorted(model_evaluations, key=lambda x: x['score'])[:3]
    
    print("Top 3 Models:")
    for idx, model_details in enumerate(top_models, start=1):
        print(f"Model {idx}:")
        print(f"  Model Type: {model_details['model_type']}")
        print(f"  Features: {', '.join(model_details['features'])}")
        print(f"  Parameters: {model_details['params']}")
        print(f"  RMSE: {model_details['score']:.4f}\n")

In [80]:
def main(df):
    df = create_interaction_term(df)
    
    hyperparams = {
        'linear': [{}],
        'decision_tree': [
            {
                'model__max_depth': [None, 5, 10, 20],
                'model__min_samples_split': [2, 5, 10],
                'model__min_samples_leaf': [1, 2, 4]
            }
        ]
    }
    
    features = ['lex_ocr', 'fk_ocr', 'flesch_ocr', 'percent_misspelled', 'lex_diversity_misspelled_interaction']
    model_evaluations = []
    
    for model_type, param_list in hyperparams.items():
        for r in range(1, len(features) + 1):
            for combo in combinations(features, r):
                for params in param_list:
                    score, best_params, _, _ = model_evaluation(df, model_type, params, combo)
                    model_evaluations.append({
                        'model_type': model_type,
                        'features': combo,
                        'params': best_params,
                        'score': score
                    })
                    
    top_three(model_evaluations)
    return

In [81]:
main(df)

# L1 regularization to see importance of features

Top 3 Models:
Model 1:
  Model Type: linear
  Features: fk_ocr, flesch_ocr, percent_misspelled
  Parameters: {}
  RMSE: 0.1084

Model 2:
  Model Type: linear
  Features: fk_ocr, flesch_ocr, percent_misspelled, lex_diversity_misspelled_interaction
  Parameters: {}
  RMSE: 0.1087

Model 3:
  Model Type: linear
  Features: lex_ocr, fk_ocr, flesch_ocr, percent_misspelled
  Parameters: {}
  RMSE: 0.1092

