# Testing `text-matcher` Hyperparameters


Parameters to test:  
  -t, --threshold INTEGER    The shortest length of match to include in the
                             list of initial matches.

  -c, --cutoff INTEGER       The shortest length of match to include in the
                             final list of extended matches.

  -n, --ngrams INTEGER       The ngram n-value to match against.
  
  -m, --mindistance INTEGER  The minimum value for distance between two matches.
  
  -l, --logfile TEXT         The name of the log file to write to.


`myMatch = Matcher(textObjA, textObjB, threshold=threshold, cutoff=cutoff, ngramSize=ngrams,removeStopwords=stops, minDistance=mindistance, silent=silent)`

In [None]:
# Kept for reference (disabled): launch command that sets the notebook server's
# iopub_data_rate_limit option to 1.0e10.
# NOTE(review): presumably needed because some cells emit a lot of output — confirm before re-enabling.
#!jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10

### Install `text-matcher`

In [None]:
!pip3 install --user text-matcher

## Import libraries

In [1]:
from text_matcher.matcher import Text, Matcher
import json
import pandas as pd
from IPython.display import clear_output
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = [16, 6]

### Defining filepaths for the text matcher

In the current directory we're working in, I have added a few files that we can try running.
We've imported our text matcher, `matcher`, and now we're going to define the texts for the matcher to process by opening and reading text files in our directory.

In [70]:
# Load our JSTOR data: one JSON document per line.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open('../../part-1.jsonl', encoding='utf-8') as f:
    rawCriticism = f.readlines()

# Parse the JSTOR data, skipping blank lines (e.g. a stray trailing newline)
# that would otherwise make json.loads raise a JSONDecodeError.
data = [json.loads(line) for line in rawCriticism if line.strip()]

# Load our Middlemarch text file
with open('../middlemarch.txt', encoding='utf-8') as f:
    rawMM = f.read()

# Wrap the novel in a Text object labelled 'Middlemarch'; stopword removal is off.
mm = Text(rawMM, 'Middlemarch', removeStopwords=False)

In [3]:
# Sanity check: echo the `mm` object so the cell output shows what it is.
mm

<text_matcher.matcher.Text at 0x7f902b645460>

In [71]:
# Match every article in `data` against `mm` and store the results on the
# article's own dict. Articles that already carry a 'numMatches' key are
# skipped, so an interrupted run can be resumed by re-running this cell.
# NOTE: because of that guard, results already stored on an article are kept
# as-is; reload `data` before re-running with different hyperparameters.
total = len(data)
for i, article in enumerate(data, start=1):
    # wait=True defers clearing until the next output arrives, avoiding flicker.
    clear_output(wait=True)
    # i is 1-based so the final message reads "N of N" rather than "N-1 of N".
    print('\r', 'Matching article %s of %s' % (i, total), end='')
    if 'numMatches' not in article:
        articleText = Text(article['fullText'], article['id'], removeStopwords=False)
        # match() is unpacked into three values: the match count and the
        # match locations in each of the two texts (A = mm, B = the article).
        article['numMatches'], article['Locations in A'], article['Locations in B'] = \
            Matcher(mm, articleText,
                    threshold=3, cutoff=4, ngramSize=3,
                    removeStopwords=False, minDistance=5).match()

 Matching article 5883 of 5884

In [72]:
# Persist the article records (still including their full text) as JSON.
# The file name encodes the run's settings (t=threshold, c=cutoff, n=ngram size,
# m=min distance); keep it in sync with the Matcher arguments used for the run.
filepath = '../../../Middlematch/hyperparameter-data/t3-c4-n3-m5-with-stops.json'
with open(filepath, 'w') as outfile:
    json.dump(obj=data, fp=outfile)

In [73]:
# Re-read the file at `filepath`, drop the bulky 'fullText' column, and
# overwrite the same file with the slimmed-down table.
df = pd.read_json(filepath).drop(columns='fullText')
df.to_json(filepath)

## Assessing Matches

In [10]:
# Directory holding one JSON results file per hyperparameter run.
HYPERPARAMETER_DIR = '../../../Middlematch/hyperparameter-data'


def load_run(name):
    """Read the results of one matcher run into a DataFrame.

    name: file name without the '.json' extension, e.g. 't3-c4-n3-m8-with-stops'.
    Returns the result of pd.read_json on HYPERPARAMETER_DIR/<name>.json.
    """
    return pd.read_json('%s/%s.json' % (HYPERPARAMETER_DIR, name))


# Variable names mirror the file names (t=threshold, c=cutoff, n=ngram size,
# m=min distance). Each file is read exactly once.
default = load_run('default')

t3c4n3m8 = load_run('t3-c4-n3-m8-with-stops')

t3c5n2m8 = load_run('t3-c5-n2-m8-with-stops')

t3c4n2m8 = load_run('t3-c4-n2-m8-with-stops')

t3c3n2m8 = load_run('t3-c3-n2-m8-with-stops')

t3c3n3m8 = load_run('t3-c3-n3-m8-with-stops')

t3c5n3m1 = load_run('t3-c5-n3-m1-with-stops')

t2c5n3m8 = load_run('t2-c5-n3-m8-with-stops')

t2c5n3m1 = load_run('t2-c5-n3-m1-with-stops')

t2c4n3m1 = load_run('t2-c4-n3-m1-with-stops')

t2c3n2m8 = load_run('t2-c3-n2-m8-with-stops')

t2c3n2m1 = load_run('t2-c3-n2-m1-with-stops')


---

### For the default settings

In [12]:
# Keep only the rows whose 'Locations in A' entry is non-empty, then
# summarise the numeric columns of what remains.
hasMatches_default = default['Locations in A'].map(len) > 0
articlesWithMatches_default = default[hasMatches_default]
articlesWithMatches_default.describe()

Unnamed: 0,pageCount,publicationYear,wordCount,numMatches
count,925.0,925.0,925.0,925.0
mean,17.135135,1992.260541,7975.380541,2.543784
std,12.675013,18.738177,6109.061051,3.672095
min,1.0,1902.0,469.0,1.0
25%,9.0,1981.0,4328.0,1.0
50%,16.0,1995.0,7546.0,1.0
75%,22.0,2007.0,10478.0,2.0
max,202.0,2022.0,109286.0,39.0


### For different iterations

In [13]:
def articles_with_matches(frame):
    """Return the rows of `frame` whose 'Locations in A' entry is non-empty."""
    nonEmpty = frame['Locations in A'].map(len) > 0
    return frame[nonEmpty]


# Choose which hyperparameter run to inspect (despite the name, this is the
# DataFrame itself, not a string).
df_name = t3c3n2m8

articlesWithMatches_df_name = articles_with_matches(df_name)
articlesWithMatches_df_name.describe()
# Optional export of the summary table (disabled).
# NOTE(review): the file name below says t3c5n3m1 while df_name is t3c3n2m8 — update it before enabling.
#articlesWithMatches_df_name.describe().to_csv('descriptive-stats-on-hyperparameter-match-numbers/t3c5n3m1.csv')

Unnamed: 0,pageCount,publicationYear,wordCount,numMatches
count,1091.0,1091.0,1091.0,1091.0
mean,16.870761,1992.224565,7849.823098,2.633364
std,13.681259,19.162654,6338.232454,4.02234
min,1.0,1902.0,469.0,1.0
25%,9.0,1981.0,3963.5,1.0
50%,16.0,1995.0,7357.0,1.0
75%,22.0,2007.5,10389.0,2.0
max,202.0,2022.0,109286.0,42.0


In [16]:
# Count how many of the selected articles share each 'datePublished' value.
publicationDates = articlesWithMatches_df_name['datePublished']
publicationDates.value_counts()

2016-01-01    12
2014-01-01    10
2000-01-01    10
1983-01-01     9
1993-09-01     9
              ..
1967-09-01     1
1982-02-01     1
2017-10-01     1
1984-11-01     1
1955-03-01     1
Name: datePublished, Length: 487, dtype: int64