# Find pattern matches
This notebook is for finding linguistic patterns in the answer column. The patterns are stored in patterns.json. Simply enter the path and name of the input csv file, name the csv file that will be output, and run the cell.

In [13]:
import pandas as pd
import json
import spacy
from spacy.matcher import DependencyMatcher

# Add the file path and name for the csv
file = '../data/...csv'

# Read the csv into a df. 'Unnamed: 0' is presumably an index column saved by
# an earlier to_csv call; errors='ignore' keeps the cell working for csv files
# that do not contain it instead of raising a KeyError.
df = pd.read_csv(file).drop(columns=['Unnamed: 0'], errors='ignore')

# Open the pattern JSON file. The handle gets its own name so that `file`
# keeps holding the csv path rather than being rebound to a closed file
# object, and the encoding is explicit so the result does not depend on the
# platform's default locale.
with open("../data/patterns.json", "r", encoding="utf-8") as patterns_file:
    patterns_dict = json.load(patterns_file)

# Load spacy and add the matcher
nlp = spacy.load("en_core_web_sm")
matcher = DependencyMatcher(nlp.vocab)

# Add the patterns to the matcher: every JSON key becomes a match name and
# its value is registered as the single pattern for that name
for pattern_name, pattern_list in patterns_dict.items():
    matcher.add(pattern_name, [pattern_list])

# Get the pattern names
pattern_names = list(patterns_dict)

# Add a column to the DataFrame for each pattern name, starting at 0 matches
for pattern_name in pattern_names:
    df[pattern_name] = 0

# Define a function to count the patterns in the answers
def count_patterns(text, matcher):
    """Count the number of pattern matches in the text.

    Args:
        text: The answer text to analyse. A missing value (``None`` or a
            float NaN, e.g. from an empty csv cell) is reported as having
            no matches instead of being handed to the spaCy pipeline.
        matcher: Callable applied to the parsed doc that yields
            ``(match_id, token_ids)`` pairs; ``matcher.vocab.strings`` is
            used to turn each ``match_id`` back into its pattern name.

    Returns:
        A dict with one entry per key of the module-level ``patterns_dict``,
        mapping the pattern name to its number of matches in ``text``.
    """
    import math

    counts = dict.fromkeys(patterns_dict, 0)

    # A missing answer cannot be parsed; without this guard one empty cell
    # would abort the whole run when it reaches nlp().
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return counts

    doc = nlp(text)
    for match_id, _ in matcher(doc):
        counts[matcher.vocab.strings[match_id]] += 1
    return counts
        
# Walk the answers one row at a time and store every pattern's match count
# in that row's column for the pattern
for row_label, answer in df['answer'].items():
    for name, n_matches in count_patterns(answer, matcher).items():
        df.at[row_label, name] = n_matches

# Save the DataFrame with the added count columns as csv
# (complete the path below with the name of the output file)
df.to_csv('../data/')