In [None]:
import os
import spacy
from spacy.matcher import Matcher

# Load the spaCy 'en_core_web_sm' pipeline once at module level.
# define_matcher() builds its Matcher on nlp.vocab and process_text() calls
# nlp(txt) to parse each input, so both share this single instance.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Define the matching patterns:
def define_matcher():
    """Build the token matcher for first-person plural phrases.

    Registers nine two-token patterns on a ``Matcher`` that shares the
    vocabulary of the module-level ``nlp`` pipeline. Each pattern is added
    under the key ``'PAT_<n>'``; callers use that key to decide how the
    matched span is rewritten.

    Returns:
        Matcher: the matcher with patterns ``PAT_0`` .. ``PAT_8`` registered.
    """
    we = {'LOWER': 'we'}
    our = {'LOWER': 'our'}
    not_have = {'NOT_IN': ['have']}

    # (rule name, token specs) in registration order.
    rules = [
        # We are -> The company is
        ('PAT_0', [we, {'LOWER': 'are'}]),
        # We were -> The company was
        ('PAT_1', [we, {'LOWER': 'were'}]),
        # We have -> The company has
        ('PAT_2', [we, {'LOWER': 'have'}]),
        # We had -> The company had
        ('PAT_3', [we, {'LOWER': 'had'}]),
        # We did -> The company did
        ('PAT_4', [we, {'LOWER': 'did'}]),
        # We believe -> The company believes
        ('PAT_5', [we, {'POS': 'VERB', 'TAG': 'VBP', 'LEMMA': not_have}]),
        # We believed -> The company believed
        ('PAT_6', [we, {'POS': 'VERB', 'TAG': 'VBD', 'LEMMA': not_have}]),
        # Our debt/current -> The company's debt/current
        ('PAT_7', [our, {'TAG': {'IN': ['NN', 'NNS', 'NNP', 'JJ', 'RB', 'VBN', 'VBG']}}]),
        # We may -> The company may
        ('PAT_8', [we, {'TAG': 'MD'}]),
    ]

    phrase_matcher = Matcher(nlp.vocab)
    for rule_name, token_specs in rules:
        phrase_matcher.add(rule_name, [token_specs])

    return phrase_matcher


# Apply the matcher to rewrite the text:
def process_text(txt):
    """Rewrite first-person plural phrasing into third-person "the company".

    The text is parsed with the module-level ``nlp`` pipeline and scanned with
    the matcher from ``define_matcher()``. Every matched two-token span is
    replaced according to the pattern that matched it:

    * PAT_0..PAT_3: "we are/were/have/had" -> "the company is/was/has/had"
    * PAT_4, PAT_6, PAT_8: "we <word>" -> "the company <word>" (word unchanged)
    * PAT_5: "we <verb>" -> "the company <verb in third-person singular>"
    * PAT_7: "our <word>" -> "the company's <word>"

    The replacement starts with a capital "The" unless the matched span starts
    with a lowercase character.

    Replacements are spliced in by character offset of the matched span, so
    only the matched occurrences change. (A global ``str.replace`` would also
    rewrite look-alike substrings, e.g. turning "your debt" into
    "ythe company's debt" once "our debt" has matched elsewhere.)

    Args:
        txt (str): the text to rewrite.

    Returns:
        str: the rewritten text; text outside matched spans is left as-is.
    """
    doc = nlp(txt)
    matcher = define_matcher()

    # Patterns whose replacement verb is fixed regardless of the matched token.
    fixed_verbs = {'PAT_0': 'is', 'PAT_1': 'was', 'PAT_2': 'has', 'PAT_3': 'had'}
    # If two patterns hit the same span, the earlier name in this tuple wins,
    # so the literal "we are/were/..." rules beat the generic tag-based ones
    # and the outcome does not depend on the order matches are reported in.
    priority = ('PAT_0', 'PAT_1', 'PAT_2', 'PAT_3', 'PAT_4',
                'PAT_8', 'PAT_7', 'PAT_6', 'PAT_5')

    # (start_char, end_char) -> (priority rank, replacement text)
    edits = {}
    for match_id, start, end in matcher(doc):
        rule = nlp.vocab.strings[match_id]
        if rule not in priority:
            continue
        span = doc[start:end]
        last_token = doc[end - 1]

        if rule in fixed_verbs:
            subject, tail = 'the company ', fixed_verbs[rule]
        elif rule == 'PAT_5':
            subject, tail = 'the company ', _third_person_singular(last_token.lemma_)
        elif rule == 'PAT_7':
            subject, tail = "the company's ", last_token.text
        else:  # PAT_4, PAT_6, PAT_8: the second word is kept verbatim
            subject, tail = 'the company ', last_token.text

        # Mirror the capitalisation of the matched phrase.
        if not span.text[0].islower():
            subject = subject[0].upper() + subject[1:]

        key = (span.start_char, span.end_char)
        rank = priority.index(rule)
        if key not in edits or rank < edits[key][0]:
            edits[key] = (rank, subject + tail)

    # Span offsets are relative to doc.text, so splice against that string.
    source = doc.text
    pieces = []
    cursor = 0
    for (start_char, end_char), (_, replacement) in sorted(edits.items()):
        if start_char < cursor:
            # Overlaps a span that was already rewritten; keep the earlier one.
            continue
        pieces.append(source[cursor:start_char])
        pieces.append(replacement)
        cursor = end_char
    pieces.append(source[cursor:])

    return ''.join(pieces)


def _third_person_singular(lemma):
    """Return the third-person singular present form of a verb lemma.

    Applies the regular English spelling rules ("pass" -> "passes",
    "carry" -> "carries", "do" -> "does") plus the irregular "be" and "have".
    Verbs that double a final consonant (e.g. "quiz") are not handled.
    """
    irregular = {'be': 'is', 'have': 'has'}
    lowered = lemma.lower()
    if lowered in irregular:
        return irregular[lowered]
    if lowered.endswith(('s', 'sh', 'ch', 'x', 'z', 'o')):
        return lemma + 'es'
    if len(lowered) > 1 and lowered.endswith('y') and lowered[-2] not in 'aeiou':
        return lemma[:-1] + 'ies'
    return lemma + 's'



In [None]:
# Batch driver: run process_text() over every file in source_file_dir and write
# the result under the same file name in dest_file_dir.
# NOTE(review): these are absolute paths on one machine; consider making them
# configurable before sharing the notebook.
source_file_dir = '/Users/dmitrybaron/Desktop/MIDS/W210_Capstone/Final_output'
dest_file_dir = '/Users/dmitrybaron/Desktop/MIDS/W210_Capstone/processed_folder'

# Create the output folder up front so the first write cannot fail on a
# missing directory.
os.makedirs(dest_file_dir, exist_ok=True)

# Sorted so the processing order is the same on every run.
list_source_file_names = sorted(os.listdir(source_file_dir))

for source_file_name in list_source_file_names:
    source_file_path = os.path.join(source_file_dir, source_file_name)

    # os.listdir also returns sub-directories and hidden files (e.g. macOS
    # '.DS_Store'); those cannot be read as text, so skip them.
    if source_file_name.startswith('.') or not os.path.isfile(source_file_path):
        continue

    # Explicit encoding so the result does not depend on the platform default.
    with open(source_file_path, 'r', encoding='utf-8') as file:
        source_text = file.read()

    processed_text = process_text(source_text)

    processed_file_path = os.path.join(dest_file_dir, source_file_name)
    with open(processed_file_path, 'w', encoding='utf-8') as file:
        file.write(processed_text)
        