In [1]:
# Hearst-style token patterns in spaCy Matcher syntax. Every pattern is a list
# of per-token attribute dicts; tokens whose entity type is TECH are the terms
# that get related. In the pattern comments X and Y name the two terms.


def _tech():
    """Token that must have the TECH entity type."""
    return {"ENT_TYPE": "TECH"}


def _lower(word):
    """Token whose lowercase text equals `word`."""
    return {"LOWER": word}


def _any_of(*words, op=None):
    """Token whose lowercase text is one of `words`, with an optional operator."""
    token = {"LOWER": {"IN": list(words)}}
    if op is not None:
        token["OP"] = op
    return token


def _comma(op=None):
    """Literal comma token, with an optional operator."""
    token = {"ORTH": ","}
    if op is not None:
        token["OP"] = op
    return token


def _opt_article():
    """Optional article (a / an / the)."""
    return _any_of("a", "an", "the", op="?")


def _article():
    """Mandatory article (a / an / the)."""
    return _any_of("a", "an", "the")


def _is_are():
    """Copula token: "is" or "are"."""
    return _any_of("is", "are")


rhyper_multi_patterns = [
    # !(features|properties) Y such as X1 , X2 , . . .
    [
        _any_of("features", "properties", op="!"),
        _tech(),
        _comma(op="?"),
        _lower("such"),
        _lower("as"),
        _opt_article(),
        _tech(),
    ],
    # Y including X1 , X2 , . . .
    [
        _tech(),
        _comma(op="?"),
        _lower("including"),
        _opt_article(),
        _tech(),
    ],
]

rhyper_single_patterns = [
    # (Unlike|like) (most|all|any|other) Y, X
    [
        _any_of("unlike", "like"),
        _any_of("most", "all", "any", "other"),
        _opt_article(),
        _tech(),
        _comma(),
        _opt_article(),
        _tech(),
    ],
]

hyper_single_patterns = [
    # X which is a (example|class|kind|. . . ) of Y
    [
        _tech(),
        _lower("which"),
        _is_are(),
        _any_of("a", "an"),
        _any_of("example", "class", "kind"),
        _lower("of"),
        _opt_article(),
        _tech(),
    ],
    # X (and|or) (any|some) other Y
    [
        _tech(),
        _any_of("and", "or"),
        _any_of("any", "some"),
        _lower("other"),
        _opt_article(),
        _tech(),
    ],
    # X which is called Y
    [
        _tech(),
        _comma(op="?"),
        _lower("which"),
        _is_are(),
        _any_of("also", "sometimes", op="?"),
        _lower("called"),
        _opt_article(),
        _tech(),
    ],
    # X a special case of Y
    [
        _tech(),
        _lower("a"),
        _lower("special"),
        _lower("case"),
        _lower("of"),
        _opt_article(),
        _tech(),
    ],
    # X is an Y that
    [
        _tech(),
        _is_are(),
        _article(),
        _tech(),
        _lower("that"),
    ],
    # X is a !(member|part|given) Y
    # NOTE(review): the member/part/given exclusion named above is not expressed
    # by the tokens below; this pattern also matches everything the
    # "X is an Y that" pattern matches (minus the trailing "that").
    [
        _tech(),
        _is_are(),
        _article(),
        _tech(),
    ],
]

In [27]:
import spacy
from spacy.matcher import Matcher
from random import shuffle
import pandas as pd
from tqdm import tqdm 
import time
import sys

# this turns on the autotimer, so that every cell has a timing information below
try:
    %load_ext autotime
except:
    !pip install ipython-autotime
    %load_ext autotime
# stop using:
# %unload_ext autotime

time: 16 ms (started: 2022-04-20 23:34:21 +02:00)


In [39]:
class Hearst_Patterns:
    """Extract term relations from text lines using spaCy Matcher patterns.

    Two pipelines are loaded: a custom model (``model_path``) with the
    ``merge_entities`` pipe added, which the Matcher patterns run on, and
    ``en_core_web_lg`` with ``merge_noun_chunks`` added, which is used to walk
    the text that follows a "multi" match.

    Attributes:
        patent_lines: Lines of the text file at ``text_path``.
        nlp: Custom spaCy pipeline the matcher operates on.
        en_nlp: ``en_core_web_lg`` pipeline used for enumerations.
        matcher: spaCy ``Matcher`` holding all registered patterns.
        continue_words: Tokens that may separate enumerated terms after a
            "multi" match without ending the enumeration.
    """

    # Value stored in the 'label' column for each relation name.
    _RELATION_LABELS = {
        'rhyper': -1,
        'hyper': 1,
    }

    def __init__(self, patterns, model_path="spacy/model-new", text_path="G06K.txt"):
        """Load the corpus and both pipelines, then register the patterns.

        Args:
            patterns: Iterable of ``(name, pattern_list)`` pairs. ``name`` has
                the form ``"<relation>-<type>"``: ``<relation>`` must be a key
                of ``_RELATION_LABELS`` and a ``<type>`` of ``"multi"`` makes
                ``get_matches`` collect further terms after the match.
            model_path: Path of the custom spaCy model.
            text_path: Path of the text file, processed one line at a time.
        """
        # Context manager so the file handle is closed right after reading.
        with open(text_path) as corpus_file:
            g06k = corpus_file.read().strip()
        self.patent_lines = g06k.split('\n')

        self.nlp = spacy.load(model_path)
        self.en_nlp = spacy.load("en_core_web_lg")

        self.nlp.add_pipe("merge_entities")
        self.en_nlp.add_pipe('merge_noun_chunks')

        self.matcher = Matcher(self.nlp.vocab)
        self.continue_words = [',','and','or',';','also']

        for name, pattern in patterns:
            self.matcher.add(name, pattern)

    def extract_patterns(self, size=10, save_folder=".", start=0):
        """Scan lines from ``start`` until at least ``size`` relations are found.

        Processing stops early when the lines run out, on Ctrl-C, or when a
        line raises; in every case the relations gathered so far are written
        to ``{save_folder}/hearst_patterns.<count>.csv``.

        Args:
            size: Minimum number of relations to collect. A line can yield
                several relations, so the final count may exceed ``size``.
            save_folder: Existing directory that receives the CSV file.
            start: Index into ``self.patent_lines`` of the first line to scan.
        """
        extracted = []
        line = start
        total_lines = len(self.patent_lines)
        print(f'{len(extracted)} pattern extracted...', end='\r')
        sys.stdout.flush()
        try:
            # Bounding by total_lines replaces relying on an IndexError to
            # detect the end of the corpus.
            while len(extracted) < size and line < total_lines:
                # Use this instance, not a notebook-global extractor.
                found = self.get_matches(self.patent_lines[line])
                line += 1
                if found:
                    extracted += found
                print(f'{len(extracted)} patterns extracted...{line}', end='\r')
                sys.stdout.flush()
        except KeyboardInterrupt:
            # Best effort: keep what was collected before the interruption.
            print("Interrupted; saving the patterns extracted so far")
        except Exception as err:
            # Best effort as well, but report what actually went wrong.
            print(f"An error has occurred: {err!r}")
        else:
            if len(extracted) < size:
                print(f'Reached the end of the text at line {line}')

        count = len(extracted)
        print(f'({count}) patterns extracted from lines ({start}-{line})')
        save_file = f"{save_folder}/hearst_patterns.{count}.csv"
        df = pd.DataFrame(extracted, columns =['word1', 'word2', 'relation', 'label', 'text'])
        df.to_csv(save_file)
        # Announce the file only after it has actually been written.
        print(f'Patterns saved to {save_file}')

    def get_matches(self, text):
        """Return the relations that the registered patterns find in ``text``.

        Args:
            text: One line of raw text.

        Returns:
            List of unique ``(word1, word2, relation, label, text)`` tuples in
            first-seen order. ``word1`` and ``word2`` are the first and last
            entity tokens of a match; for "multi" patterns one extra tuple is
            produced for every further entity enumerated after the match.
        """
        # The '. ' prefix puts a token in front of the line — presumably so
        # patterns that begin with a negated token can match at line start.
        doc = self.nlp('. '+text)
        all_ent_texts = [ent.text for ent in doc.ents]
        relations = []
        for match_id, start, end in self.matcher(doc):
            # Built once per match instead of once per token.
            span_ent_texts = {ent.text for ent in doc[start:end].ents}
            ent_indices = [i for i in range(start, end) if doc[i].text in span_ent_texts]
            if not ent_indices:
                # Skip only this match; relations already found are kept.
                continue

            # Trim the match to run from its first to its last entity token.
            span = doc[ent_indices[0]:ent_indices[-1]+1]

            match_info = self.nlp.vocab.strings[match_id]  # Get string representation
            match_name, match_type = match_info.split('-')[:2]

            left_term = span[0].text
            right_terms = [span[-1].text]
            if match_type=="multi":
                # Walk the text after the match, collecting entities until a
                # token is neither an entity nor a continuation word.
                for tok in self.en_nlp(doc[end:].text):
                    matching_ent = next((t for t in all_ent_texts if t in tok.text), None)
                    if matching_ent is not None:
                        right_terms.append(matching_ent)
                    elif tok.text not in self.continue_words:
                        break

            label = self._RELATION_LABELS[match_name]
            for term in right_terms:
                relations.append((left_term, term, match_name, label, text))

        # dict.fromkeys removes duplicates while keeping a deterministic order.
        return list(dict.fromkeys(relations))


# Each pattern group is registered under a "<relation>-<type>" key; the class
# splits that key on '-' to pick the relation label and the "multi" handling.
patterns = [
    ("rhyper-multi", rhyper_multi_patterns),
    ("hyper-single", hyper_single_patterns),
    ("rhyper-single", rhyper_single_patterns),
]

# Build the extractor (loads both spaCy pipelines and the corpus text).
hp = Hearst_Patterns(
    patterns,
    model_path="../spacy/model-new",
    text_path="../G06K.txt",
)

time: 6.17 s (started: 2022-04-21 00:04:43 +02:00)


In [108]:
# Start scanning at corpus line index 5896 and stop once at least 50 relations
# have been collected; the result is written as a CSV into the current folder.
hp.extract_patterns(start=5896, size=50)

time: 0 ns (started: 2022-04-21 02:13:14 +02:00)


Last line processed: 26331

In [110]:
import pandas as pd

# Load the formatted relations and discard the unnamed index column that a
# previous to_csv call stored in the file.
df = (
    pd.read_csv("hearst_patterns/formatted_hearst_patterns.155.csv")
    .drop(columns=['Unnamed: 0'])
)

time: 16 ms (started: 2022-04-21 02:15:24 +02:00)


In [113]:
# Write df back to the same path the earlier cell read it from.
# NOTE(review): to_csv writes the index by default, so the saved file gets an
# unnamed first column again (read back as 'Unnamed: 0') — confirm whether
# index=False is intended; the loading cell's drop() currently expects that column.
df.to_csv("hearst_patterns/formatted_hearst_patterns.155.csv")

time: 0 ns (started: 2022-04-21 02:16:25 +02:00)
