### Rule-Based String Detection using spaCy

In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [34]:
def has_lang_token(doc):
    """Tell whether any single token of `doc` names a known language.

    A token counts when its lowercased text is one of the listed names and
    its part-of-speech tag is not 'VERB' (so "go" used as a verb is skipped).
    Returns True on the first such token, otherwise False.
    """
    known_langs = ['go', 'golang', 'python', 'ruby', 'objective-c']
    return any(
        tok.lower_ in known_langs and tok.pos_ != 'VERB'
        for tok in doc
    )
# This function works for single-token names like 'go', but fails for names such as 'objective-c' that the tokenizer splits into several tokens.

In [35]:
# Example sentence: the tokenizer splits 'objective-c' into three tokens.
doc = nlp('I am an iOS dev and I like to code in objective-c')

In [36]:
# Show the individual tokens produced for the sentence.
list(doc)

[I, am, an, iOS, dev, and, I, like, to, code, in, objective, -, c]

In [37]:
# No single token equals 'objective-c', so the token-level check finds nothing.
has_lang_token(doc)

False

In [38]:
from spacy.matcher import Matcher 

In [39]:
# Token patterns for spaCy's Matcher: each pattern is a list holding one
# attribute dict per token to match.

# Go: "golang" always; bare "go" only when it is not tagged as a verb.
golang_pattern1 = [{'LOWER': 'golang'}]
golang_pattern2 = [{'LOWER': 'go', 'POS': {'NOT_IN': ['VERB']}}]

# Objective-C: "objective" + optional punctuation + "c" covers the
# multi-token split; the second pattern covers the fused spelling.
obj_c_pattern1 = [
    {'LOWER': 'objective'},
    {'IS_PUNCT': True, 'OP': '?'},
    {'LOWER': 'c'},
]
obj_c_pattern2 = [{'LOWER': 'objectivec'}]

# Languages that need only a single lowercase-token check.
js_pattern = [{'LOWER': {'IN': ['js', 'javascript']}}]
python_pattern = [{'LOWER': 'python'}]
ruby_pattern = [{'LOWER': 'ruby'}]

# Rule-based matching with custom patterns; see the spaCy Matcher documentation for more pattern options.

In [40]:
# One Matcher holds every language rule; validate=True asks spaCy to validate
# the patterns added to it.
matcher = Matcher(nlp.vocab, validate=True)

# Pass each rule's patterns as a list. This form is accepted by spaCy >= 2.2.2
# and is the only one spaCy v3 accepts; the older
# `add(key, None, *patterns)` call is rejected by v3.
matcher.add("OBJ_C_LANG", [obj_c_pattern1, obj_c_pattern2])
matcher.add("PYTHON_LANG", [python_pattern])
matcher.add("GO_LANG", [golang_pattern1, golang_pattern2])
matcher.add("JS_LANG", [js_pattern])
matcher.add("RUBY_LANG", [ruby_pattern])


In [41]:
# Each match is a (match_id, start, end) tuple; start and end are token indices into doc.
matcher(doc)

[(4002319739860662978, 11, 14)]

In [42]:
doc[11: 14] # tokens 11 through 13 of doc together form the span 'objective-c'

objective-c

### Multiple-Language Detection by looping through the matches

In [43]:
doc = nlp('I am an iOS dev who codes in both go/golang as well as objective-c')
# Turn every (match_id, start, end) result into its token span, then print each.
matched_spans = [doc[start:end] for _match_id, start, end in matcher(doc)]
for span in matched_spans:
    print(span)

golang
objective-c


In [44]:
doc = nlp("I am an iOS dev who codes in all of python, go/golang as well as objective-c")
# Print the matched text for each rule hit.
for _match_id, first_tok, past_last_tok in matcher(doc):
    hit = doc[first_tok:past_last_tok]
    print(hit)

python
golang
objective-c


In [45]:
doc = nlp("I've done some js and ruby programming")
# Run the matcher once, then print the span behind each result.
hits = matcher(doc)
for hit in hits:
    begin, stop = hit[1], hit[2]
    print(doc[begin:stop])

js
ruby


### Dataset 
https://www.kaggle.com/datasets/stackoverflow/stacksample

In [46]:
import pandas as pd

# Read only the Id and Title columns from the first million rows of the
# StackSample questions file.
df = pd.read_csv(
    "Questions.csv",
    usecols=['Title', 'Id'],
    nrows=1_000_000,
    encoding="ISO-8859-1",
)

In [47]:
# Lazy, one-shot stream of the titles whose text contains "python" (case-insensitive).
titles = (title for title in df['Title'] if "python" in title.lower())


### Benchmarking

In [48]:
from itertools import islice

N_TITLES = 200  # how many titles to pull from the stream per run

# Print the sampled titles that contain "python" yet produce no match, i.e.
# cases where the name is only part of a larger token; these misses are
# acceptable. islice ends the loop quietly if `titles` runs out, whereas
# calling next(titles) a fixed number of times raised StopIteration.
# `titles` is a one-shot generator, so re-running this cell continues with
# the next batch of titles.
for title in islice(titles, N_TITLES):
    doc = nlp(title)
    if not matcher(doc):
        print(doc)

mod_python/MySQL error on INSERT with a lot of data: "OperationalError: (2006, 'MySQL server has gone away')"
Running subversion under apache and mod_python
What's the best way to embed IronPython inside my C# App?
How to set the PYTHONPATH in Emacs?
wxPython wxDC object from win32gui.GetDC
Need skeleton code to call Excel VBA from PythonWin
Questions for python->scheme conversion
wxPython and sharing objects between windows
Django on IronPython
IronPython Webframework
A SuggestBox for wxPython?
Intercepting Method Access on the Host Program of IronPython
Is there anything like IPython / IRB for Perl?
