In [26]:
# import titles data

import pandas as pd
import spacy
from spacy import displacy
!python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm", disable=['ner'])

df = (pd.read_csv("Questions.csv", nrows=1_000_000,
                    encoding="ISO-8859-1", usecols=['Title', 'Id']))
titles = [_ for _ in df.loc[lambda d: d['Title'].str.lower().str.contains("go")]['Title']]


Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m43.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [27]:
# look at random titles to get a sense
import random 
random.choices(titles, k=20)

['Stop cursor from going to start of text field',
 'Google Play Developer Console : Some stats are missing',
 'S3 not serving media files to Django heroku app',
 'How to show 2 markers on a Google Map?',
 'Connect to Google Calendar API but get empty array of events',
 'Search Algorithm error regarding first index value',
 'wagon-ssh-1.0-beta-2.jar usage problem',
 'Need help creating Coldfusion App to Google Group API',
 'Indentation standards when writing django Application',
 'One django instance one database vs different django instances',
 'Archives page in Django',
 'Google Map Algorithm (Ajax, Tiles, etc)',
 'Getting results with association that belongs only to a certain list of categories',
 'Weird Segfaults Go Away After Full Rebuild',
 'Creating a regular expression in django',
 'Got response from server is this response correct If so How to parse that response?',
 'google play services not supported for Maps',
 "How to copy and paste the value or text from Google search box

In [28]:
# Quickly trying string match
def has_golang(doc):
    doc=nlp(doc)
    for t in doc:
        if t.lower_ in ["go", "golang"]:
            if t.pos_ != 'VERB':
                if t.dep_ == 'pobj':
                    return True
    return False

g = (doc for doc in nlp.pipe(titles) if has_golang(doc))
[next(g) for i in range (20)]

## basic string matching will not help us here. There is some inherent meaning for 'Go' as a token that cannot be understood through a string match.

[How do I disable multiple listboxes in one go using jQuery?,
 Embedding instead of inheritance in Go,
 Shared library in Go?,
 multi package makefile example for go,
 What's the point of having pointers in Go?,
 Simulate a tcp connection in Go,
 Trouble reading from a socket in go,
 Convert string to integer type in Go?,
 Implementing the âdeferâ statement from Go in Objective-C?,
 what's the state of go language IDE support?,
 Global Variables with GO,
 Generating Random Numbers in Go,
 making generic algorithms in go,
 In Go, one type is coerced into another, can a method to determine the type of the receiver?,
 The maximum value for an int type in Go,
 Do Sets exist in Go? (like in Python),
 Usage of interface in Go,
 How do I retrieve file data over a socket in Go?,
 How do you create a new instance of a struct from it's Type at runtime in Go?,
 Is it possible to make extensions to python/php/perl with Go?]

In [29]:
df_tags = pd.read_csv("Tags.csv")
go_ids = df_tags.loc[lambda d: d['Tag'] == 'go']['Id']

def has_go_token(doc):
    for t in doc:
        if t.lower_ in ["go", "golang"]:
            return True
    return False

all_go_sentences = df.loc[lambda d: d['Id'].isin(go_ids)]['Title'].tolist()
detectable = [d.text for d in nlp.pipe(all_go_sentences) if has_go_token(d)]

non_detectable = (df
                 .loc[lambda d: ~d['Id'].isin(go_ids)]
                 .loc[lambda d: d['Title'].str.lower().str.contains("go")]
                 ['Title']
                 .tolist())

non_detectable = [d.text for d in nlp.pipe(non_detectable) if has_go_token(d)]

len(all_go_sentences), len(detectable), len(non_detectable)

(1167, 762, 1345)

In [30]:
model_name = "en_core_web_sm"
model = spacy.load(model_name, disable=['ner'])

def has_go_token(doc):
    for t in doc:
        if t.lower_ in ["go", "golang"]:
            if t.pos_ != 'VERB':
                if t.dep_ == 'pobj':
                    return True
    return False

method = "not-verb-but-pobj"

correct = sum(has_go_token(doc) for doc in model.pipe(detectable))
wrong = sum(has_go_token(doc) for doc in model.pipe(non_detectable))
precision = correct / (correct + wrong)
recall = correct / len(detectable)
accuracy = (correct + len(non_detectable) - wrong) / (len(detectable) + len(non_detectable))

f"model: {model_name}, method {method}, precision: {precision:.2f}, recall: {recall:.2f}, accuracy: {accuracy:.2f}"

'model: en_core_web_sm, method not-verb-but-pobj, precision: 0.93, recall: 0.39, accuracy: 0.77'

In [32]:
obj_c_pattern = [{'LOWER': 'objective'},
                 {'IS_PUNCT': True},
                 {'LOWER': 'c'}]

golang_pattern = [{'LOWER': {'IN': ['go', 'golang']},
                   'POS': ['NOT_IN', 'VERB']}]

from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)

matcher.add("OBJ_C_LANG", None, obj_c_pattern)
matcher.add("GOLANG_LANG", None, golang_pattern)

TypeError: add() takes exactly 2 positional arguments (3 given)

In [None]:
spacy.explain("pobj")
type(all_go_sentences)




list