In [43]:
# import titles data

from itertools import islice

import pandas as pd
import spacy
from spacy import displacy
# Load the small English pipeline with the NER component disabled; the cells in
# this notebook only read token text, POS tags and dependency labels.
nlp = spacy.load("en_core_web_sm", disable=['ner'])

# Read only the two columns that are used, from the first million rows.
df = pd.read_csv(
    "Questions.csv",
    nrows=1_000_000,
    encoding="ISO-8859-1",
    usecols=['Title', 'Id'],
)

# Cheap pre-filter: titles containing the substring "go", case-insensitively.
# regex=False treats "go" as a plain literal; na=False makes a missing title count
# as "no match" instead of leaving NaN in the boolean mask, which .loc rejects.
titles = (
    df.loc[lambda d: d['Title'].str.lower().str.contains("go", regex=False, na=False), 'Title']
    .tolist()
)


In [44]:
# look at random titles to get a sense
import random

# random.sample draws without replacement, so the same row is never shown twice
# (random.choices draws with replacement). min() keeps this from raising when
# fewer than 20 titles are available.
random.sample(titles, k=min(20, len(titles)))

['Is there a good framework for Java desktop applications?',
 'Google Earth plugins not found on this uri com.google.earth',
 'If a not good attribute is selected for decision tree, there is a consistent hypothesis here?',
 'How can I listen all incoming and outgoing data that goes to a Flash-base webchat?',
 'How to get LatLngBounds of feature polygon geometry in google maps v3?',
 'Change the website title in Google search',
 'How to extend Query object in mongoose',
 'I want to increase /decrease Flashlight brightness. I tried and failed wit do this. So the code and requirement goes below',
 'Using JQuery to get and display Categories and Sub Categories - ALL AJAX',
 "mongodb: how to add a new property for mongo's geolocation function",
 'Google Adwords API: Get operator (Add/Remove/Set) of SimpleMutateResult Operation',
 'Extract feature coordinates from SpatialPolygons and other sp classes',
 'form validation in django',
 'How to clone an element and insert it multiple times in on

In [49]:
# Quickly trying string match
def has_golang(doc):
    """Return True if "go"/"golang" occurs as a non-verb object of a preposition.

    Parameters
    ----------
    doc : str or iterable of tokens
        A raw string is parsed with the module-level ``nlp`` pipeline first.
        Anything else is treated as already parsed (e.g. a Doc yielded by
        ``nlp.pipe``) and is iterated as-is, so it is not run through the
        pipeline a second time.

    Returns
    -------
    bool
        True when at least one token has ``lower_`` equal to "go" or "golang",
        a ``pos_`` other than 'VERB', and ``dep_`` equal to 'pobj'.
    """
    # Only raw text needs parsing; re-running the pipeline on a parsed Doc is
    # wasted work.
    if isinstance(doc, str):
        doc = nlp(doc)
    return any(
        t.lower_ in ("go", "golang") and t.pos_ != 'VERB' and t.dep_ == 'pobj'
        for t in doc
    )

# Lazily parse the candidate titles and keep the ones that pass the filter.
g = (doc for doc in nlp.pipe(titles) if has_golang(doc))
# Take at most 20 matches. islice simply stops when the generator runs dry,
# whereas calling next() on an exhausted generator raises StopIteration.
list(islice(g, 20))

## Basic string matching will not help us here: whether 'Go' refers to the language depends on the role the token plays in the sentence, and a plain string match cannot capture that.

[How do I disable multiple listboxes in one go using jQuery?,
 Embedding instead of inheritance in Go,
 Shared library in Go?,
 multi package makefile example for go,
 What's the point of having pointers in Go?,
 Simulate a tcp connection in Go,
 Trouble reading from a socket in go,
 Convert string to integer type in Go?,
 Implementing the âdeferâ statement from Go in Objective-C?,
 what's the state of go language IDE support?,
 Global Variables with GO,
 Generating Random Numbers in Go,
 making generic algorithms in go,
 In Go, one type is coerced into another, can a method to determine the type of the receiver?,
 The maximum value for an int type in Go,
 Do Sets exist in Go? (like in Python),
 Usage of interface in Go,
 How do I retrieve file data over a socket in Go?,
 How do you create a new instance of a struct from it's Type at runtime in Go?,
 Is it possible to make extensions to python/php/perl with Go?]

In [52]:
# Ids of the rows in Tags.csv whose Tag is exactly 'go'; they are matched
# against df['Id'] further down.
df_tags = pd.read_csv("Tags.csv")
go_ids = df_tags.loc[df_tags['Tag'].eq('go'), 'Id']

def has_go_token(doc):
    """Return True if any token in ``doc`` has ``lower_`` equal to "go" or "golang"."""
    go_words = ("go", "golang")
    return any(tok.lower_ in go_words for tok in doc)

# Titles of every question whose Id carries the 'go' tag.
all_go_sentences = df.loc[lambda d: d['Id'].isin(go_ids), 'Title'].tolist()
# Of those, the titles in which a "go"/"golang" token is actually present.
detectable = [d.text for d in nlp.pipe(all_go_sentences) if has_go_token(d)]

# Titles NOT tagged 'go' that still contain the substring "go". This is a cheap
# pre-filter so the parser only runs on plausible candidates. It is kept under its
# own name so re-running the filtering step below does not feed on its own output.
# na=False / regex=False: missing titles count as "no match", "go" is a literal.
non_go_candidates = (
    df
    .loc[lambda d: ~d['Id'].isin(go_ids)]
    .loc[lambda d: d['Title'].str.lower().str.contains("go", regex=False, na=False), 'Title']
    .tolist()
)

# Titles without the 'go' tag in which a "go"/"golang" token is nevertheless found.
non_detectable = [d.text for d in nlp.pipe(non_go_candidates) if has_go_token(d)]

len(all_go_sentences), len(detectable), len(non_detectable)

(1167, 762, 1345)

In [47]:
# Look up the human-readable description of the "pobj" dependency label.
spacy.explain("pobj")

'object of preposition'

```
python -m spacy download en_core_web_sm
```

In [48]:




# Parse a two-sentence example once and draw its dependency tree.
# (Bare expressions in the middle of a cell are evaluated but never displayed,
# so only the statements that produce something visible are kept here.)
doc = nlp("My name is Kamran. WORD UP MAN!")
displacy.render(doc)

#save the output of the following program to the var "ex1"

# One (text, part-of-speech tag, dependency label) triple per token.
ex1 = [(t.text, t.pos_, t.dep_) for t in doc]

ex1

[('My', 'PRON', 'poss'),
 ('name', 'NOUN', 'nsubj'),
 ('is', 'AUX', 'ROOT'),
 ('Kamran', 'PROPN', 'attr'),
 ('.', 'PUNCT', 'punct'),
 ('WORD', 'PROPN', 'ROOT'),
 ('UP', 'ADP', 'prt'),
 ('MAN', 'PROPN', 'dobj'),
 ('!', 'PUNCT', 'punct')]