In [27]:
import spacy
import pandas as pd
from spacy import displacy

In [15]:
# Load the spaCy pipeline named "en_core_web_sm"; `nlp` is what later cells call
# (nlp(...) / nlp.pipe(...)) to turn titles into parsed docs.
nlp = spacy.load("en_core_web_sm")

In [5]:
# Read the first 1,000,000 question rows, keeping only the Id and Title columns.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
# NOTE(review): a title printed further down shows mis-decoded quote characters, so the
# file may not really be ISO-8859-1 — confirm the encoding.
df = pd.read_csv(
    '/home/iamauser/Downloads/stack_overflow_ds/archive/Questions.csv',
    usecols=['Title', 'Id'],
    nrows=1000000,
    encoding='ISO-8859-1',
)
# Plain Python list of every title, in row order.
titles = df['Title'].tolist()

In [6]:
# Level 1: naive substring check for "go"

def has_golang(text):
    """Return whether the lowercase substring ``"go"`` occurs anywhere in ``text``.

    The check is case-sensitive and not word-aware: "got" and "good" match,
    while a capitalised "Go" does not.
    """
    contains_go = "go" in text
    return contains_go

# Lazy generator: yields, in order, each title for which has_golang() returns True.
# Nothing is evaluated until the generator is advanced.
g  = (title for title in titles if has_golang(title))

In [7]:
# Peek at the next (up to) four matches from the generator `g`.
# zip(range(4), g) stops once four items are taken — or earlier if `g` runs dry —
# whereas a bare next(g) would raise StopIteration when fewer than four remain.
# range() is listed first so no extra item is pulled from `g` after the fourth.
[title for _, title in zip(range(4), g)]

['My website got hacked... What should I do?',
 "DVCS Choices - What's good for Windows?",
 'Is a "Confirm Email" input good practice when user changes email address?',
 'Any good advice on using emacs for C++ project?']

### Shortcomings
1. It matches every occurrence of the substring `go` (e.g. "got", "good"), and most of these hits are verbs or other words unrelated to the Go language.

### Steps to overcome
0. Collect every title that contains `go`.
1. Run those titles through the NLP model provided by spaCy and discard the hits where `go` is a verb.

In [22]:
# Cheap pre-filter before the (slower) spaCy pass: keep titles whose lowercased
# text contains the literal substring "go".
# na=False makes missing titles count as non-matches, so the boolean mask never
# contains NaN (which would make the indexing raise); regex=False because 'go'
# is a plain literal, not a pattern.
go_mask = df['Title'].str.lower().str.contains('go', regex=False, na=False)
titles_gen1 = df.loc[go_mask, 'Title'].tolist()

In [44]:
%%time
def has_go_not_verb(doc):
    """Return True if ``doc`` has a "go"/"golang" token that looks like the language.

    A token qualifies when its lowercase form is "go" or "golang", its
    part-of-speech tag is not "VERB", and its dependency label is "pobj".

    Args:
        doc: iterable of tokens exposing ``lower_``, ``pos_`` and ``dep_``.

    Returns:
        bool: True on the first qualifying token, False if there is none.
    """
    return any(
        tok.lower_ in ("go", "golang")
        and tok.pos_ != "VERB"
        and tok.dep_ == "pobj"
        for tok in doc
    )

# Stream the pre-filtered titles through spaCy and keep only the docs that
# has_go_not_verb() accepts. Evaluation is lazy.
g = (doc for doc in nlp.pipe(titles_gen1) if has_go_not_verb(doc))
# Show up to ten matches. zip(range(10), g) ends cleanly if fewer than ten exist,
# where a bare next(g) would raise StopIteration; range() comes first so no extra
# doc is consumed from `g` after the tenth.
[doc for _, doc in zip(range(10), g)]


CPU times: user 3.39 s, sys: 370 µs, total: 3.39 s
Wall time: 3.39 s


[Embedding instead of inheritance in Go,
 Shared library in Go?,
 multi package makefile example for go,
 What's the point of having pointers in Go?,
 Simulate a tcp connection in Go,
 Trouble reading from a socket in go,
 What's the simplest way to edit conflicted files in one go when using git and an editor like Vim or textmate?,
 Convert string to integer type in Go?,
 Is there any automated conversion from Go to Python?,
 Implementing the âdeferâ statement from Go in Objective-C?]

In [36]:
# Render spaCy's displaCy visualisation of one parsed title in which "GO" refers to
# a button rather than the language, to inspect how its tokens are tagged.
displacy.render(nlp('How to Create a Dropdown List Hyperlink without the GO button?'))

In [58]:
# dep_ == "pobj": look up spaCy's description of that dependency label.
# NOTE(review): this call is not the last expression in the cell and its result is
# neither assigned nor printed, so nothing is displayed — move it to its own cell
# if the explanation should be visible.
spacy.explain('pobj')
def has_go_token(doc):
    """Return True if any token in ``doc`` is exactly "go" or "golang" (case-insensitive).

    Only the token's ``lower_`` attribute is inspected; part of speech and
    dependency label are ignored.
    """
    return any(token.lower_ in ('go', 'golang') for token in doc)

In [50]:
# Load the tag table that accompanies the questions; later cells use its 'Id' and
# 'Tag' columns to find which question ids carry the `go` tag.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df_tags = pd.read_csv('/home/iamauser/Downloads/stack_overflow_ds/archive/Tags.csv')

In [51]:
# Sanity check: list the columns of both frames to confirm they share an 'Id' column.
print('Tags',df_tags.columns)
print('Title', df.columns)

Tags Index(['Id', 'Tag'], dtype='object')
Title Index(['Id', 'Title'], dtype='object')


In [59]:
%%time
# Ids of every question that carries the `go` tag.
go_ids = df_tags.loc[df_tags['Tag'] == 'go', 'Id']
# Titles of those go-tagged questions (the ground-truth positives).
all_go_sentence_array = df.loc[df['Id'].isin(go_ids), 'Title'].tolist()
# Subset of the positives whose text actually contains a "go"/"golang" token,
# i.e. the ones a token-based rule has any chance of finding.
detectable = [doc.text for doc in nlp.pipe(all_go_sentence_array) if has_go_token(doc)]


CPU times: user 1.13 s, sys: 7.97 ms, total: 1.14 s
Wall time: 1.14 s


In [61]:
# Candidate negatives: titles of questions NOT tagged `go` whose lowercased text
# still contains the literal substring "go".
# na=False keeps missing titles out of the mask (a NaN in the mask would make the
# .loc selection raise); regex=False because "go" is a plain literal.
non_go_candidates = (df
                .loc[lambda d: ~d['Id'].isin(go_ids)]
                .loc[lambda d: d['Title'].str.lower().str.contains("go", regex=False, na=False)]
                ['Title']
                .tolist())
# Final negatives: candidates that contain "go"/"golang" as a whole token.
# A separate name is used for the raw candidates so `non_detectable` only ever
# holds the token-filtered list.
non_detectable = [d.text for d in nlp.pipe(non_go_candidates) if has_go_token(d)]

In [65]:
# Report the size of each evaluation set: all go-tagged titles, the go-tagged titles
# containing a go token, and the non-go titles containing a go token.
summary_template = (
    "All go related titles:%s\n"
    "All go related titles that has word 'go':%s\n"
    "All the non go related titles that has go in them:%s"
)
print(summary_template % (len(all_go_sentence_array), len(detectable), len(non_detectable)))

All go related titles:1167
All go related titles that has word 'go':762
All the non go related titles that has go in them:1345


In [78]:
def has_go_lang_2(doc):
    """Return True if ``doc`` has a "go"/"golang" token that is not tagged as a verb.

    Unlike a stricter rule that also requires a particular dependency label,
    this version only checks the lowercase form and the part-of-speech tag.

    Args:
        doc: iterable of tokens exposing ``lower_`` and ``pos_``.

    Returns:
        bool: True on the first qualifying token, False if there is none.
    """
    return any(
        tok.lower_ in ("go", "golang") and tok.pos_ != "VERB"
        for tok in doc
    )


In [80]:
# Count how many titles the rule flags in each set: flags inside `detectable`
# are correct hits, flags inside `non_detectable` are wrong hits.
correct = sum(1 for doc in nlp.pipe(detectable) if has_go_lang_2(doc))
wrong = sum(1 for doc in nlp.pipe(non_detectable) if has_go_lang_2(doc))

In [77]:
### benchmarking
# NOTE(review): this cell's execution count (77) is lower than that of the cells
# that define has_go_lang_2 and compute correct/wrong (78, 80), so its recorded
# output was presumably produced with earlier values of `correct`/`wrong`; a
# top-to-bottom re-run would not reproduce those numbers here.
n_positive = len(detectable)        # titles that should be flagged
n_negative = len(non_detectable)    # titles that should not be flagged
# precision: share of flagged titles that were right.
precision = correct / (correct + wrong)
# recall: share of `detectable` titles that were flagged (go-tagged titles with no
# go token are not in the denominator).
recall = correct / n_positive
# accuracy: right decisions (hits + correctly ignored negatives) over all titles.
accuracy = (correct + n_negative - wrong) / (n_positive + n_negative)
print('accuracy:%s, recall:%s, precision:%s' % (accuracy, recall, precision))


accuracy:0.7593735168485999, recall:0.3543307086614173, precision:0.9473684210526315


In [81]:
# Metrics for the rule evaluated into `correct` / `wrong`.
n_positive = len(detectable)        # titles that should be flagged
n_negative = len(non_detectable)    # titles that should not be flagged
# precision: share of flagged titles that were right.
precision = correct / (correct + wrong)
# recall: share of `detectable` titles that were flagged.
recall = correct / n_positive
# accuracy: right decisions (hits + correctly ignored negatives) over all titles.
accuracy = (correct + n_negative - wrong) / (n_positive + n_negative)
print('accuracy:%s, recall:%s, precision:%s' % (accuracy, recall, precision))


accuracy:0.8713811105837684, recall:0.7139107611548556, precision:0.9112227805695142
