#           Recognizing a programming Language as an entity using spacy:

## 1. First steps with spacy :

####  1.1. Importing the necessary libraries :



In [112]:
import pandas as pd
import spacy 
from spacy import displacy
from spacy.matcher import Matcher
from IPython.display import HTML as html_print
from sklearn.metrics import confusion_matrix, classification_report

####   1.2. Loading the data :

 * We are not going to use all the dataset we are using only 1000000 rows and two features which are the Title and the ID :

In [38]:
df = (pd.read_csv("C:/Users/khaoula/Desktop/spacy-project/Questions.csv", nrows=1_000_000, 
                  encoding="ISO-8859-1", usecols=['Title', 'Id']))
titles = [_ for _ in df['Title']]

####  1.3. Working without spacy :

* We are going to try to return all phrases containing the word go as a programming language:

In [39]:
def has_golang(text):
    return "go" in text

g = (title for title in titles if has_golang(title))
[next(g) for i in range(2)]

['My website got hacked... What should I do?',
 "DVCS Choices - What's good for Windows?"]

* The code below returned all phrases containing words containing go like got and good,and that's not what we wanted.That's why we are going to use spacy now.

#### 1.4. Working with spacy :

* First we are starting by loading the English language model :

In [40]:
nlp = spacy.load("en_core_web_sm")

Let's first see the difference between working with spacy and without spacy.

In [41]:
[t for t in nlp("My name is khaoula benali.")]

[My, name, is, khaoula, benali, .]

* The difference now is that every word in the phrase became an isolated member (token) of a set so now we can work on each word aside very easily.

In [42]:
doc = nlp("My name is khaoula benali.")

In [43]:
t = doc[3]
print(t)

khaoula


In [44]:
type(t)

spacy.tokens.token.Token

In [45]:
displacy.render(doc)

* As you can see it is able to detect at most of the times the nature of each word: noun,verb..

In [46]:
for t in nlp("Where does Console.WriteLine go in ASP.NET?"):
    print(t, t.pos_, t.dep_)

Where ADV advmod
does VERB ROOT
Console PROPN nsubj
. PUNCT punct
WriteLine PROPN nsubj
go VERB ROOT
in ADP prep
ASP.NET PROPN pobj
? PUNCT punct


#### 1.5. Detecting the word Go/GoLang (Programming Language) :

* Now we are going to work with a bigger number of rows to detect the programming language go but we are going to use only the feature title to go faster.

In [47]:
df = (pd.read_csv("C:/Users/khaoula/Desktop/spacy-project/Questions.csv", nrows=2_000_000, 
                  encoding="ISO-8859-1", usecols=['Title', 'Id']))

titles = [_ for _ in df.loc[lambda d: d['Title'].str.lower().str.contains("go")]['Title']]

* We are going to disable en_core_web_sm ner also to make the model faster :

In [48]:
nlp = spacy.load("en_core_web_sm", disable=["ner"])

* We are going to search fo the words go or golang that are nouns in all the titles and we will use also nlp.pipe for a better performance :

In [49]:
%time
def has_golang(doc):
    for t in doc:
        if t.lower_ in ["go", "golang"]:
            if t.pos_ == "NOUN":
                return True 
    return False

g = (doc for doc in nlp.pipe(titles) if has_golang(doc))
[next(g) for i in range(30)]

Wall time: 0 ns


[Removing all event handlers in one go,
 multi package makefile example for go,
 Trouble reading from a socket in go,
 SOAPUI & Groovy Scripts, executing multiple SQL statements in one go,
 What's the simplest way to edit conflicted files in one go when using git and an editor like Vim or textmate?,
 what's the state of go language IDE support?,
 making generic algorithms in go,
 How do I allocate memory for an array in the go programming language?,
 What's wrong with the following go code that I receive 'all goroutines are asleep - deadlock!',
 Pass variables into Thread on the go?,
 If two options are the same go to next option in second select,
 C macro define many variables in one go,
 How to use LevelDB in go?,
 Change EOL on multiple files in one go,
 Getting CPU usage with golang,
 Auto-complete with go-mode,
 How do I use an custom int type with range in go?,
 How to generate IObservable index delta on the go?,
 Pylint ignores disable-msg - but only dependent on other modules l

#### 1.6. Benchmarking:

* Loading the Tags Data and stocking the IDs of rows having go as a Tag:

In [50]:
df_tags = pd.read_csv("C:/Users/khaoula/Desktop/spacy-project/Tags.csv")
go_ids = df_tags.loc[lambda d: d['Tag'] == 'go']['Id']

* preparing the detecting function :

In [51]:
def has_go_token(doc):
    for t in doc:
        if t.lower_ in ['go', 'golang']:
            if t.pos_ != 'VERB':
                return True
    return False

* We are not going to detect phrases containing go as a token we are only going to detect phrases that contain go as a Tag and it's not a verb at the same time:

In [52]:
all_go_sentences = df.loc[lambda d: d['Id'].isin(go_ids)]['Title'].tolist()
detectable = [d.text for d in nlp.pipe(all_go_sentences) if has_go_token(d)]

non_detectable = (df
                  .loc[lambda d: ~d['Id'].isin(go_ids)]
                  .loc[lambda d: d['Title'].str.lower().str.contains("go")]
                  ['Title']
                  .tolist())

non_detectable = [d.text for d in nlp.pipe(non_detectable) if has_go_token(d)]

len(all_go_sentences), len(detectable), len(non_detectable)

(1858, 874, 73)

* Now we are going to calculate precision ,recall and accuracy to see how well the function performs:

In [53]:
method = "not-verb-but-pobj"

correct = sum(has_go_token(doc) for doc in model.pipe(detectable))
wrong = sum(has_go_token(doc) for doc in model.pipe(non_detectable))
precision = correct/(correct + wrong)
recall = correct/len(detectable)
accuracy = (correct + len(non_detectable) - wrong)/(len(detectable) + len(non_detectable))

f"{precision},{recall},{accuracy},{model_name},{method}" # this is logged

'0.9229144667370645,1.0,0.9229144667370645,en_core_web_sm,not-verb-but-pobj'

## 2. Detecting programming languages (Rule Based Matching) :

In [78]:
doc = nlp("i am an iOS dev and I like to code in objective-c")

In [79]:
[t for t in doc]

[i, am, an, iOS, dev, and, I, like, to, code, in, objective, -, c]

* Objective-c is detected as 3 separate tokens because of '-' between objective and c.That's why we are going to use matchers:

####  2.1. Detecting Programming languages using  a matcher :  

In [80]:
obj_c_pattern1 = [{'LOWER': 'objective'},
                  {'IS_PUNCT': True, 'OP': '?'},
                  {'LOWER': 'c'}]

obj_c_pattern2 = [{'LOWER': 'objectivec'}]

golang_pattern1 = [{'LOWER': 'golang'}] 
golang_pattern2 = [{'LOWER': 'go', 
                    'POS': {'NOT_IN': ['VERB']}}]

python_pattern = [{'LOWER': 'python'}]
ruby_pattern   = [{'LOWER': 'ruby'}]
js_pattern     = [{'LOWER': {'IN': ['js', 'javascript']}}]

In [83]:
matcher = Matcher(nlp.vocab)

matcher.add('GO_LANG', [golang_pattern1, golang_pattern2])
matcher.add("OBJ_C_LANG", [ obj_c_pattern1, obj_c_pattern2])
matcher.add("PYTHON_LANG",  [python_pattern])
matcher.add("JS_LANG",  [js_pattern])
matcher.add("RUBY_LANG",[ruby_pattern])

* And now let's test and see the results :

In [87]:
doc = nlp("I am an iOS dev who codes in both python, go/golang as well as objective-c")
for match_id, start, end in matcher(doc):
    print(doc[start: end])

python
golang
objective-c


In [88]:
doc = nlp("I've done some js and ruby and go programming")
for match_id, start, end in matcher(doc):
    print(doc[start: end])

js
ruby


In [89]:
[(t,t.pos_) for t in doc]

[(I, 'PRON'),
 ('ve, 'AUX'),
 (done, 'VERB'),
 (some, 'DET'),
 (js, 'ADJ'),
 (and, 'CCONJ'),
 (ruby, 'NOUN'),
 (and, 'CCONJ'),
 (go, 'VERB'),
 (programming, 'VERB')]

* Now we are detecting most languages but we are not detecting go because it's recognized as a verb .

####  2.1. Benchmarking :

In [90]:
titles = (_ for _ in df['Title'] if "python" in _.lower())

In [91]:
for i in range(200):
    doc = nlp(next(titles))
    if len(matcher(doc)) == 0:
        print(doc)

mod_python/MySQL error on INSERT with a lot of data: "OperationalError: (2006, 'MySQL server has gone away')"
Running subversion under apache and mod_python
What's the best way to embed IronPython inside my C# App?
How to set the PYTHONPATH in Emacs?
wxPython wxDC object from win32gui.GetDC
Need skeleton code to call Excel VBA from PythonWin
Questions for python->scheme conversion
wxPython and sharing objects between windows
Django on IronPython
IronPython Webframework
A SuggestBox for wxPython?
Intercepting Method Access on the Host Program of IronPython
Is there anything like IPython / IRB for Perl?


## 3. Evaluating our approach :

####  3.2. Evaluating :

* Let's highlight the detected tokens  using IPython.display :

In [93]:
def style(s, bold=False):
    blob = f"<text>{s}</text>"
    if bold:
        blob = f"<b style='background-color: #fff59d'>{blob}</b>"
    return blob

def html_generator(g, n=10):
    blob = ""
    for i in range(n):
        doc = next(g)

        state = [[t, False] for t in doc]
        for idx, start, end in matcher(doc):
            for i in range(start, end):
                state[i][1] = True
        blob += style(' '.join([style(str(t[0]), bold=t[1]) for t in state]) + '<br>') 
    return blob

In [95]:
titles = (_ for _ in df['Title'])
g = (d for d in nlp.pipe(titles) if len(matcher(d))>0)
html_print(html_generator(g, n=10))

* Now we can see clearly what we heve detected but the problem is there still a lot of languages that are not detected just like c#.That's why we are going to make a bigger number of patterns in order to pick up all programming languages:

In [96]:
obj_c_pattern1 = [{'LOWER': 'objective'},
                  {'IS_PUNCT': True, 'OP': '?'},
                  {'LOWER': 'c'}]
obj_c_pattern2 = [{'LOWER': 'objectivec'}]

csharp_pattern1 = [{'LOWER': 'c'}, {'LOWER': '#'}]
csharp_pattern2 = [{'LOWER': 'c'}, {'LOWER': 'sharp'}]
csharp_pattern3 = [{'LOWER': 'c#'}]

fsharp_pattern1 = [{'LOWER': 'f'}, {'LOWER': '#'}]
fsharp_pattern2 = [{'LOWER': 'f'}, {'LOWER': 'sharp'}]
fsharp_pattern3 = [{'LOWER': 'f#'}]
 
dot_net_pattern = [{'LOWER': '.net'}]

php_pattern = [{'LOWER': 'php'}]

asp_net_pattern = [{'LOWER': {'IN': ['asp.net', 'asp']}}]

python_pattern = [{'LOWER': 'python'}]

lisp_pattern1  = [{'LOWER': 'lisp'}]
lisp_pattern2  = [{'LOWER': 'common'}, {'LOWER': 'lisp'}]

go_pattern1    = [{'LOWER': 'go', 'POS': {'NOT_IN': ['VERB']}}]
go_pattern2    = [{'LOWER': 'golang'}]

ruby_pattern   = [{'LOWER': 'ruby'}]

sql_pattern    = [{'LOWER': 'sql'}]

matlab_pattern = [{'LOWER': 'matlab'}]

perl_pattern   = [{'LOWER': 'perl'}]

html_pattern   = [{'LOWER': 'html'}]

css_pattern   = [{'LOWER': 'css'}]

js_pattern     = [{'LOWER': {'IN': ['js', 'javascript']}}]

java_pattern   = [{'LOWER': 'java'}]

c_pattern      = [{'LOWER': 'c'}]

cpp_pattern    = [{'LOWER': 'c++'}]

In [97]:
matcher = Matcher(nlp.vocab, validate=True)
matcher.add("OBJ_C_LANG",[obj_c_pattern1, obj_c_pattern2])
matcher.add("PYTHON_LANG", [python_pattern])
matcher.add("GO_LANG", [go_pattern1, go_pattern2])
matcher.add("CSHARP_LANG",[csharp_pattern1, csharp_pattern2, csharp_pattern3])
matcher.add("FSHARP_LANG", [fsharp_pattern1, fsharp_pattern2, fsharp_pattern3])
matcher.add("JS_LANG",[js_pattern])
matcher.add("JAVA_LANG",[java_pattern])
matcher.add("RUBY_LANG",[ruby_pattern])
matcher.add("SQL_LANG", [sql_pattern])
matcher.add("C_LANG",[c_pattern])
matcher.add("CPP_LANG", [cpp_pattern])
matcher.add("PHP_LANG", [php_pattern])
matcher.add("MATLAB_LANG", [matlab_pattern])
matcher.add("PERL_LANG", [perl_pattern])
matcher.add("LISP_LANG", [lisp_pattern1, lisp_pattern2])
matcher.add("HTML_LANG", [html_pattern])
matcher.add("CSS_LANG", [css_pattern])

In [100]:
titles = (_ for _ in df['Title'][:2000])
sum(1 for d in nlp.pipe(titles) if len(matcher(d))>0)
html_print(html_generator(g, n=10))

####  3.2. Language Version :

* Let's make a function to deal with versions. It creates many patterns out of a single pattern:

In [101]:
def create_versioned(name):
    return [
        [{'LOWER': name}], 
        [{'LOWER': {'REGEX': f'({name}\d+\.?\d*.?\d*)'}}], 
        [{'LOWER': name}, {'TEXT': {'REGEX': '(\d+\.?\d*.?\d*)'}}],
    ]

create_versioned('python')

[[{'LOWER': 'python'}],
 [{'LOWER': {'REGEX': '(python\\d+\\.?\\d*.?\\d*)'}}],
 [{'LOWER': 'python'}, {'TEXT': {'REGEX': '(\\d+\\.?\\d*.?\\d*)'}}]]

In [103]:
matcher = Matcher(nlp.vocab, validate=True)
matcher.add("PYTHON_LANG",  [*create_versioned('python')])
g = nlp.pipe(["i use python, python3.7, python 3.6.6", 
              "also python3, python 2 and python3.2.1", 
              "not bypython, pythonn and py36"])
html_print(html_generator(g, n=3))