## This notebook follows along with (and modifies and tests) the code shown in the book "Blueprints for Text Analytics Using Python"

## Chapter 6

In [1]:
# Re-running Java example for data
import pandas as pd
import numpy as np
# Step 1 - data prep
# Read the CSV, keep only the three columns used below, discard rows with a
# missing value in any of them, and join title and description into a single
# 'text' column. The resulting frame has the columns ['Component', 'text'].
df = (
    pd.read_csv('eclipse_jdt.csv')[['Title', 'Description', 'Component']]
    .dropna()
    .assign(text=lambda rows: rows['Title'] + ' ' + rows['Description'])
    .drop(columns=['Title', 'Description'])
)

# This is the clean function from Chapter 4, retyped here so it can be used in this notebook.
import html
import re

def clean(text):
    """Strip markup and noise from a piece of raw text.

    The steps run in this order: HTML entities are unescaped, ``<...>`` tags
    are blanked out, markdown links are reduced to their link text, remaining
    ``[...]`` groups are blanked out, standalone runs of special characters
    and of hyphen/equals/plus signs are removed, and finally all whitespace
    runs are collapsed to a single space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, without leading or trailing whitespace.
    """
    # convert html escapes like &amp; to characters.
    text = html.unescape(text)
    # tags like <tab>
    text = re.sub(r'<[^<>]*>', ' ', text)
    # markdown URLs like [Some text](https://....)
    text = re.sub(r'\[([^\[\]]*)\]\([^\(\)]*\)', r'\1', text)
    # text or code in brackets like [0]
    text = re.sub(r'\[[^\[\]]*\]', ' ', text)
    # standalone sequences of specials, matches &# but not #cool.
    # The trailing boundary is a lookahead so the whitespace after one token
    # is not consumed and can still serve as the leading boundary of the next
    # one; otherwise "a & # b" would keep the "#".
    text = re.sub(r'(?:^|\s)[&#<>{}\[\]+|\\:-]{1,}(?=\s|$)', ' ', text)
    # standalone sequences of hyphens like --- or == (same lookahead boundary)
    text = re.sub(r'(?:^|\s)[\-=\+]{2,}(?=\s|$)', ' ', text)
    # sequences of white spaces
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Clean every document, then keep only those longer than 50 characters.
df['text'] = df['text'].apply(clean)
df = df[df['text'].str.len() > 50]

# Down-sample to 20% of each component. Every group is sampled with the same
# seed, exactly as groupby(...).apply(pd.DataFrame.sample, ...) did, but the
# groups are iterated explicitly: letting apply() operate on the grouping
# column is deprecated in pandas 2.2+. keys=range(...) reproduces the
# (group number, original row label) MultiIndex that as_index=False produced.
sampled_groups = [
    group.sample(frac=.2, random_state=21)
    for _, group in df.groupby('Component')
]
df = pd.concat(sampled_groups, keys=range(len(sampled_groups)))

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the documents for testing, stratified on the component
# label, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['Component'],
    stratify=df['Component'],
    test_size=0.2,
    random_state=42,
)
print('Size of Training Data, Test Data', len(X_train), len(X_test))

Size of Training Data, Test Data 7208 1803


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over unigrams and bigrams with English stop words removed
# and min_df=10. The vectorizer is fitted on the training documents only.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=10,
)
X_train_tf = tfidf.fit_transform(X_train)

In [4]:
from sklearn.svm import SVC
# Linear-kernel SVM; probability=True is what makes the predict_proba calls
# further down available.
svc = SVC(
    C=1,
    kernel="linear",
    probability=True,
    random_state=42,
)
svc.fit(X_train_tf, y_train)

SVC(C=1, kernel='linear', probability=True, random_state=42)

In [5]:
# Vectorize the held-out documents with the already fitted vocabulary,
# predict their components, and put text, true label and prediction side by
# side in a frame with a fresh 0..n-1 index.
X_test_tf = tfidf.transform(X_test)
y_pred = svc.predict(X_test_tf)
result = pd.DataFrame(
    {
        'text': X_test.values,
        'actual': y_test.values,
        'predicted': y_pred,
    }
)

In [19]:
# Misclassified test documents whose text contains "left". Both conditions are
# combined into a single mask over `result`; indexing the already filtered
# frame with a second full-length mask makes pandas reindex the mask and emit
# "UserWarning: Boolean Series key will be reindexed to match DataFrame index".
result[
    (result["actual"] != result["predicted"])
    & result["text"].str.contains("left")
]

  result[result["actual"] != result["predicted"]][result["text"].str.contains("left")]


Unnamed: 0,text,actual,predicted
201,bracket highlighting should also work if caret...,Text,UI
287,No content assist between left paren and strin...,UI,Text
321,one cant navigate into an interface method wit...,Core,UI
678,gotoMarker does not set source range and does ...,Text,UI
715,Pasting converts tab to spaces Just a minor fl...,UI,Text
734,MalformedTreeException when formatting edited ...,Core,UI
828,Double-click to set a breakpoing results in th...,Debug,Text
1113,Zombie targets left by CommandArgumentTests Ru...,Debug,UI
1149,IndexOutOfBoundException on code complete 3.3M...,Text,UI
1267,Code-assist doesnt properly indent when overri...,UI,Text


In [20]:
# Pick the document at row position 287 of the results and show its full text.
text = result["text"].iloc[287]
print(text)

No content assist between left paren and string literal Build 20020307; ; 1. Create the following cu:; public class A {; void foo(String s) {; }; void bar() {; this.foo();; }; }; 2. Position the cursor as indicated below:; this.foo( );; 3. Crtl-Space; ; You only get a beep.; ; If you remove the string literal; i.e.; this.foo( );; then you get a proposal.


In [28]:
# Class probabilities the model assigns to the single test document at row 287.
sample_proba = svc.predict_proba(X_test_tf[287])
print(sample_proba)

[[0.00251425 0.31384306 0.00241781 0.00082898 0.59485614 0.08553977]]


In [32]:
# Take the class labels from the fitted model instead of hard-coding them:
# the columns of predict_proba follow the order of svc.classes_, so the names
# can never drift out of step with the probability columns.
class_names = svc.classes_.tolist()
prob = svc.predict_proba(X_test_tf)
# new dataframe for explainable results; reset_index() keeps the previous row
# labels in an extra 'index' column
er = result.copy().reset_index()

In [38]:
# Add one column per class holding that class's predicted probability.
for position, label in enumerate(class_names):
    er[label] = prob[:, position]

In [39]:
# Show five reproducibly chosen rows: true label, prediction and the
# per-class probabilities.
shown_columns = ["actual", "predicted", *class_names]
er[shown_columns].sample(n=5, random_state=99)

Unnamed: 0,actual,predicted,APT,Core,Debug,Doc,Text,UI
1064,Core,Core,0.000485,0.98176,0.000663,0.000493,0.003891,0.012708
1317,Core,Text,0.002533,0.222902,0.01349,0.001237,0.635865,0.123973
1246,Core,Core,0.006046,0.696135,0.04191,0.004596,0.024863,0.22645
163,UI,UI,0.00584,0.165487,0.119905,0.003579,0.151754,0.553435
1089,Core,Core,0.001534,0.917918,0.021709,0.000619,0.009508,0.048712
