In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
plt.style.use('ggplot')
%matplotlib inline

In [2]:
with open('Manual WorkItems alltypes Security.csv', encoding='ascii', errors='ignore') as infile:
    dataset1 = pd.read_csv(infile,names = ['Title', 'Security', 'SourceLink'])
with open('Manual WorkItems alltypes nonSecurity.csv', encoding='ascii', errors='ignore') as infile:
    dataset2 = pd.read_csv(infile,names = ['Title', 'Security', 'SourceLink'])

In [3]:
dataset1 = dataset1.sample(frac = 0.45, random_state= 123).reset_index(drop = True)
dataset1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63262 entries, 0 to 63261
Data columns (total 3 columns):
Title         63261 non-null object
Security      63262 non-null object
SourceLink    63262 non-null object
dtypes: object(3)
memory usage: 1.4+ MB


In [4]:
dataset1['IsSecured'] = 1
dataset2['IsSecured'] = 0

In [5]:
dataset = pd.concat([dataset1, dataset2]).reset_index(drop = True)
dataset.head()

Unnamed: 0,Title,Security,SourceLink,IsSecured
0,[WDGISScan] Perf.Service.PASAPI 'pasdb.westus....,MS.Security,https://microsoft.visualstudio.com/DefaultColl...,1
1,[Studio]Form shouldn't load another datasource...,MS.Security,https://msazure.visualstudio.com/DefaultCollec...,1
2,Glusterfs should only allow certain ip range t...,MS.Security,https://msasg.visualstudio.com/DefaultCollecti...,1
3,[dev15->devmain] Fuzzing: Mac Word: doc: Proba...,MS.Security,http://bugcheck/bugs/OfficeMain/75542,1
4,(Port to wacserver 16) MSRC 32783. Word Crash ...,MS.Security,https://office.visualstudio.com/DefaultCollect...,1


In [6]:
dataset = dataset.sample(frac = 1).reset_index(drop = True)
dataset.head()

Unnamed: 0,Title,Security,SourceLink,IsSecured
0,"[ComboBox] The string ""Find Items"" of ComboBox...",MS.Security,https://msazure.visualstudio.com/DefaultCollec...,1
1,TVS: Warning 6262 in file: Xbox\base\net\core\...,MS.Security,https://microsoft.visualstudio.com/DefaultColl...,1
2,Investigate increase in S2S token duration,MS.Security,https://identitydivision.visualstudio.com/Defa...,1
3,[AWS] - Azure Email Service client certificate...,MS.Security,https://identitydivision.visualstudio.com/Defa...,1
4,[Screen Reader - Key Vault -Create] Narrator d...,MS.Accessibility,https://msazure.visualstudio.com/DefaultCollec...,0


In [7]:
dataset.dropna(axis=0, how='any', inplace = True)

In [8]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 122058 entries, 0 to 122059
Data columns (total 4 columns):
Title         122058 non-null object
Security      122058 non-null object
SourceLink    122058 non-null object
IsSecured     122058 non-null int64
dtypes: int64(1), object(3)
memory usage: 4.7+ MB


In [9]:
from nltk.corpus import stopwords
my_stopwords = stopwords.words('english')
type(my_stopwords)

list

In [10]:
from patsy import dmatrices
import statsmodels.discrete.discrete_model as sm1
from statsmodels.formula.api import logit, ols, poisson, probit
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
import os, sys
def splitall(path):
    allparts = []
    while 1:
        parts = os.path.split(path)
        if parts[0] == path:  # sentinel for absolute paths
            allparts.insert(0, parts[0])
            break
        elif parts[1] == path: # sentinel for relative paths
            allparts.insert(0, parts[1])
            break
        else:
            path = parts[0]
            allparts.insert(0, parts[1])
    return " ".join(allparts)

In [12]:
#I will now apply the splitall function on the dataset.
dataset['Parsed Title'] = dataset['Title'].apply(splitall)
#dataset.head(100)

In [13]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
def clean_text(text):
    text = "".join([char if char.isalpha() else " " for char in text.lower()]).split()
    text = [ps.stem(word) for word in text]
    text = " ".join([word for word in text if (word not in my_stopwords and len(word) > 2)])
    return text.lower()

In [14]:
%time dataset['Clean Title'] = dataset['Parsed Title'].apply(clean_text)

Wall time: 42.5 s


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectPercentile, f_classif
X, y = dataset['Clean Title'], dataset['IsSecured']
tv = TfidfVectorizer(min_df=0.001, max_df= 1.0, use_idf=True,stop_words = 'english' )
#tv = TfidfVectorizer(sublinear_tf = True, max_df = 0.5, stop_words = 'english')
#selector = SelectPercentile(f_classif, percentile = 10)
#selector.fit(tv.fit_transform(X))
X_train, X1, y_train, y1 = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X1, y1, test_size=0.5, random_state=42)
X_train = tv.fit_transform(X_train)#.toarray()
X_test = tv.transform(X_test)#.toarray()
X_val = tv.transform(X_val)#.toarray()
vocab = tv.get_feature_names()
# y_test = np.array(list(y_test))
# y_train = np.array(list(y_train))
# y_val = np.array(list(y_val))
print(len(vocab))


1105


In [16]:
X2 = pd.DataFrame(X_train.toarray(), columns = vocab)
X2["Intercept"] = 1.0

X3 = pd.DataFrame(X_val.toarray(), columns = vocab)
X3["Intercept"] = 1.0

X4 = pd.DataFrame(X_test.toarray(), columns = vocab)
X4["Intercept"] = 1.0

vocab = vocab +['Intercept']

In [17]:
X2.head()

Unnamed: 0,aad,abil,abl,acc,accept,access,account,aci,action,activ,...,write,wrong,xbox,xml,xss,year,zero,zone,zoom,Intercept
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [18]:
X2["IsSecured"] = list(y_train)

In [19]:
from pandas.core import datetools
import statsmodels.api as sm

  """Entry point for launching an IPython kernel.


In [20]:
%time logit = sm.Logit(X2['IsSecured'], X2[vocab])

Wall time: 26.2 s


In [29]:
# fit the model
%time result = logit.fit(method='bfgs',maxiter=500)

Optimization terminated successfully.
         Current function value: 0.182705
         Iterations: 310
         Function evaluations: 312
         Gradient evaluations: 312
Wall time: 1min 45s


In [30]:
# cool enough to deserve it's own gist
result.summary2()

0,1,2,3
Model:,Logit,Pseudo R-squared:,0.736
Dependent Variable:,IsSecured,AIC:,28972.4179
Date:,2018-04-11 12:39,BIC:,39149.183
No. Observations:,73234,Log-Likelihood:,-13380.0
Df Model:,1105,LL-Null:,-50709.0
Df Residuals:,72128,Scale:,1.0
Converged:,1.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
aad,-0.3460,0.5518,-0.6271,0.5306,-1.4276,0.7355
abil,0.0882,0.6101,0.1446,0.8850,-1.1076,1.2840
abl,0.4301,0.4736,0.9083,0.3637,-0.4980,1.3583
acc,-6.8093,1.5686,-4.3411,0.0000,-9.8836,-3.7350
accept,-1.3926,0.5843,-2.3833,0.0172,-2.5379,-0.2474
access,-7.4263,0.2475,-30.0015,0.0000,-7.9115,-6.9412
account,1.2000,0.3072,3.9066,0.0001,0.5980,1.8020
aci,2.6631,1.3844,1.9237,0.0544,-0.0503,5.3765
action,-1.1635,0.4163,-2.7949,0.0052,-1.9794,-0.3476


In [31]:
#from pandas_ml import ConfusionMatrix
from sklearn.metrics import confusion_matrix

predict=result.predict(X3)

Y_val_predict=[]
for i in predict:
    if(i>=0.5):
        Y_val_predict.append(1)
    else:
        Y_val_predict.append(0)

conf=confusion_matrix(y_val,Y_val_predict)
#plt.imshow(conf, cmap='binary', interpolation='None')
print(conf)
#plt.show()

[[10875  1013]
 [  816 11708]]


In [34]:
print("odds ratio's")
print(np.exp(result.params))

odds ratio's
aad          7.074788e-01
abil         1.092232e+00
abl          1.537433e+00
acc          1.103442e-03
accept       2.484175e-01
access       5.953769e-04
account      3.320117e+00
aci          1.434086e+01
action       3.123950e-01
activ        3.013900e-01
actual       8.279313e-01
adapt        6.927891e-02
add          8.065923e-01
addit        1.626283e-01
address      1.253707e+00
adjust       2.696119e+00
admin        2.316947e+00
administr    4.934557e+00
adminport    1.613633e+05
advanc       4.742200e+00
afd          3.596198e+04
aft          2.600459e+04
agent        4.518969e-01
aip          9.989775e-01
album        1.915147e-02
alert        1.747724e+00
align        1.057869e+00
allow        5.116225e+00
alreadi      3.246576e+00
alt          2.940691e-04
                 ...     
way          1.233092e-01
wdg          5.131787e-01
wdgisscan    5.150152e+03
weak         1.244489e+03
web          3.912756e-01
webauth      6.555027e+07
websit       8.474559e-02

In [37]:
str("X")+str(range(0,10))

'Xrange(0, 10)'