In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
plt.style.use('ggplot')
%matplotlib inline

In [2]:
with open('Manual WorkItems alltypes Security.csv', encoding='ascii', errors='ignore') as infile:
    dataset1 = pd.read_csv(infile,names = ['Title', 'Security', 'SourceLink'])
with open('Manual WorkItems alltypes nonSecurity.csv', encoding='ascii', errors='ignore') as infile:
    dataset2 = pd.read_csv(infile,names = ['Title', 'Security', 'SourceLink'])

In [3]:
dataset1 = dataset1.sample(frac = 0.45, random_state= 123).reset_index(drop = True)
dataset1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63262 entries, 0 to 63261
Data columns (total 3 columns):
Title         63261 non-null object
Security      63262 non-null object
SourceLink    63262 non-null object
dtypes: object(3)
memory usage: 1.4+ MB


In [4]:
dataset1['IsSecured'] = 1
dataset2['IsSecured'] = 0

In [5]:
dataset = pd.concat([dataset1, dataset2]).reset_index(drop = True)
dataset.head()

Unnamed: 0,Title,Security,SourceLink,IsSecured
0,[WDGISScan] Perf.Service.PASAPI 'pasdb.westus....,MS.Security,https://microsoft.visualstudio.com/DefaultColl...,1
1,[Studio]Form shouldn't load another datasource...,MS.Security,https://msazure.visualstudio.com/DefaultCollec...,1
2,Glusterfs should only allow certain ip range t...,MS.Security,https://msasg.visualstudio.com/DefaultCollecti...,1
3,[dev15->devmain] Fuzzing: Mac Word: doc: Proba...,MS.Security,http://bugcheck/bugs/OfficeMain/75542,1
4,(Port to wacserver 16) MSRC 32783. Word Crash ...,MS.Security,https://office.visualstudio.com/DefaultCollect...,1


In [6]:
dataset = dataset.sample(frac = 1).reset_index(drop = True)
dataset.head()

Unnamed: 0,Title,Security,SourceLink,IsSecured
0,"[Footer] Too many links for min-width, footer ...",MS.Security,https://msasg.visualstudio.com/DefaultCollecti...,1
1,Source Depot: the Protect table needs to be cl...,MS.Security,http://sqlbuvsts01:8080/Main/SQL Server/_worki...,1
2,DCT: GTE to Baltimore Cert rollover,MS.Security,https://identitydivision.visualstudio.com/Defa...,1
3,Stage 22: D365 ENT Apps-Sales [Param Kahlon],MS.Privacy,https://msazure.visualstudio.com/DefaultCollec...,0
4,[Accessibility - Screen Reader - Access Policy...,MS.Accessibility,https://msazure.visualstudio.com/DefaultCollec...,0


In [7]:
dataset.dropna(axis=0, how='any', inplace = True)

In [8]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 122058 entries, 0 to 122059
Data columns (total 4 columns):
Title         122058 non-null object
Security      122058 non-null object
SourceLink    122058 non-null object
IsSecured     122058 non-null int64
dtypes: int64(1), object(3)
memory usage: 4.7+ MB


In [9]:
from nltk.corpus import stopwords
my_stopwords = stopwords.words('english')
type(my_stopwords)

list

In [10]:
from patsy import dmatrices
import statsmodels.discrete.discrete_model as sm1
from statsmodels.formula.api import logit, ols, poisson, probit
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
import os, sys
def splitall(path):
    allparts = []
    while 1:
        parts = os.path.split(path)
        if parts[0] == path:  # sentinel for absolute paths
            allparts.insert(0, parts[0])
            break
        elif parts[1] == path: # sentinel for relative paths
            allparts.insert(0, parts[1])
            break
        else:
            path = parts[0]
            allparts.insert(0, parts[1])
    return " ".join(allparts)

In [12]:
#I will now apply the splitall function on the dataset.
dataset['Parsed Title'] = dataset['Title'].apply(splitall)
#dataset.head(100)

In [13]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
def clean_text(text):
    text = "".join([char if char.isalpha() else " " for char in text.lower()]).split()
    text = [ps.stem(word) for word in text]
    text = " ".join([word for word in text if (word not in my_stopwords and len(word) > 2)])
    return text.lower()

In [14]:
%time dataset['Clean Title'] = dataset['Parsed Title'].apply(clean_text)

Wall time: 40.1 s


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectPercentile, f_classif
X, y = dataset['Clean Title'], dataset['IsSecured']
tv = TfidfVectorizer(min_df=0.001, max_df= 1.0, use_idf=True,stop_words = 'english' )
#tv = TfidfVectorizer(sublinear_tf = True, max_df = 0.5, stop_words = 'english')
#selector = SelectPercentile(f_classif, percentile = 10)
#selector.fit(tv.fit_transform(X))
X_train, X1, y_train, y1 = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X1, y1, test_size=0.5, random_state=42)
X_train = tv.fit_transform(X_train)#.toarray()
X_test = tv.transform(X_test)#.toarray()
X_val = tv.transform(X_val)#.toarray()
vocab = tv.get_feature_names()
# y_test = np.array(list(y_test))
# y_train = np.array(list(y_train))
# y_val = np.array(list(y_val))
print(len(vocab))


1098


In [16]:
X2 = pd.DataFrame(X_train.toarray(), columns = vocab)
X2["Intercept"] = 1.0

X3 = pd.DataFrame(X_val.toarray(), columns = vocab)
X3["Intercept"] = 1.0

X4 = pd.DataFrame(X_test.toarray(), columns = vocab)
X4["Intercept"] = 1.0

vocab = vocab +['Intercept']

In [19]:
X2[vocab]=X2[vocab].astype('float32')
X2[vocab]=X2[vocab].round(2)

In [20]:
X2["IsSecured"] = list(y_train)

In [21]:
from pandas.core import datetools
import statsmodels.api as sm

  """Entry point for launching an IPython kernel.


In [22]:
%time logit = sm.Logit(X2['IsSecured'], X2[vocab])

Wall time: 21.3 s


In [23]:
# fit the model
%time result = logit.fit(method='bfgs',maxiter=500)

Optimization terminated successfully.
         Current function value: 0.183303
         Iterations: 485
         Function evaluations: 487
         Gradient evaluations: 487
Wall time: 13min 8s


In [24]:
# cool enough to deserve it's own gist
result.summary2()

0,1,2,3
Model:,Logit,Pseudo R-squared:,0.735
Dependent Variable:,IsSecured,AIC:,29038.0874
Date:,2018-04-20 18:18,BIC:,39113.6369
No. Observations:,73234,Log-Likelihood:,-13424.0
Df Model:,1094,LL-Null:,-50708.0
Df Residuals:,72139,Scale:,1.0
Converged:,1.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
aad,-0.1144,0.5117,-0.2235,0.8232,-1.1174,0.8886
abil,0.2404,0.6060,0.3968,0.6915,-0.9472,1.4281
abl,0.2036,0.4615,0.4413,0.6590,-0.7009,1.1081
acc,-7.8177,1.6855,-4.6381,0.0000,-11.1213,-4.5141
accept,-1.3065,0.5674,-2.3028,0.0213,-2.4186,-0.1945
access,-7.2816,0.2497,-29.1642,0.0000,-7.7710,-6.7923
account,1.1146,0.3080,3.6182,0.0003,0.5108,1.7183
aci,2.2137,1.2453,1.7776,0.0755,-0.2271,4.6545
action,-1.5036,0.4342,-3.4631,0.0005,-2.3546,-0.6526


In [25]:
#from pandas_ml import ConfusionMatrix
from sklearn.metrics import confusion_matrix

predict=result.predict(X3)

Y_val_predict=[]
for i in predict:
    if(i>=0.5):
        Y_val_predict.append(1)
    else:
        Y_val_predict.append(0)

conf=confusion_matrix(y_val,Y_val_predict)
#plt.imshow(conf, cmap='binary', interpolation='None')
print(conf)
#plt.show()

[[10884  1002]
 [  807 11719]]


In [26]:
print("odds ratio's")
print(np.exp(result.params))

odds ratio's
aad          8.919384e-01
abil         1.271809e+00
abl          1.225868e+00
acc          4.025554e-04
accept       2.707527e-01
access       6.880792e-04
account      3.048255e+00
aci          9.149569e+00
action       2.223210e-01
activ        3.345558e-01
adapt        2.915211e-02
add          6.918260e-01
addit        1.791728e-01
address      6.123549e-01
adjust       3.652239e+00
admin        1.938235e+00
administr    2.263818e+00
adminport    2.845795e+07
advanc       5.647828e+00
afd          1.244665e+07
aft          2.644787e+05
agent        4.815121e-01
aip          2.158200e-01
album        3.056610e-02
alert        2.198083e+00
align        1.203155e+00
allow        7.704642e+00
alreadi      4.087895e+00
alt          4.155968e-04
alway        9.902101e-01
                 ...     
warn         5.130746e+00
way          3.066607e-01
wdg          3.081821e+00
wdgisscan    2.369619e+06
weak         2.899475e+04
web          2.915668e-01
webauth      2.773199e+07

In [27]:
from sklearn import metrics    
print(metrics.classification_report(y_val, Y_val_predict,target_names=['class1','class0']))
score = metrics.accuracy_score(y_val, Y_val_predict)
print("accuracy:   %0.3f" % score)

f1_score = metrics.f1_score(y_val, Y_val_predict)
print("f1_score:   %0.3f" % f1_score)
    
precision_score = metrics.precision_score(y_val, Y_val_predict)
print("precision_score:   %0.3f" % precision_score)
    
recall_score = metrics.recall_score(y_val, Y_val_predict)
print("recall_score:   %0.3f" % recall_score)

             precision    recall  f1-score   support

     class1       0.93      0.92      0.92     11886
     class0       0.92      0.94      0.93     12526

avg / total       0.93      0.93      0.93     24412

accuracy:   0.926
f1_score:   0.928
precision_score:   0.921
recall_score:   0.936


In [28]:
#standard error of coefficient can be retrieved by
result.bse

aad          0.511741
abil         0.605963
abl          0.461488
acc          1.685549
accept       0.567382
access       0.249676
account      0.308042
aci          1.245328
action       0.434187
activ        0.395777
adapt        0.808185
add          0.224160
addit        0.594940
address      0.465529
adjust       0.685319
admin        0.399433
administr    0.866915
adminport    7.900698
advanc       0.699489
afd          7.128431
aft          7.245270
agent        0.440673
aip          1.008380
album        1.469434
alert        0.363283
align        0.516496
allow        0.473882
alreadi      0.862416
alt          1.147586
alway        0.618731
               ...   
warn         0.679764
way          0.581555
wdg          1.567008
wdgisscan    6.845437
weak         3.488366
web          0.344992
webauth      2.701766
websit       0.885584
week         0.844456
welcom       0.734823
white        0.735277
whitelist    0.625317
win          0.356086
window       0.314197
wizard    