Import the libraries needed to manipulate data

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import pyodbc
plt.style.use('ggplot')
%matplotlib inline

connect to the DailyMLResults database and get the data needed for this run

In [2]:
conn = pyodbc.connect(r'Driver={SQL Server};Server=uatdotdatarpt;Database=DailyMLResults;Trusted_Connection=yes;OPTION=3;applicationintent=readonly;')
data = pd.read_sql("""SELECT Title, SourceLink, Modified_IsSecurity, RequirementCollectionDID 

FROM MSSecurity_3 WITH (NOLOCK)

        WHERE AreaPath NOT LIKE '%RD\OneComplianceTest%' 

        AND AreaPath NOT LIKE '%\ClientManagement%'

        AND (IsDataDraft = 0 OR IsDataDraft IS NULL) 

 

UNION

SELECT Title, SourceLink, Modified_IsSecurity, RequirementCollectionDID 

FROM Not_MSSecurity_3 WITH (NOLOCK)

        WHERE AreaPath NOT LIKE '%RD\OneComplianceTest%' 

        AND AreaPath NOT LIKE '%\ClientManagement%'

        AND (IsDataDraft = 0 OR IsDataDraft IS NULL) 

 

UNION

SELECT Title, SourceLink, Modified_IsSecurity, RequirementCollectionDID 

FROM Not_Labeled_3 WITH (NOLOCK)

        WHERE AreaPath NOT LIKE '%RD\OneComplianceTest%' 

        AND AreaPath NOT LIKE '%\ClientManagement%'

        AND (IsDataDraft = 0 OR IsDataDraft IS NULL) """, conn)
conn.close()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15910239 entries, 0 to 15910238
Data columns (total 4 columns):
Title                       object
SourceLink                  object
Modified_IsSecurity         bool
RequirementCollectionDID    object
dtypes: bool(1), object(3)
memory usage: 379.3+ MB


Define functions to clean the data

In [3]:
#Rmoves aldd urls from a title
import re
def clean_review_html(raw_data):
    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\$|\?|\=|\&|\%)*\b',' ', raw_data, flags=re.MULTILINE)
    letters_only=re.sub('\s+',' ', text)
    return letters_only

#removes all paths from the titles
def clean_review_path(raw_data):
    raw_text = re.sub(r'(\w+):(\\[A-Z0-9a-z_-]*\s*[A-Za-z0-9_-]*)?(\\[A-Za-z0-9_-]*\s*[A-Za-z0-9_-]*)*\b', '', raw_data, flags=re.MULTILINE)
    removed_path = re.sub(r'(\w+):(\[A-Z0-9a-z_-]*\s*[A-Za-z0-9_-]*)?(\[A-Za-z0-9_-]*\s*[A-Za-z0-9_-]*)*\b', '', raw_text, flags=re.MULTILINE)
    return removed_path

#Remove all punctuation marks from a string including underscores.
def remove_punctuation(text):
    return re.sub(r"[\\]", ' ',(' '.join(re.sub(r"[!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~]", ' ',text).split())))

#remove all words attached to '/' or '\'. this will remove all links that have not been removed
def remove_paths_urls(text):#'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    return ' '.join([word for word in text.split() if '\\' not in word and '/' not in word])

In [4]:
# Split Title at upper case takes IAmTHEBoy you know => I Am THE Boy you know.
def split_on_uppercase(s, keep_contiguous=True):
    """  Args:
        s (str): string
        keep_contiguous (bool): flag to indicate we want to 
                                keep contiguous uppercase chars together
    """

    string_length = len(s)
    is_lower_around = (lambda: s[i-1].islower() or string_length > (i + 1) and s[i + 1].islower())
    start = 0
    parts = []
    for i in range(1, string_length):
        if s[i].isupper() and (not keep_contiguous or is_lower_around()):
            parts.append(s[start: i])
            start = i
    parts.append(s[start:])

    return " ".join(parts)
#print(split_on_uppercase('TheLongANDANdeWindingRoad', True))  

In [5]:
#remove all integers from titles.
def remove_integers(s):
    return re.sub(' \d+ ', ' ', s)
print(remove_punctuation('MSAzureIntune-Svc-IWPortal_Release_1608_CTIP'))
remove_integers(remove_punctuation('MSAzureIntune-Svc-IWPortal_Release_1608_CTIP'))

MSAzureIntune Svc IWPortal Release 1608 CTIP


'MSAzureIntune Svc IWPortal Release CTIP'

Following our discussions with Alok, we had decided to make all instances of PoliCheck to 'policheck'. here, I create a function to do so.

In [6]:
def deal_with_policheck(title):
    return re.sub('(PoliCheck)|(POLICHECK)|(poliCheck)', ' policheck ', title)
deal_with_policheck('''[ DotNet-CoreFx-Trusted_master ][ PoliCheck ] - Defect :''')

'[ DotNet-CoreFx-Trusted_master ][  policheck  ] - Defect :'

In [7]:
data['Title'] = data['Title'].apply(str)

In [8]:
data['Title'] = data['Title'].apply(deal_with_policheck)

In [9]:
data['No Urls'] = data['Title'].apply(clean_review_html)

In [10]:
data['No Paths'] = data['No Urls'].apply(clean_review_path)

In [11]:
data['No Links and Paths'] = data['No Paths'].apply(remove_paths_urls)
#This function is just to ensure that if some paths and urls were not removedby the 
#regular expression, they must be removed now

In [12]:
data['Split Upper'] = data['No Links and Paths'].apply(split_on_uppercase)

In [13]:
data['No Punctuation'] = data['Split Upper'].apply(remove_punctuation)

In [14]:
data['Clean Title'] = data['No Punctuation'].apply(remove_integers)

In [15]:
data.drop(['No Punctuation','Split Upper','No Links and Paths','No Paths','No Urls'],axis=1,inplace=True)

# Model Building¶

We are going to build a bunch of models starting with unigrams and ending with trigrams (3=grams)

We fist import the neccessary libraries, build a tfidf and a model.

We end by validating the model on the validation set and getting output on the test set

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
cvectorizer = CountVectorizer(min_df=0.001,max_df=0.85,stop_words='english',ngram_range = (1,2))
cvz = cvectorizer.fit_transform(data['Clean Title'])#.toarray()
vocab = cvectorizer.get_feature_names()
print(len(vocab))

1907


In [30]:
import lda
import pickle

pickle_out=open("cvz_TFIDF_LDA","wb")
pickle.dump(cvz,pickle_out)
pickle_out.close()

pickle_out=open("cvectorizer_TFIDF_LDA","wb")
pickle.dump(cvectorizer,pickle_out)
pickle_out.close()

In [31]:
Lda_model_20 = lda.LDA(n_topics=20, n_iter=500,random_state=42)
X_topics = Lda_model_20.fit_transform(cvz)

INFO:lda:n_documents: 15910239
INFO:lda:vocab_size: 1907
INFO:lda:n_words: 141465808
INFO:lda:n_topics: 20
INFO:lda:n_iter: 500
INFO:lda:<0> log likelihood: -1550719970
INFO:lda:<10> log likelihood: -1046782738
INFO:lda:<20> log likelihood: -911758659
INFO:lda:<30> log likelihood: -888794456
INFO:lda:<40> log likelihood: -882415878
INFO:lda:<50> log likelihood: -879721652
INFO:lda:<60> log likelihood: -878715269
INFO:lda:<70> log likelihood: -878245435
INFO:lda:<80> log likelihood: -877909442
INFO:lda:<90> log likelihood: -877651407
INFO:lda:<100> log likelihood: -877549360
INFO:lda:<110> log likelihood: -877427814
INFO:lda:<120> log likelihood: -877274522
INFO:lda:<130> log likelihood: -877198077
INFO:lda:<140> log likelihood: -877144808
INFO:lda:<150> log likelihood: -877049376
INFO:lda:<160> log likelihood: -877014170
INFO:lda:<170> log likelihood: -876889061
INFO:lda:<180> log likelihood: -876790589
INFO:lda:<190> log likelihood: -876795229
INFO:lda:<200> log likelihood: -876740765

In [32]:
pickle_out=open("Lda_model_TFIDF_20","wb")
pickle.dump(Lda_model_20,pickle_out)
pickle_out.close()

file = open("C:\\Users\\v-prgana\\Desktop\\Scripts_prabhu\\X_topics_whole",'wb')
pickle.dump(X_topics,file)
file.close()

In [19]:
import lda
import pickle

In [22]:
pickle_out=open("Lda_model_TFIDF_20","rb")
Lda_model_TFIDF_20=pickle.load(pickle_out)
pickle_out.close()

In [24]:
X_topics = Lda_model_TFIDF_20.fit_transform(cvz)

In [25]:
file = open("C:\\Users\\v-prgana\\Desktop\\Scripts_prabhu\\X_topics_whole_v1",'wb')
pickle.dump(X_topics,file)
file.close()

In [27]:
n_top_words=15
topic_summaries=[]
topic_word = Lda_model_TFIDF_20.topic_word_  # all topic words
vocab = cvectorizer.get_feature_names()
for i, topic_dist in enumerate(topic_word):
    #print(i,topic_dist)
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1] # get!
    topic_summaries.append(','.join(topic_words)) 
#topic_summaries

In [None]:
sent_topics_df = pd.DataFrame()

# Get main topic in each document
for i, row in enumerate(X_topics):
    #print(type(row))
    row=list(row)
    #print(row)
    row = sorted(enumerate(row), key=lambda x: (x[1]), reverse=True)
    # Get the Dominant topic, Perc Contribution and Keywords for each document
    for j, (topic_num, prop_topic) in enumerate(row):
        if j == 0:  # => dominant topic
            topic_keywords = topic_summaries[topic_num]
            sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
        else:
            break
sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

# Add original text to the end of the output
contents = pd.Series(data['Clean Title'])
sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)

# Format
df_dominant_topic = sent_topics_df.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

In [46]:
print("Hi")

Hi


In [28]:
X_12, y_12 = data['Clean Title'], data['Modified_IsSecurity']
tv_12 = TfidfVectorizer(min_df = 0.00009, max_df = 0.5, use_idf = True, stop_words = 'english', ngram_range = (1,2))
X_12 = tv_12.fit_transform(X_12)

In [29]:
del Lda_model_TFIDF_20,cvectorizer,cvz

In [30]:
from scipy import sparse
X_t=sparse.csr_matrix(X_topics)

In [31]:
X_t=sparse.hstack((X_12, X_t), format='csr')

In [None]:
#x_df=pd.DataFrame(X_topics)
#x_df.columns=["topic_"+str(i) for i in range(0,20)]

In [32]:
X_train_12, X_12_0, y_train_12, y_12_0 = train_test_split(X_t, y_12, test_size = 0.2, random_state = 42)
X_test_12, X_val_12, y_test_12, y_val_12 = train_test_split(X_12_0, y_12_0, test_size = 0.5, random_state = 42)
vocab_12 = tv_12.get_feature_names()
print(len(vocab_12))

13844


Train a logistic regression model and Get predictions on the training, validation and whole dataset.

In [33]:
model_lr_12 = LogisticRegression(n_jobs = -1)
model_lr_12.fit(X_train_12, y_train_12)

  " = {}.".format(self.n_jobs))


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [34]:
print('*'*15 + ' Results on Test set ' +'*'*15)
pred_12 = model_lr_12.predict(X_test_12)

score = metrics.accuracy_score(y_test_12, pred_12)
print("accuracy:   %0.3f" % score)

f1_score = metrics.f1_score(y_test_12, pred_12)
print("f1_score:   %0.3f" % f1_score)

precision_score = metrics.precision_score(y_test_12, pred_12)
print("precision_score:   %0.3f" % precision_score)

recall_score = metrics.recall_score(y_test_12, pred_12)
print("recall_score:   %0.3f" % recall_score)

roc_auc_score = metrics.roc_auc_score(y_test_12, pred_12)
print("roc_auc_score:   %0.3f" % roc_auc_score)
fpr, tpr, thresh = metrics.roc_curve(y_test_12, pred_12)

print("classification report:")
print(metrics.classification_report(y_test_12, pred_12))

print("confusion matrix:")
print(metrics.confusion_matrix(y_test_12, pred_12))

print('*'*15 + ' Results on Validation set ' +'*'*15, '\n')
pred_0_12 = model_lr_12.predict(X_val_12)

score = metrics.accuracy_score(y_val_12, pred_0_12)
print("accuracy:   %0.3f" % score)

f1_score = metrics.f1_score(y_val_12, pred_0_12)
print("f1_score:   %0.3f" % f1_score)

precision_score = metrics.precision_score(y_val_12, pred_0_12)
print("precision_score:   %0.3f" % precision_score)

recall_score = metrics.recall_score(y_val_12, pred_0_12)
print("recall_score:   %0.3f" % recall_score)

roc_auc_score = metrics.roc_auc_score(y_val_12, pred_0_12)
print("roc_auc_score:   %0.3f" % roc_auc_score)
fpr, tpr, thresh = metrics.roc_curve(y_val_12, pred_0_12)

print("classification report:")
print(metrics.classification_report(y_val_12, pred_0_12))

print("confusion matrix:")
print(metrics.confusion_matrix(y_val_12, pred_0_12))
print('*'*25 + ' END! ' +'*'*25)

*************** Results on Test set ***************
accuracy:   0.995
f1_score:   0.971
precision_score:   0.990
recall_score:   0.952
roc_auc_score:   0.976
classification report:
             precision    recall  f1-score   support

      False       1.00      1.00      1.00   1447309
       True       0.99      0.95      0.97    143715

avg / total       0.99      0.99      0.99   1591024

confusion matrix:
[[1445931    1378]
 [   6905  136810]]
*************** Results on Validation set *************** 

accuracy:   0.995
f1_score:   0.970
precision_score:   0.990
recall_score:   0.951
roc_auc_score:   0.975
classification report:
             precision    recall  f1-score   support

      False       1.00      1.00      1.00   1447647
       True       0.99      0.95      0.97    143377

avg / total       0.99      0.99      0.99   1591024

confusion matrix:
[[1446317    1330]
 [   6965  136412]]
************************* END! *************************


In [35]:
print('*'*15 + ' Results on the entire set ' +'*'*15, '\n')
pred__12 = model_lr_12.predict(X_t)

score = metrics.accuracy_score(y_12, pred__12)
print("accuracy:   %0.3f" % score)

f1_score = metrics.f1_score(y_12, pred__12)
print("f1_score:   %0.3f" % f1_score)

precision_score = metrics.precision_score(y_12, pred__12)
print("precision_score:   %0.3f" % precision_score)

recall_score = metrics.recall_score(y_12, pred__12)
print("recall_score:   %0.3f" % recall_score)

roc_auc_score = metrics.roc_auc_score(y_12, pred__12)
print("roc_auc_score:   %0.3f" % roc_auc_score)
fpr, tpr, thresh = metrics.roc_curve(y_12, pred__12)

print("classification report:")
print(metrics.classification_report(y_12, pred__12))

print("confusion matrix:")
print(metrics.confusion_matrix(y_12, pred__12))
print('*'*25 + ' END! ' +'*'*25)

*************** Results on the entire set *************** 

accuracy:   0.995
f1_score:   0.971
precision_score:   0.991
recall_score:   0.952
roc_auc_score:   0.975
classification report:
             precision    recall  f1-score   support

      False       1.00      1.00      1.00  14474640
       True       0.99      0.95      0.97   1435599

avg / total       0.99      0.99      0.99  15910239

confusion matrix:
[[14461558    13082]
 [   69054  1366545]]
************************* END! *************************


Cross validate the model on the validation set using 

1) accuracy,  <br>
  2) f1_macro,  <br>
  3) recall, and <br>
  4) precision <br>asscoring methods

In [45]:
#from sklearn.model_selection import cross_val_score
#scores = cross_val_score(model_lr_12, X_val_12, y_val_12, scoring = 'f1_macro', cv = 10)
#print(scores)
#print('Mean f1 score of the 10 scores: %s'%(round(100*scores.mean(), 2)))

In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model_lr_12, X_val_12, y_val_12, scoring = 'accuracy', cv = 10)
print(scores)
print('Mean accuracy score of the 10 scores: %s'%(round(100*scores.mean(), 2)))

In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model_lr_12, X_val_12, y_val_12, scoring = 'recall', cv = 10)
print(scores)
print('Mean recall score of the 10 scores: %s'%(round(100*scores.mean(), 2)))

In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model_lr_12, X_val_12, y_val_12, scoring = 'precision', cv = 10)
print(scores)
print('Mean precision score of the 10 scores: %s'%(round(100*scores.mean(), 2)))

Add columns for the predictions and the probability of prediction for each title.

In [37]:
data['Predictions'], data['Pred_Prob0'], data['Pred_Prob1']= model_lr_12.predict(X_t), model_lr_12.predict_proba(X_t)[:,0], model_lr_12.predict_proba(X_t)[:,1] 

Save the false Positives and false negatives to a csv file for further analysis.

In [38]:
data[(data['Modified_IsSecurity'] == True) & (data['Predictions'] == False)][['Title','SourceLink','Modified_IsSecurity','Predictions','Pred_Prob0','Pred_Prob1']].to_csv('False Negatives bug,isue,compileETC06-07-18.csv', index = False)
data[(data['Modified_IsSecurity'] == False) & (data['Predictions'] == True)][['Title','SourceLink','Modified_IsSecurity','Predictions','Pred_Prob0','Pred_Prob1']].to_csv('False Positives bug,isue,compileETC06-07-18.csv', index = False)

Save the tfidf and the the model for future references

In [50]:
from sklearn.externals import joblib
joblib.dump(model_lr_12, 'model_lr_12_TFIDF_lda.pkl')
joblib.dump(tv_12, 'tv_12_TFIDF_lda.pkl')

['tv_12_TFIDF_lda.pkl']

In [None]:
X_train_12

In [43]:
model_lr_13 = LogisticRegression(n_jobs = -1)
model_lr_13.fit(X_train_12[:,:((X_train_12.shape)[1]-20)], y_train_12)

  " = {}.".format(self.n_jobs))


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [44]:
print('*'*15 + ' Results on Test set ' +'*'*15)
pred_12 = model_lr_13.predict(X_test_12[:,:((X_test_12.shape)[1]-20)])

score = metrics.accuracy_score(y_test_12, pred_12)
print("accuracy:   %0.3f" % score)

f1_score = metrics.f1_score(y_test_12, pred_12)
print("f1_score:   %0.3f" % f1_score)

precision_score = metrics.precision_score(y_test_12, pred_12)
print("precision_score:   %0.3f" % precision_score)

recall_score = metrics.recall_score(y_test_12, pred_12)
print("recall_score:   %0.3f" % recall_score)

roc_auc_score = metrics.roc_auc_score(y_test_12, pred_12)
print("roc_auc_score:   %0.3f" % roc_auc_score)
fpr, tpr, thresh = metrics.roc_curve(y_test_12, pred_12)

print("classification report:")
print(metrics.classification_report(y_test_12, pred_12))

print("confusion matrix:")
print(metrics.confusion_matrix(y_test_12, pred_12))

print('*'*15 + ' Results on Validation set ' +'*'*15, '\n')
pred_0_12 = model_lr_13.predict(X_val_12[:,:((X_val_12.shape)[1]-20)])

score = metrics.accuracy_score(y_val_12, pred_0_12)
print("accuracy:   %0.3f" % score)

f1_score = metrics.f1_score(y_val_12, pred_0_12)
print("f1_score:   %0.3f" % f1_score)

precision_score = metrics.precision_score(y_val_12, pred_0_12)
print("precision_score:   %0.3f" % precision_score)

recall_score = metrics.recall_score(y_val_12, pred_0_12)
print("recall_score:   %0.3f" % recall_score)

roc_auc_score = metrics.roc_auc_score(y_val_12, pred_0_12)
print("roc_auc_score:   %0.3f" % roc_auc_score)
fpr, tpr, thresh = metrics.roc_curve(y_val_12, pred_0_12)

print("classification report:")
print(metrics.classification_report(y_val_12, pred_0_12))

print("confusion matrix:")
print(metrics.confusion_matrix(y_val_12, pred_0_12))
print('*'*25 + ' END! ' +'*'*25)

*************** Results on Test set ***************
accuracy:   0.995
f1_score:   0.971
precision_score:   0.990
recall_score:   0.952
roc_auc_score:   0.975
classification report:
             precision    recall  f1-score   support

      False       1.00      1.00      1.00   1447309
       True       0.99      0.95      0.97    143715

avg / total       0.99      0.99      0.99   1591024

confusion matrix:
[[1445938    1371]
 [   6913  136802]]
*************** Results on Validation set *************** 

accuracy:   0.995
f1_score:   0.970
precision_score:   0.990
recall_score:   0.951
roc_auc_score:   0.975
classification report:
             precision    recall  f1-score   support

      False       1.00      1.00      1.00   1447647
       True       0.99      0.95      0.97    143377

avg / total       0.99      0.99      0.99   1591024

confusion matrix:
[[1446319    1328]
 [   6969  136408]]
************************* END! *************************


Perform a 10-fold cross-balidation and print the results.

In [53]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model_lr_12, X_val_12, y_val_12, scoring = 'recall', cv = 10)
print(scores)
print('Mean recall score of the 10 scores: %s'%(round(100*scores.mean(), 2)))

  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))


[ 0.94064309  0.93534212  0.93938337  0.94091797  0.93924386  0.93736049
  0.93680246  0.93694196  0.93882533  0.93805804]
Mean recall score of the 10 scores: 93.84


In [54]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model_lr_12, X_val_12, y_val_12, scoring = 'accuracy', cv = 10)
print(scores)
print('Mean accuracy score of the 10 scores: %s'%(round(100*scores.mean(), 2)))

  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))


[ 0.99368338  0.99321199  0.99351988  0.99355131  0.99347588  0.99326847
  0.99337532  0.99326218  0.99336275  0.99340046]
Mean accuracy score of the 10 scores: 99.34


In [55]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model_lr_12, X_val_12, y_val_12, scoring = 'precision', cv = 10)
print(scores)
print('Mean precision score of the 10 scores: %s'%(round(100*scores.mean(), 2)))

  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))


[ 0.98870968  0.98871931  0.98811358  0.98690372  0.98774941  0.98728969
  0.98910001  0.98764706  0.98687491  0.98809699]
Mean precision score of the 10 scores: 98.79


In [56]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model_lr_12, X_val_12, y_val_12, scoring = 'f1_macro', cv = 10)
print(scores)
print('Mean f1_macro score of the 10 scores: %s'%(round(100*scores.mean(), 2)))

MemoryError: 

In [41]:
data.columns

Index(['Title', 'SourceLink', 'Modified_IsSecurity',
       'RequirementCollectionDID', 'Clean Title', 'Predictions', 'Pred_Prob0',
       'Pred_Prob1'],
      dtype='object')

Get the predictions for each title and the probability attributed to that prediction.

In [40]:
data['Predictions'], data['Pred_Proba0'], data['Pred_Proba1']=model_lr_12.predict(X_12), model_lr_12.predict_proba(X_12)[:,0], model_lr_12.predict_proba(X_12)[:,1] 

ValueError: X has 13844 features per sample; expecting 13864

In [42]:
fn,fp = data[(data['Modified_IsSecurity'] == True) & (data['Predictions'] == False)][['Title', 'SourceLink','Modified_IsSecurity','Predictions','Pred_Proba0','Pred_Proba1','Type']], data[(data['Modified_IsSecurity'] == False) & (data['Predictions'] == True)][['Title', 'SourceLink','Modified_IsSecurity','Predictions','Pred_Proba0','Pred_Proba1','Type']]

KeyError: "['Pred_Proba0' 'Pred_Proba1' 'Type'] not in index"

Group all the Titles, false negatives and false positives by type

In [None]:
grouped_type = pd.DataFrame(data.groupby(by = ['Type']).count()['Title'])
grouped_type_fp = pd.DataFrame(fp.groupby(by = ['Type']).count()['Title'])
grouped_type_fn = pd.DataFrame(fn.groupby(by = ['Type']).count()['Title'])

In [None]:
grouped_type.columns

In [None]:
grouped_type_fn.head()

In [None]:
grouped_type.index

Concatenate all three dataframes

In [None]:
concat_grouped_by = pd.concat([grouped_type, grouped_type_fn, grouped_type_fp], axis = 1)
concat_grouped_by.info()

In [None]:
concat_grouped_by.columns=['Title', 'fn_title', 'fp_title']
concat_grouped_by.head()

In [None]:
concat_grouped_by['Total_Type'] = concat_grouped_by.sum()
concat_grouped_by.head()

Create a column for the percentage of false negatives and false positives for each work Item type

In [None]:
concat_grouped_by['Percent_fn'] = round(100*concat_grouped_by['fn_title']/concat_grouped_by['Title'],3)
concat_grouped_by['Percent_fp'] = round(100*concat_grouped_by['fp_title']/concat_grouped_by['Title'],3)
concat_grouped_by.head()

Rename the columns to more intuitive names and checkout the head of the resulting dataframe.

In [None]:
concat_grouped_by.rename(inplace = True, columns={'Title':'numTitles', 'fn_title':'numFN', 'fp_title':'numFP', 'Total_Type':'Total_Type', 'Percent_fn':'Percent_fn','Percent_fp':'Percent_fp'})

In [None]:
concat_grouped_by.drop('Total_Type', inplace = True, axis = 1)

In [None]:
concat_grouped_by.head()

Save the grouped data to csv files for further analylis

In [None]:
concat_grouped_by.to_csv('False Negatives and false Positives grouped by workItem Type.csv')

Save the false negatives and false positives of this model to a csv file for further anlysis.

In [None]:
fn.to_csv('False Negatives all work item type 06-08-2018.csv', index = False)
fp.to_csv('False Positives all work item type 06-08-2018.csv', index = False)

In [None]:
fp.head()

Save the logistic regression model and its tfidf