In [802]:
'''data'''
import pandas as pd
import numpy as np
import spacy
import re
import pickle
import pprint

%run helpers.ipynb

'''Visualization'''
import matplotlib.pyplot as plt
import seaborn as sns

''' Features'''
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

'''Estimators'''
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.multiclass import OneVsRestClassifier

'''Modelling'''
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score 
from sklearn.metrics import roc_auc_score, f1_score, roc_curve, auc, confusion_matrix

## Import data

In [803]:
# Read the three SecReq datasets. The files are ';'-separated; because column
# names are passed explicitly, the first line of each file is read as data.
CPN, GPS, ePurse = (
    pd.read_csv(csv_path, sep=";", names=["text", "labels"])
    for csv_path in ("data/CPN.csv", "data/GPS.csv", "data/ePurse_selective.csv")
)

In [804]:
# Read the nfr dataset with pandas' defaults (comma-separated, column names
# taken from the file's first row).
nfr = pd.read_csv("data/nfr.csv")

## Process data in the labels column

**Examine the datasets**

In [805]:
# (rows, columns) of each raw dataset, in the order CPN, GPS, ePurse, nfr
CPN.shape, GPS.shape, ePurse.shape, nfr.shape

((210, 2), (178, 2), (124, 2), (625, 3))

In [806]:
# Preview the raw CPN rows; the labels still carry trailing commas at this point
CPN.head()

Unnamed: 0,text,labels
0,"Internally to the CPN, a CNG transmitting pri...","sec,,,,,"
1,"Internally to the CPN, a CNG receiving privat...","sec,,,,,"
2,The CNG shall detect the end of the life of th...,"sec,,,,,,"
3,A CPN-user attempting to invoke a CNG-mediate...,"sec,,,,,,"
4,The CNG shall implement an authentication fail...,"sec,,,,,,"


In [807]:
# Preview the raw nfr rows (ProjectID, text, labels)
nfr.head()

Unnamed: 0,ProjectID,text,labels
0,1,The system shall refresh the display every 60 ...,PE
1,1,The application shall match the color of the s...,LF
2,1,If projected the data must be readable. On ...,US
3,1,The product shall be available during normal ...,A
4,1,If projected the data must be understandable...,US


In [808]:
# (name, DataFrame) pairs so the same checks can be looped over every dataset.
# nfr is deliberately last: the label-cleaning loop skips the final entry.
datasets = [
    ("CPN", CPN),
    ("GPS", GPS),
    ("ePurse", ePurse),
    ("nfr", nfr),
]

**Some basic cleaning**

In [809]:
# Clean some of the weird ",,,,," in labels
def clean_labels(df):
    """Strip stray commas and double quotes from ``df["labels"]``, in place.

    Every value is passed through ``str()`` first, so missing labels (NaN)
    become the literal text "nan" and labels made only of commas/quotes
    become the empty string. The column is left holding strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "labels" column; it is modified in place.

    Returns
    -------
    None
    """
    def _strip_junk(raw_label):
        # Remove every ',' and '"' from the stringified label.
        return re.sub(",|\"", "", str(raw_label))

    cleaned = df["labels"].map(_strip_junk)
    df["labels"] = cleaned.astype(str)

In [810]:
# Clean the labels of every dataset except the last one in the list (nfr).
for _name, secreq_frame in datasets[:-1]:
    clean_labels(secreq_frame)

**Check whether there are class imbalances in each dataset**

In [811]:
def level_counts(df):
    """Summarise how often each value of ``df["labels"]`` occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "labels" column.

    Returns
    -------
    pandas.DataFrame
        Indexed by label value (missing values included), with two columns:
        "count" (number of rows) and "percentage" (share of rows, in percent,
        rounded to 3 decimals).
    """
    labels = df["labels"]

    # Absolute frequency of each level, keeping NaN as its own level.
    n_rows = labels.value_counts(dropna=False)
    # Relative frequency of each level, expressed in percent.
    share = (labels.value_counts(dropna=False, normalize=True) * 100).round(3)

    return pd.concat([n_rows, share], axis=1, keys=["count", "percentage"])

In [812]:
# Print the label distribution of every dataset, one titled table per dataset.
for name, frame in datasets:
    print(f"{name}:")
    pprint.pprint(level_counts(frame))
    print("\n")

CPN:
        count  percentage
nonsec    167      79.524
sec        41      19.524
nan         2       0.952


GPS:
        count  percentage
nonsec     94      52.809
sec        45      25.281
nan        37      20.787
xyz         1       0.562
            1       0.562


ePurse:
        count  percentage
sec        83      66.935
nonsec     41      33.065


nfr:
    count  percentage
F     255       40.80
US     67       10.72
SE     66       10.56
O      62        9.92
PE     54        8.64
LF     38        6.08
SC     21        3.36
A      21        3.36
MN     17        2.72
L      13        2.08
FT     10        1.60
PO      1        0.16




**Let's drop the rows whose label is `nan` (in CPN and GPS) and the rows whose label is blank or `xyz` (in GPS)**

In [813]:
# Keep only the rows labelled "sec" or "nonsec"; "nan", blank and "xyz" rows are dropped.
# The percentages below are the class shares after filtering.
CPN = CPN[CPN["labels"].isin(["sec", "nonsec"])]  # highly imbalanced: 80% nonsec, 20% sec
GPS = GPS[GPS["labels"].isin(["sec", "nonsec"])]  # imbalanced: 68% nonsec, 32% sec
ePurse = ePurse[ePurse["labels"].isin(["sec", "nonsec"])]  # imbalanced the other way: 67% sec, 33% nonsec

**For the nfr dataset, some labels have very few samples. I group all classes that each make up less than 7% of the rows into a single "others" class.**

In [814]:
# Label distribution of nfr, then the labels that each cover less than 7% of the rows;
# these are the classes to be merged into a single "others" class.
temp = level_counts(nfr)
others = temp.index[temp["percentage"] < 7].tolist()

In [815]:
# Overwrite every rare label with "others". This mutates nfr in place, so the
# original fine-grained labels are no longer available after this cell.
nfr.loc[nfr["labels"].isin(others), "labels"] = "others"

In [816]:
# Re-check the nfr label distribution after merging the rare classes into "others"
level_counts(nfr)

Unnamed: 0,count,percentage
F,255,40.8
others,121,19.36
US,67,10.72
SE,66,10.56
O,62,9.92
PE,54,8.64


**Combine the CPN, GPS and ePurse datasets into a single SecReq dataset. We will perform binary classification on this dataset.**

In [817]:
# Stack the three filtered frames row-wise (pd.concat's default axis).
# Each frame keeps its own index here, so index values repeat until the index is reset.
SecReq = pd.concat([CPN, GPS, ePurse])

In [818]:
# Shuffle the rows so the three source datasets are interleaved, then renumber
# the index from 0.
# Fix: the original read from the undefined name `SeqReq` (typo for `SecReq`),
# which raises NameError on a fresh kernel.
# random_state makes the shuffle reproducible under Restart & Run All; 3 matches
# the seed this notebook uses for TruncatedSVD.
SecReq = SecReq.sample(frac=1, random_state=3).reset_index(drop=True)

In [819]:
# Preview the combined, shuffled SecReq dataset
SecReq.head()

Unnamed: 0,text,labels
0,A proof must be sent to the load acquirer that...,sec
1,The CNG may interact with network elements ins...,nonsec
2,Multiple public identities on the same termina...,nonsec
3,If the POS device supports multiple applicatio...,nonsec
4,Once the card session has been established (f...,nonsec


## Process software requirements text
- Use spaCy to remove punctuation and non-alphanumeric data, and to lemmatize the text.
- Train and apply first-order phrase model to join word pairs (get bigrams).
- Train and apply second-order phrase model to join word triplets (get trigrams).
- Remove stopwords.
- Create tf-idf representations.

Useful reference: https://towardsdatascience.com/turbo-charge-your-spacy-nlp-pipeline-551435b664ad

In [820]:
# Load spaCy's small English pipeline.
# NOTE(review): `nlp` is not referenced in the visible cells; presumably the helpers
# pulled in by `%run helpers.ipynb` (e.g. process_text) read it as a global. Confirm.
nlp = spacy.load('en_core_web_sm')

In [821]:
%%time
# Process text
process_text(SecReq)
process_text(nfr)

Wall time: 5.31 s


## Phrase Modelling
- Learn combinations of tokens that together represents meaningful multi-word phrases ("United States", "happy hour")
- Use gensim to develop phrase models. Phrase models are built by examining all the words in the requirements text and looking for words that co-occur (i.e., appear one after another) much more frequently than you would expect by random chance.
- Once our phrase model has been trained on our corpus, we can apply it to new text. When our model encounters two tokens in new text that identifies as a phrase, it will merge the two into a single new token.

Useful reference: https://towardsdatascience.com/another-twitter-sentiment-analysis-with-python-part-7-phrase-modeling-doc2vec-592a8a996867

In [822]:
# Train Phraser model, get bigrams (one bigram model file per corpus).
# NOTE(review): the third argument (True) is defined in helpers.ipynb. It is
# presumably a "train/save the model" flag; confirm there.
for frame, model_filepath in [
    (SecReq, "models/secreq_bigrams_model"),  # bigram model file path for SecReq
    (nfr, "models/nfr_bigrams_model"),        # bigram model file path for nfr
]:
    get_bigrams(frame, model_filepath, True)

In [823]:
# Train Phraser model, get trigrams (one trigram model file per corpus).
# NOTE(review): the third argument (True) is defined in helpers.ipynb. It is
# presumably a "train/save the model" flag; confirm there.
for frame, model_filepath in [
    (SecReq, "models/secreq_trigrams_model"),  # trigram model file path for SecReq
    (nfr, "models/nfr_trigrams_model"),        # trigram model file path for nfr
]:
    get_trigrams(frame, model_filepath, True)

In [826]:
# Spot-check 5 random SecReq rows, now carrying processed_text, bigrams and trigrams
# columns (unseeded, so the rows differ on every run)
SecReq.sample(5)

Unnamed: 0,text,labels,processed_text,bigrams,trigrams
159,Authorizations for load transactions require a...,sec,"[authorization, load, transaction, require, fo...","[authorization, load, transaction, require, fo...","[authorization, load, transaction, require, fo..."
106,When forwarding packets from the customer netw...,nonsec,"[forward, packet, customer, network, wan, inte...","[forward_packet, customer_network, wan_interfa...","[forward_packet, customer_network, wan_interfa..."
183,The CND should support protocol for local auth...,nonsec,"[cnd, support, protocol, local, authentication...","[cnd, support, protocol, local, authentication...","[cnd, support, protocol, local, authentication..."
356,The GlobalPlatform Environment (OPEN) support...,nonsec,"[globalplatform, environment, open, support, c...","[globalplatform, environment, open, support, c...","[globalplatform, environment, open, support, c..."
241,CNDs connected to the CPN and devoted to speci...,nonsec,"[cnd, connect, cpn, devote, specific, manage, ...","[cnd, connect, cpn, devote, specific, manage, ...","[cnd, connect, cpn, devote, specific, manage, ..."


In [827]:
# Spot-check 5 random nfr rows, now carrying processed_text, bigrams and trigrams
# columns (unseeded, so the rows differ on every run)
nfr.sample(5)

Unnamed: 0,ProjectID,text,labels,processed_text,bigrams,trigrams
542,10,When a game is ended the product shall allow...,F,"[game, end, product, shall, allow, player, rem...","[game_end, product, shall, allow, player, remo...","[game_end, product, shall, allow, player, remo..."
503,10,The product shall provide players no access to...,SE,"[product, shall, provide, player, access, info...","[product, shall, provide, player, access, info...","[product, shall, provide, player, access, info..."
124,3,A clinical lab section shall include the clin...,F,"[clinical, lab, section, shall, include, clini...","[clinical_lab, section, shall, include, clinic...","[clinical_lab_section, shall, include, clinica..."
475,9,Vendor will submit a credit validation record ...,F,"[vendor, submit, credit, validation, record, a...","[vendor, submit, credit, validation, record, a...","[vendor, submit, credit, validation, record, a..."
421,8,The system will provide a 24 hour 800 toll fre...,others,"[system, provide, hour, toll, free, number, su...","[system, provide, hour, toll, free, number, su...","[system, provide, hour, toll, free, number, su..."


## Convert the transformed text (with trigrams) to a matrix of tf-idf weights (features)
- The matrix contains the tf-idf score of each token t in each document of the corpus.

In [841]:
# Text input for the tf-idf features: one string per requirement, built from the
# `trigrams` column of each corpus.
# NOTE(review): the column holds token lists, so casting to str yields the list's
# printed form (brackets, quotes, commas) rather than space-joined tokens. The
# vectorizer's tokenizer is relied on to discard that punctuation.
secreq_texts, nfr_texts = (
    frame["trigrams"].astype(str) for frame in (SecReq, nfr)
)

In [837]:
# Init the tf-idf vectorizer
#
# ngram_range: the lower and upper boundary of the range of n-values for the
# n-grams to be extracted. All values of n such that min_n <= n <= max_n are used:
#   (1, 1) -> unigrams only
#   (1, 2) -> unigrams and bigrams
#   (2, 2) -> bigrams only
# Here (1, 3) extracts unigrams, bigrams and trigrams of the input tokens.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3))

In [844]:
# Fit the tf-idf vectorizer on the SecReq texts (a Series with one string per requirement)
# and transform them in the same step.
# The tf-idf representation of the requirements is used as the feature matrix X.
X = tfidf_vectorizer.fit_transform(secreq_texts)

In [845]:
# (number of requirements, number of tf-idf features)
X.shape

(471, 8076)

**Use TruncatedSVD to perform dimensionality reduction -- I keep only 100 components instead of the original 8,076 tf-idf features**

In [846]:
# Init the TruncatedSVD model.
# TruncatedSVD is used for dimensionality reduction of X: only 100 components are kept.
# random_state is fixed so the decomposition is reproducible.
lsa = TruncatedSVD(n_components=100, n_iter=10, random_state=3)

# NOTE: X is overwritten with its reduced version. Re-running this cell on its own
# would feed the already-reduced matrix back into the SVD, so re-run the tf-idf
# cell first.
X = lsa.fit_transform(X)
X.shape

(471, 100)

In [848]:
# Inspect the reduced feature matrix (numpy abbreviates the printed array)
X

array([[ 0.01314077,  0.00918034,  0.01138384, ...,  0.00203362,
        -0.07969179, -0.05836899],
       [ 0.08867275, -0.0531916 , -0.0299858 , ..., -0.01467482,
        -0.04963013, -0.01008873],
       [ 0.14423432, -0.02764229, -0.05555691, ...,  0.07179054,
         0.03689323,  0.00600688],
       ...,
       [ 0.1654195 ,  0.30762268,  0.02453372, ...,  0.00285312,
        -0.02059786, -0.00939888],
       [ 0.11030984, -0.03670059, -0.04327069, ..., -0.05852711,
        -0.05542862, -0.03661431],
       [ 0.04373733,  0.02606767,  0.06723284, ...,  0.00612188,
        -0.05606473,  0.00359076]])

## Modelling