In [357]:
'''data'''
import pandas as pd
import numpy as np
import spacy
import re
import pickle
import pprint
from sklearn.pipeline import Pipeline

%run helpers.ipynb

'''Visualization'''
import matplotlib.pyplot as plt
import seaborn as sns

''' Features'''
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

'''Estimators'''
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.multiclass import OneVsRestClassifier

'''Modelling'''
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score 
from sklearn.metrics import roc_auc_score, f1_score, roc_curve, auc, confusion_matrix

## Import data

In [358]:
# Read the three SecReq source files. They are semicolon-separated and the
# column names are supplied explicitly, so the first line of each file is
# read as data rather than as a header.
secreq_csv_options = dict(sep=";", names=["text", "labels"])
CPN = pd.read_csv("../data/CPN.csv", **secreq_csv_options)
GPS = pd.read_csv("../data/GPS.csv", **secreq_csv_options)
ePurse = pd.read_csv("../data/ePurse_selective.csv", **secreq_csv_options)

In [359]:
# read the nfr dataset
nfr = pd.read_csv("../data/nfr.csv")

## Process data in the labels column

**Examine the datasets**

In [360]:
CPN.shape, GPS.shape, ePurse.shape, nfr.shape

((210, 2), (178, 2), (124, 2), (625, 3))

In [361]:
CPN.head()

Unnamed: 0,text,labels
0,"Internally to the CPN, a CNG transmitting pri...","sec,,,,,"
1,"Internally to the CPN, a CNG receiving privat...","sec,,,,,"
2,The CNG shall detect the end of the life of th...,"sec,,,,,,"
3,A CPN-user attempting to invoke a CNG-mediate...,"sec,,,,,,"
4,The CNG shall implement an authentication fail...,"sec,,,,,,"


In [362]:
nfr.head()

Unnamed: 0,ProjectID,text,labels
0,1,The system shall refresh the display every 60 ...,PE
1,1,The application shall match the color of the s...,LF
2,1,If projected the data must be readable. On ...,US
3,1,The product shall be available during normal ...,A
4,1,If projected the data must be understandable...,US


In [363]:
datasets = [("CPN",CPN), ("GPS",GPS), ("ePurse",ePurse), ("nfr",nfr)] # a list of datasets

**Some basic cleaning**

In [364]:
# Strip the stray commas and double quotes (e.g. "sec,,,,,") from the labels
def clean_labels(df):
    """Normalize the ``labels`` column of ``df`` in place.

    Every value is converted to its string form and all commas and double
    quotes are removed. Because of the string conversion, a missing label
    ends up as the text "nan". Nothing is returned; ``df`` itself is modified.
    """
    def _strip_junk(raw):
        return re.sub(",|\"", "", str(raw))

    stripped = df["labels"].map(_strip_junk)
    df["labels"] = stripped.astype(str)

In [365]:
# Only the SecReq frames need label cleaning; the last entry (nfr) is skipped
for _name, frame in datasets[:-1]:
    clean_labels(frame)

**Check to see if there is class imbalances for each dataset**

In [366]:
def level_counts(df):
    """Summarize how often each value occurs in ``df["labels"]``.

    Returns a DataFrame indexed by label value with two columns: ``count``
    (number of rows with that label) and ``percentage`` (share of all rows,
    in percent, rounded to three decimals). Missing labels are kept and
    counted as their own level.
    """
    labels = df["labels"]
    tallies = {
        "count": labels.value_counts(dropna=False),
        "percentage": round(labels.value_counts(dropna=False, normalize=True) * 100, 3),
    }
    # Place the two summaries side by side, one column per summary
    return pd.concat(list(tallies.values()), axis=1, keys=list(tallies.keys()))

In [367]:
# Print the label distribution of every dataset, one table per dataset
for df in datasets:
    print(f"{df[0]}:")
    summary = level_counts(df[1])
    pprint.pprint(summary)
    print("\n")

CPN:
        count  percentage
nonsec    167      79.524
sec        41      19.524
nan         2       0.952


GPS:
        count  percentage
nonsec     94      52.809
sec        45      25.281
nan        37      20.787
xyz         1       0.562
            1       0.562


ePurse:
        count  percentage
sec        83      66.935
nonsec     41      33.065


nfr:
    count  percentage
F     255       40.80
US     67       10.72
SE     66       10.56
O      62        9.92
PE     54        8.64
LF     38        6.08
A      21        3.36
SC     21        3.36
MN     17        2.72
L      13        2.08
FT     10        1.60
PO      1        0.16




**Let's drop every row whose label is not `sec` or `nonsec`: the `nan` rows in CPN, and the `nan`, blank, and `xyz` rows in GPS**

In [368]:
# Keep only the rows that carry one of the two valid binary labels.
# Percentages below are computed on the rows that remain after filtering.
valid_labels = ["sec", "nonsec"]
CPN = CPN.loc[CPN["labels"].isin(valid_labels), :]  # highly imbalanced: ~80% nonsec, ~20% sec
GPS = GPS.loc[GPS["labels"].isin(valid_labels), :]  # imbalanced: ~68% nonsec, ~32% sec
ePurse = ePurse.loc[ePurse["labels"].isin(valid_labels), :]  # imbalanced: ~67% sec, ~33% nonsec

**For the nfr dataset, some labels have very few samples. I group every class that makes up less than 7% of the data into a single "others" class.**

In [369]:
temp = level_counts(nfr)
# Classes that make up less than 7% of the rows are the ones to group as "others"
others = temp.loc[temp["percentage"] < 7].index.tolist()

In [370]:
# Relabel every row whose class is listed in `others` as "others" (modifies nfr in place)
nfr.loc[nfr.labels.isin(others), "labels"]= "others" 

In [371]:
level_counts(nfr)

Unnamed: 0,count,percentage
F,255,40.8
others,121,19.36
US,67,10.72
SE,66,10.56
O,62,9.92
PE,54,8.64


**Combine the CPN, GPS, and ePurse datasets into a single SecReq dataset. We will perform binary classification on this dataset.**

In [372]:
SecReq = pd.concat([CPN,GPS,ePurse], axis=0)

In [373]:
# Shuffle the rows and renumber the index. The shuffle is seeded so that the
# row order -- and therefore the seeded train/test split built on top of it --
# is the same after every kernel restart.
SecReq = SecReq.sample(frac=1, random_state=10).reset_index(drop=True)

In [374]:
SecReq.head()

Unnamed: 0,text,labels
0,"A credit command, which contains the S2 MAC co...",sec
1,CPN environment shall be protected with a stat...,sec
2,For the CNG management it is recommended to us...,nonsec
3,All other commands (including SELECT commands...,nonsec
4,Codec requirements and capabilities for the Us...,nonsec


## Process software requirements text
- Use spaCy to remove punctuation and non-alphanumeric data, and to lemmatize the text.
- Train and apply first-order phrase model to join word pairs (get bigrams).
- Train and apply second-order phrase model to join word triplets (get trigrams).
- Remove stopwords.
- Create tf-idf representations.

Useful reference: https://towardsdatascience.com/turbo-charge-your-spacy-nlp-pipeline-551435b664ad

In [375]:
nlp = spacy.load('en_core_web_sm')

In [376]:
%%time
# Process text
process_text(SecReq)
process_text(nfr)

Wall time: 5.21 s


## Phrase Modelling
- Learn combinations of tokens that together represent meaningful multi-word phrases ("United States", "happy hour").
- Use gensim to develop phrase models. Phrase models are developed by examining all the words in the requirements text and looking for words that co-occur (i.e., appear one after another) much more frequently than you would expect them to by random chance.
- Once our phrase model has been trained on our corpus, we can apply it to new text. When our model encounters two tokens in new text that it identifies as a phrase, it will merge the two into a single new token.

Useful reference: https://towardsdatascience.com/another-twitter-sentiment-analysis-with-python-part-7-phrase-modeling-doc2vec-592a8a996867

In [377]:
# Train the Phraser model and get bigrams for each dataset.
# NOTE(review): get_bigrams is not defined in this notebook (presumably it comes from
# helpers.ipynb); the meaning of the third argument (True) should be confirmed there.
# The sampled frames shown further down have a `bigrams` column after this cell runs.
model_filepath = "../models/secreq_bigrams_model" # bigram model file path for SecReq
get_bigrams(SecReq, model_filepath, True)

model_filepath = "../models/nfr_bigrams_model" # bigram model file path for nfr
get_bigrams(nfr, model_filepath, True)

In [378]:
# Train the Phraser model and get trigrams for each dataset.
# NOTE(review): get_trigrams is not defined in this notebook (presumably it comes from
# helpers.ipynb); the meaning of the third argument (True) should be confirmed there.
model_filepath = "../models/secreq_trigrams_model" # trigram model file path for SecReq
get_trigrams(SecReq, model_filepath, True)

model_filepath = "../models/nfr_trigrams_model" # trigram model file path for nfr
get_trigrams(nfr, model_filepath, True)

In [379]:
SecReq.sample(5)

Unnamed: 0,text,labels,processed_text,bigrams,trigrams
384,The CNG shall support mechanisms for secure au...,nonsec,"[cng, shall, support, mechanism, secure, authe...","[cng, shall, support_mechanism, secure, authen...","[cng, shall_support_mechanism, secure, authent..."
328,The GlobalPlatform Environment (OPEN) support...,nonsec,"[globalplatform, environment, open, support, c...","[globalplatform, environment, open, support, c...","[globalplatform, environment, open, support, c..."
91,The management of keys must be accomplished in...,sec,"[management, key, accomplish, secure, certifie...","[management, key, accomplish, secure, certifie...","[management, key, accomplish, secure, certifie..."
430,A GlobalPlatform card should support symmetri...,sec,"[globalplatform, card, support, symmetric, cry...","[globalplatform, card, support, symmetric, cry...","[globalplatform, card, support, symmetric, cry..."
431,The CND should support protocols for remote ac...,nonsec,"[cnd, support, protocol, remote, access, manag...","[cnd, support, protocol, remote_access, manage...","[cnd, support, protocol, remote_access, manage..."


In [380]:
nfr.sample(5)

Unnamed: 0,ProjectID,text,labels,processed_text,bigrams,trigrams
51,2,The product shall be installed by an untrained...,US,"[product, shall, instal, untrained, realtor, r...","[product, shall, instal, untrained, realtor, r...","[product, shall, instal, untrained, realtor, r..."
382,8,System shall let Izogn Manager access sales an...,PE,"[system, shall, let, izogn, manager, access, s...","[system, shall, let, izogn, manager, access, s...","[system, shall, let, izogn, manager, access, s..."
462,8,All streaming movie sales will be logged in t...,F,"[stream, movie, sale, log, database, accessibl...","[stream_movie, sale, log, database, accessible...","[stream_movie, sale, log, database, accessible..."
136,3,The system shall contain contact information ...,F,"[system, shall, contain, contact, information,...","[system, shall, contain, contact_information, ...","[system, shall, contain, contact_information, ..."
595,13,The system shall only be accessed by authorize...,SE,"[system, shall, access, authorize, corporate, ...","[system, shall, access, authorize, corporate, ...","[system, shall, access, authorize, corporate, ..."


In [381]:
# Create a variant of the nfr dataset in which there are only two classes:
# functional (F) and non-functional (NF).
nfr_binary = nfr.copy()
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: that is chained in-place mutation, which pandas deprecates
# and which leaves nfr_binary unchanged when Copy-on-Write is enabled.
nfr_binary["labels"] = nfr_binary["labels"].replace(["others","US","SE","O","PE"], "NF")

In [382]:
# Encode the labels of every dataset to numeric values
# (presumably this adds the `labels_num` column read by the test harness cells)
for frame in (SecReq, nfr, nfr_binary):
    encode_label(frame)

In [383]:
# Model input: the trigram token lists rendered as strings
secreq_texts = SecReq['trigrams'].astype('str')
# nfr and nfr_binary use the same text, so both names refer to one Series
nfr_texts = nfr['trigrams'].astype('str')
nfr_binary_texts = nfr_texts

## Binary Classification for the SecReq dataset

**Pipeline for modelling**
- Convert the transformed text (with trigrams) to a matrix of tf-idf weights (features)
- The matrix would contain the tf-idf scores of each token t in each of the document in the corpus.
- Perform TruncatedSVD to reduce the number of features-- this is important since our sample size is small, a large feature set will lead to overfitting.
- Modelling: use SVC, RandomForest and AdaBoost
- Tune hyperparameters with GridSearchCV

In [384]:
# Test harness: features are the stringified trigram lists, targets the encoded labels
X, y = secreq_texts, SecReq['labels_num'].values

# Stratified 70/30 train/test split with a fixed seed, used for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=10,
)

In [385]:
# Classifiers to compare, keyed by name (the keys match those of `params`).
# All three use the same fixed random_state. This one dict, and therefore the
# same estimator instances, is passed to every models_training call below.
models = {
    'SVC': SVC(random_state=10),
    'RandomForestClassifier': RandomForestClassifier(random_state=10),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=10)
}

In [386]:
# Parameters for GridSearchCV
# Use a dictionary of dictionaries to set the param grid for each of the models
'''
SVC:
C = the regularization parameter, penalize model complexity
gamma = kernel coefficient for ‘rbf’ 


Random Forest:
n_estimators = number of trees in the foreset
max_features = max number of features considered for splitting a node
max_depth = max number of levels in each decision tree
min_samples_split = min number of data points placed in a node before the node is split
min_samples_leaf = min number of data points allowed in a leaf node

AdaBoost:
n_estimators = number of estimators in the boosted ensemble to use.
learning_rate= Learning rate shrinks the contribution of each classifier by learning_rate. 
'''       

# Parameters for GridSearchCV: one param grid per model, keyed by the same
# names as `models`.
# NOTE(review): the "clf__" prefix is scikit-learn's "<step>__<param>" syntax, so
# models_training presumably wraps each estimator in a Pipeline step named "clf" -- confirm.
# The trailing comment on each line appears to record a reference/wider candidate grid.
# NOTE(review): the SVC gamma grid jumps from 0.1 to 0.001 (no 0.01) -- confirm this is intended.
params = {
    
    'SVC':  { 
        "clf__C" : [ 1, 10, 100, 1000], # [1,10,100,1000]
        "clf__gamma" : [1,0.1,0.001,0.0001], # [1,0.1,0.001,0.0001]
    },
    
    'RandomForestClassifier':  { 
        "clf__n_estimators" : [200, 400, 800, 1000, 1200], # [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800] 
        "clf__max_depth"      : [10,30,50,80,100],  # [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None]
        "clf__min_samples_split" : [3,5,6],  # [2, 5, 10]
        "clf__min_samples_leaf" :   [2,3]   # [1, 2, 4]  
    },
    
    'AdaBoostClassifier':  { 
        "clf__n_estimators" : [100, 500, 800, 1000], # [100, 500, 800, 1000]
        "clf__learning_rate" : [0.01,0.05,0.1,0.3,1] # [0.01,0.05,0.1,0.3,1]
    }
}

**Modelling**

In [387]:
df_secreq_tfidf = models_training("SecReq","tfidf", X_train, y_train, X_test, y_test, models, params, True)

In [388]:
df_secreq_tfidf

Unnamed: 0,dataset,wv_type,model_name,accuracy_score,precision_score,recall_score,f1_score
0,SecReq,tfidf,SVC,0.957746,0.954105,0.954105,0.954105
2,SecReq,tfidf,AdaBoostClassifier,0.957746,0.958086,0.949795,0.953696
1,SecReq,tfidf,RandomForestClassifier,0.93662,0.94782,0.916074,0.928822


**Thoughts:**
- I should select SVC since it requires the least training time and has the best performance (on the test data).

## Binary Classification for the nfr_binary dataset

In [389]:
# Test harness: features are the stringified trigram lists, targets the binary F/NF labels
X, y = nfr_binary_texts, nfr_binary['labels_num'].values

# Stratified 70/30 train/test split with a fixed seed, used for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=10,
)

**Modelling**

In [390]:
df_nfr_binary_tfidf = models_training("nfr_binary","tfidf", X_train, y_train, X_test, y_test, models, params, True)

In [391]:
df_nfr_binary_tfidf

Unnamed: 0,dataset,wv_type,model_name,accuracy_score,precision_score,recall_score,f1_score
0,nfr_binary,tfidf,SVC,0.87766,0.875415,0.870539,0.87273
1,nfr_binary,tfidf,RandomForestClassifier,0.845745,0.864589,0.821633,0.832427
2,nfr_binary,tfidf,AdaBoostClassifier,0.803191,0.811198,0.77963,0.787558


## Let's try to represent software requirements with pre-trained Sentence Embeddings 
Useful Reference: https://radimrehurek.com/gensim/models/keyedvectors.html

In [37]:
import gensim
from gensim.models import Word2Vec, KeyedVectors

In [38]:
%%time 
# Load KeyedVectors for the Google News word embeddings
# Contains 3 million 300-D word embeddings trained from 100 billion words

# Load the pretrained model (since intermediate data is not included, the model cannot be refined with additional data)
# The vectors are loaded from an existing file on disk, in the original Google word2vec C format, as
# a KeyedVectors instance
googlew2v = KeyedVectors.load_word2vec_format('../embeddings/GoogleNews-vectors-negative300.bin.gz', 
                                                            binary=True) # C bin format

Wall time: 4min 2s


Since trained word vectors are independent of the way they were trained (Word2Vec, FastText, WordRank, VarEmbed, etc.), they can be represented by a standalone structure. The structure is called "KeyedVectors" and is essentially a mapping between entities and vectors. Each entity is identified by its string id, so this is a mapping between {str => 1D numpy array}. Note that with KeyedVectors, I cannot continue to train the vectors. However, KeyedVectors are smaller and need less RAM since they don't need to store the model state that enables training.

In [399]:
# Convert the transformed requirements of each dataset to a list of token lists
secreq_docs = SecReq["trigrams"].tolist()
nfr_docs = nfr["trigrams"].tolist()

In [400]:
def get_sentence_embeddings(doc, wv=None):
    """Average the pre-trained word vectors of the tokens in one document.

    Parameters
    ----------
    doc : iterable of str
        Tokens of a single requirement.
    wv : optional
        Word-vector lookup supporting ``token in wv``, ``wv[token]`` and a
        ``vector_size`` attribute (as gensim ``KeyedVectors`` does). Defaults
        to the notebook-level ``googlew2v`` vectors.

    Returns
    -------
    numpy.ndarray
        1-D array holding the element-wise mean of the vectors of all tokens
        found in ``wv``. When no token is in the vocabulary (or ``doc`` is
        empty) a zero vector of length ``wv.vector_size`` is returned, so the
        per-document results can always be stacked into a 2-D matrix.
    """
    if wv is None:
        wv = googlew2v
    # only keep tokens that are in the embedding vocabulary
    vectors = [wv[token] for token in doc if token in wv]
    if not vectors:
        # Averaging an empty list would give a scalar NaN (plus a warning) and
        # break the downstream stacking; fall back to an all-zero embedding.
        # float32 is assumed to match the dtype of the loaded vectors -- TODO confirm.
        return np.zeros(wv.vector_size, dtype=np.float32)
    return np.average(vectors, axis=0)

In [401]:
# Get the sentence embedding of each requirement in the nfr dataset.
# For each requirement, look up the word embedding of every token and average
# them into a single sentence embedding.
sent_embeddings_nfr = [get_sentence_embeddings(doc) for doc in nfr_docs] # one 1-D vector per requirement
# Stack the vectors into an (n_requirements, embedding_dim) matrix. The shape is
# derived from the data instead of a hard-coded reshape((625, 300)), so the cell
# keeps working if the number of rows or the embedding size changes.
sent_embeddings_nfr = np.stack(sent_embeddings_nfr, axis=0)

In [403]:
# Same procedure for the SecReq dataset: one averaged embedding per requirement
sent_embeddings_secreq = [get_sentence_embeddings(doc) for doc in secreq_docs] # one 1-D vector per requirement
# Stack into an (n_requirements, embedding_dim) matrix. The shape is derived from
# the data instead of a hard-coded reshape((471, 300)), which silently depended on
# how many rows survived the label filtering.
sent_embeddings_secreq = np.stack(sent_embeddings_secreq, axis=0)

In [404]:
# Test harness: features are the averaged word-embedding vectors, targets the encoded labels
X, y = sent_embeddings_secreq, SecReq['labels_num'].values

# Stratified 70/30 train/test split with a fixed seed, used for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=10,
)

In [409]:
df_secreq_wv = models_training("SecReq","pre_trained_wv", X_train, y_train, X_test, y_test, models, params, True)

In [410]:
df_secreq_wv

Unnamed: 0,dataset,wv_type,model_name,accuracy_score,precision_score,recall_score,f1_score
0,SecReq,pre_trained_wv,SVC,0.971831,0.973667,0.965094,0.96913
1,SecReq,pre_trained_wv,RandomForestClassifier,0.964789,0.968307,0.95529,0.961234
2,SecReq,pre_trained_wv,AdaBoostClassifier,0.957746,0.958086,0.949795,0.953696
