In [397]:
'''data'''
import pandas as pd
import numpy as np
import spacy
import re
import pickle
import pprint
from sklearn.pipeline import Pipeline

%run helpers.ipynb

'''Visualization'''
import matplotlib.pyplot as plt
import seaborn as sns

''' Features'''
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

'''Estimators'''
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.multiclass import OneVsRestClassifier

'''Modelling'''
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score 
from sklearn.metrics import roc_auc_score, f1_score, roc_curve, auc, confusion_matrix

## Import data

In [398]:
# read the SecReq datasets
CPN = pd.read_csv("../data/CPN.csv", sep=";", names=["text", "labels"])
GPS = pd.read_csv("../data/GPS.csv", sep=";", names=["text", "labels"])
ePurse = pd.read_csv("../data/ePurse_selective.csv", sep=";", names=["text", "labels"])

In [399]:
# read the nfr dataset
nfr = pd.read_csv("../data/nfr.csv")

## Process data in the labels column

**Examine the datasets**

In [400]:
CPN.shape, GPS.shape, ePurse.shape, nfr.shape

((210, 2), (178, 2), (124, 2), (625, 3))

In [401]:
CPN.head()

Unnamed: 0,text,labels
0,"Internally to the CPN, a CNG transmitting pri...","sec,,,,,"
1,"Internally to the CPN, a CNG receiving privat...","sec,,,,,"
2,The CNG shall detect the end of the life of th...,"sec,,,,,,"
3,A CPN-user attempting to invoke a CNG-mediate...,"sec,,,,,,"
4,The CNG shall implement an authentication fail...,"sec,,,,,,"


In [402]:
nfr.head()

Unnamed: 0,ProjectID,text,labels
0,1,The system shall refresh the display every 60 ...,PE
1,1,The application shall match the color of the s...,LF
2,1,If projected the data must be readable. On ...,US
3,1,The product shall be available during normal ...,A
4,1,If projected the data must be understandable...,US


In [403]:
datasets = [("CPN",CPN), ("GPS",GPS), ("ePurse",ePurse), ("nfr",nfr)] # a list of datasets

**Some basic cleaning**

In [404]:
# Clean some of the wierd ",,,,," in labels
def clean_labels(df):
    df["labels"] = df["labels"].map(lambda x: re.sub(",|\"","",str(x)))
    df['labels'] = df['labels'].astype(str)

In [405]:
for df in datasets[:-1]:
    clean_labels(df[1])

**Check to see if there is class imbalances for each dataset**

In [406]:
def level_counts(df):
    # count of each level
    count = df["labels"].value_counts(dropna=False)
    # percentage of each level
    percent = round(df["labels"].value_counts(dropna=False, normalize=True)*100, 3)
    
    # put it into a DataFrame
    return pd.concat([count,percent], axis=1, keys=["count", "percentage"])

In [407]:
for df in datasets:
    print(f"{df[0]}:")
    pprint.pprint(level_counts(df[1]))
    print("\n")

CPN:
        count  percentage
nonsec    167      79.524
sec        41      19.524
nan         2       0.952


GPS:
        count  percentage
nonsec     94      52.809
sec        45      25.281
nan        37      20.787
xyz         1       0.562
            1       0.562


ePurse:
        count  percentage
sec        83      66.935
nonsec     41      33.065


nfr:
    count  percentage
F     255       40.80
US     67       10.72
SE     66       10.56
O      62        9.92
PE     54        8.64
LF     38        6.08
SC     21        3.36
A      21        3.36
MN     17        2.72
L      13        2.08
FT     10        1.60
PO      1        0.16




**Let's drop the rows with label as nan in CPN and blank and xyz in GPS**

In [408]:
CPN = CPN.loc[(CPN["labels"] == "sec") | (CPN["labels"] == "nonsec") , :]  # highly imbalanced: 80% nonsec, 20% sec
GPS = GPS.loc[(GPS["labels"] == "sec") | (GPS["labels"] == "nonsec") , :]  # imbalanced: 67% nonsec, 32% sec
ePurse = ePurse.loc[(ePurse["labels"] == "sec") | (ePurse["labels"] == "nonsec") , :] # imbalanced: 66% nonsec, 33% sec

**For the nfr dataset, there are some labels with very little samples.  I can try to group all classes with less than 7% to a single "other" class"**

In [409]:
temp = level_counts(nfr)
others=list(temp[temp.percentage<7].index) # a list of classes that I want to group as "others"

In [410]:
nfr.loc[nfr.labels.isin(others), "labels"]= "others" 

In [411]:
level_counts(nfr)

Unnamed: 0,count,percentage
F,255,40.8
others,121,19.36
US,67,10.72
SE,66,10.56
O,62,9.92
PE,54,8.64


**Combine CPN, GPS, ePurse datasets into a single SeqReq dataset.  We will perform binary classification on this dataset**

In [412]:
SecReq = pd.concat([CPN,GPS,ePurse], axis=0)

In [413]:
SecReq = SecReq.sample(frac=1).reset_index(drop=True)

In [414]:
SecReq.head()

Unnamed: 0,text,labels
0,The amount to be reversed is the amount of the...,sec
1,Since these transactions are on-line to the is...,sec
2,The card verification method indicator (CVMI) ...,sec
3,The CEP card verifies that the PSAM is the PSA...,sec
4,CNG shall perform a link layer multicast to un...,nonsec


## Process software requirements text
- Use spaCy to remove puncutation, non-alphanumric data, and elmmatize the text.
- Train and apply first-order phrase model to join word pairs (get bigrams).
- Train and apply second-order phrase model to join word triplets (get trigrams).
- Remove stopwords.
- Create tf-idf representations.

Useful reference: https://towardsdatascience.com/turbo-charge-your-spacy-nlp-pipeline-551435b664ad

In [415]:
nlp = spacy.load('en_core_web_sm')

In [416]:
%%time
# Process text
process_text(SecReq)
process_text(nfr)

Wall time: 4.72 s


## Phrase Modelling
- Learn combinations of tokens that together represents meaningful multi-word phrases ("United States", "happy hour")
- Use gensim to develop phrase models phrase models are developed by examining all the words in The headlines and looking for words that co-occur (i.e., appear one after another) together much more frequently than you would expect them to by random chance.
- Once our phrase model has been trained on our corpus, we can apply it to new text. When our model encounters two tokens in new text that identifies as a phrase, it will merge the two into a single new token.

Useful reference: https://towardsdatascience.com/another-twitter-sentiment-analysis-with-python-part-7-phrase-modeling-doc2vec-592a8a996867

In [417]:
# Train Phraser model, get bigrams
model_filepath = "../models/secreq_bigrams_model" # bigram model file path for SecReq
get_bigrams(SecReq, model_filepath, True)

model_filepath = "../models/nfr_bigrams_model" # bigram model file path for nfr
get_bigrams(nfr, model_filepath, True)

In [418]:
# Train Phraser model, get trigrams
model_filepath = "../models/secreq_trigrams_model" # bigram model file path for SecReq
get_trigrams(SecReq, model_filepath, True)

model_filepath = "../models/nfr_trigrams_model" # bigram model file path for nfr
get_trigrams(nfr, model_filepath, True)

In [419]:
SecReq.sample(5)

Unnamed: 0,text,labels,processed_text,bigrams,trigrams
46,Multiple identities should be supported within...,nonsec,"[multiple, identity, supported, subscription, ...","[multiple, identity, supported, subscription, ...","[multiple, identity, supported, subscription, ..."
280,The CNG shall support both routed and bridged ...,nonsec,"[cng, shall, support, route, bridge, mode, ope...","[cng, shall, support, route, bridge, mode_oper...","[cng, shall, support, route, bridge, mode_oper..."
405,The CNG shall support mechanisms for managing ...,nonsec,"[cng, shall, support, mechanism, manage, iptv,...","[cng, shall, support_mechanism, manage, iptv, ...","[cng, shall_support_mechanism, manage, iptv, f..."
210,Once an Application becomes the selected Appl...,nonsec,"[application, select, application, basic, logi...","[application, select_application, basic_logica...","[application, select_application_basic_logical..."
366,The Application Provider is responsible for R...,nonsec,"[application, provider, responsible, return, r...","[application, provider, responsible, return, r...","[application, provider, responsible, return, r..."


In [420]:
nfr.sample(5)

Unnamed: 0,ProjectID,text,labels,processed_text,bigrams,trigrams
398,8,When streaming a movie the buffering time sh...,PE,"[stream, movie, buffering, time, longer, secon...","[stream_movie, buffering, time, longer_second,...","[stream_movie, buffering, time, longer_second,..."
279,6,The product must make use of web/application ...,O,"[product, use, web, application, server, techn...","[product, use, web, application, server, techn...","[product, use, web, application, server, techn..."
548,11,The system shall have high availability every ...,others,"[system, shall, high, availability, day, year,...","[system, shall, high, availability, day, year,...","[system, shall, high, availability, day, year,..."
142,3,The system will notify affected parties when ...,F,"[system, notify, affected, party, change, occu...","[system, notify, affected, party, change, occu...","[system, notify, affected, party, change, occu..."
78,3,The system shall be available for use between ...,others,"[system, shall, available, use, hour, p.m.]","[system, shall, available, use, hour, p.m.]","[system, shall, available, use, hour, p.m.]"


In [421]:
# Create a variant of the nfr dataset in which there are only two classes: functional (F) and non-functional (NF)
nfr_binary = nfr.copy()
nfr_binary["labels"].replace(["others","US","SE","O","PE"], "NF", inplace=True)

In [422]:
# encode the labels to numeric value
encode_label(SecReq)
encode_label(nfr)
encode_label(nfr_binary)

In [423]:
secreq_texts = SecReq['trigrams'].astype('str')
nfr_texts = nfr_binary_texts = nfr['trigrams'].astype('str')

## Binary Classification for the SecReq dataset

**Pipeline for modelling**
- Convert the transformed text (with trigrams) to a matrix of tf-idf weights (features)
- The matrix would contain the tf-idf scores of each token t in each of the document in the corpus.
- Perform TruncatedSVD to reduce the number of features-- this is important since our sample size is small, a large feature set will lead to overfitting.
- Modelling: use RandomForest and Adaboost
- Tune hyperparameters with GridSearchCV

In [424]:
# Test harness
X = secreq_texts
y = SecReq['labels_num'].values 

# Train test split with stratified sampling for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, stratify=y, random_state = 10)

In [425]:
# Classifiers
models = {
    'SVC': SVC(random_state=10),
    'RandomForestClassifier': RandomForestClassifier(random_state=10, max_features= "auto"),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=10)
}

In [426]:
# Parameters for GridSearchCV
# Use a dictionary of dictionaries to set the param grid for each of the models
'''
SVC:
C = the regularization parameter, penalize model complexity
gamma = kernel coefficient for ‘rbf’ 


Random Forest:
n_estimators = number of trees in the foreset
max_features = max number of features considered for splitting a node
max_depth = max number of levels in each decision tree
min_samples_split = min number of data points placed in a node before the node is split
min_samples_leaf = min number of data points allowed in a leaf node

AdaBoost:
n_estimators = number of estimators in the boosted ensemble to use.
learning_rate= Learning rate shrinks the contribution of each classifier by learning_rate. 
'''       

params = {
    
    'SVC':  { 
        "clf__C" : [ 80, 100, 200],
        "clf__gamma" : [0.05,0.1, 0.15],
    },
    
    'RandomForestClassifier':  { 
        "clf__n_estimators" : [800, 1000, 1200],
        "clf__max_depth"      : [20, 30, 40],
        "clf__min_samples_split" : [3,5,6],
        "clf__min_samples_leaf" :   [2,3]     
    },
    
    'AdaBoostClassifier':  { 
        "clf__n_estimators" : [250, 500, 600],
        "clf__learning_rate" : [0.8,1,1.5],
    }
}


**Modelling**

In [435]:
df_secreq_tfidf = models_training("SecReq","tfidf", X_train, y_train, X_test, y_test, models, params, False)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    7.5s finished


The best parameters are: {'clf__C': 100, 'clf__gamma': 0.05}

The best accuracy score (on CV) are: 0.8443834900731453
Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   57.9s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed:  6.6min finished


The best parameters are: {'clf__max_depth': 20, 'clf__min_samples_leaf': 2, 'clf__min_samples_split': 3, 'clf__n_estimators': 1000}

The best accuracy score (on CV) are: 0.8009143155694879
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   39.3s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   55.7s finished


The best parameters are: {'clf__learning_rate': 1.5, 'clf__n_estimators': 250}

The best accuracy score (on CV) are: 0.8420062695924765


In [428]:
df_secreq_tfidf

Unnamed: 0,dataset,wv_type,model_name,accuracy_score,precision_score,recall_score,f1_score
0,SecReq,tfidf,SVC,0.93662,0.929701,0.933312,0.931449
2,SecReq,tfidf,AdaBoostClassifier,0.866197,0.865492,0.839582,0.849735
1,SecReq,tfidf,RandomForestClassifier,0.84507,0.868358,0.797242,0.816537


**Thoughts:**
- I should select SVC since it requires the least training time and it has the best performance (on the test data)**

## Binary Classification for the nfr_binary dataset

In [429]:
# Test harness
X = nfr_binary_texts
y = nfr_binary['labels_num'].values 

# Train test split with stratified sampling for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, stratify=y, random_state = 10)

In [436]:
# Classifiers
models = {
    'SVC': SVC(random_state=10),
    'RandomForestClassifier': RandomForestClassifier(random_state=10, max_features= "auto"),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=10)
}

# Parameters for GridSearchCV
params = {
    
    'SVC':  { 
        "clf__C" : [50, 100, 200, 300],
        "clf__gamma" : [0.05,0.1, 0.5, 1, 1.5],
    },
    
    'RandomForestClassifier':  { 
        "clf__n_estimators" : [200, 800, 1000, 1200],
        "clf__max_depth"      : [10,30,80,100],
        "clf__min_samples_split" : [1,2,4],
        "clf__min_samples_leaf" :   [2,5,10]     
    },
    
    'AdaBoostClassifier':  { 
        "clf__n_estimators" : [100, 500, 800, 1200],
        "clf__learning_rate" : [0.1,0.5,1,1.5],
    }
}


**Modelling**

In [437]:
df_nfr_binary_tfidf = models_training("nfr_binary","tfidf", X_train, y_train, X_test, y_test, models, params, False)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    5.2s finished


The best parameters are: {'clf__C': 50, 'clf__gamma': 1}

The best accuracy score (on CV) are: 0.8603970741901776
Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   16.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:  9.9min finished


The best parameters are: {'clf__max_depth': 30, 'clf__min_samples_leaf': 2, 'clf__min_samples_split': 2, 'clf__n_estimators': 1000}

The best accuracy score (on CV) are: 0.8009143155694879
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   47.8s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  2.2min finished


The best parameters are: {'clf__learning_rate': 0.1, 'clf__n_estimators': 800}

The best accuracy score (on CV) are: 0.8352664576802507


In [438]:
df_nfr_binary_tfidf

Unnamed: 0,dataset,wv_type,model_name,accuracy_score,precision_score,recall_score,f1_score
0,nfr_binary,tfidf,SVC,0.87234,0.872517,0.862057,0.866287
1,nfr_binary,tfidf,RandomForestClassifier,0.829787,0.853042,0.802153,0.813214
2,nfr_binary,tfidf,AdaBoostClassifier,0.803191,0.811198,0.77963,0.787558
