In [29]:
# The below was heavily inspired by
# https://machinelearningmastery.com/develop-first-xgboost-model-python-scikit-learn/
# and Damien's previous work on developing a Random Forrest Classifier. 

import pandas as pd 
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [30]:
# Reading the data from the provided csv. 
rawdata1 = pd.read_csv(r"C:\Users\jft2456\Desktop\PrePracticum\wc.csv")

# Changing the rawdata type so that we receive true/false values in the reshaped array. 
# False values are known phishing URLs.  True values are known good values from Alexa Internet Rankings. 
rawdata1.type = (rawdata1.type == "alexa")

In [31]:
rawdata1.shape

(24232, 17)

In [32]:
rawdata1.head()

Unnamed: 0,domain,scrap_time,path,has_html,inline_count,avg_inline_script_block,external_count,avg_external_script_block,onclick_count,onload_count,onchange_count,avg_onclick_count,avg_onload_count,avg_onchange_count,avg_cyc_complexity,library_code_count,type
0,0.2090000.ru,2017-11-17 11-19-03,C:\Phishing Project\phishtank - november 2017\...,True,2.0,0.0,4.0,64.0,1,1.0,0.0,358,3,1,0.0,0,False
1,000keca.wcomhost.com,2017-09-20 07-53-06,C:\Phishing Project\phishtank - september 2017...,True,2.0,2.0,2.0,169.0,1,1.0,0.0,219,1,1,1.0,1,False
2,000pichincha.webcindario.com,2017-11-21 03-59-08,C:\Phishing Project\phishtank - november 2017\...,True,4.0,0.0,3.0,260.5,1,0.0,1.0,1783,1,2,1.0,2,False
3,003.ru,2017-12-03 02-44-02,C:\target site - 20K alexa\content\15c\003.ru\...,True,75.0,0.0,18.5,955.733333,2,72.0,0.0,41327,4,15,0.733333,11,True
4,007ex.com,2017-12-04 06-09-04,C:\target site - 20K alexa\content\d10\007ex.c...,True,3.0,0.0,6.0,740.6,1,0.0,0.0,10901,1,5,0.8,4,True


In [33]:
# Reading the data from the provided csv. 
rawdata2 = pd.read_csv(r"C:\Users\jft2456\Desktop\PrePracticum\certificate_features_consolidated.csv")

# Changing the rawdata type so that we receive true/false values in the reshaped array. 
# False values are known phishing URLs.  True values are known good values from Alexa Internet Rankings. 
rawdata2.type = (rawdata2.type == "alexa")

In [34]:
rawdata2.shape

(8398, 2208)

In [35]:
rawdata2.head()

Unnamed: 0,domain,has_cert,longetivity,valid_cert,extended_validation,cert_age,multi_mtn,type,globalsign nv-sa,google inc,...,fiorettigioielli.it,wponabudget.com,reviews-by.ml,sewaknepal.org,watchvslivestreamtv.club,nehanaxane.ga,alexis-collins10.tk,laborindonesia.com,egdemo.info,theflatbellydrink.com
0,01100111011001010110010101101011.info,True,1098,True,False,305,False,True,0,0,...,0,0,0,0,0,0,0,0,0,0
1,012global.com,True,707,True,False,266,True,True,0,0,...,0,0,0,0,0,0,0,0,0,0
2,09017wsa.000webhostapp.com,True,1095,True,False,478,False,False,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0boticario.gq,True,190,True,False,2,True,False,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0day.kiev.ua,True,89,True,False,25,False,True,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
rawdata2 = rawdata2.iloc[:,:8]
rawdata2.head()

Unnamed: 0,domain,has_cert,longetivity,valid_cert,extended_validation,cert_age,multi_mtn,type
0,01100111011001010110010101101011.info,True,1098,True,False,305,False,True
1,012global.com,True,707,True,False,266,True,True
2,09017wsa.000webhostapp.com,True,1095,True,False,478,False,False
3,0boticario.gq,True,190,True,False,2,True,False
4,0day.kiev.ua,True,89,True,False,25,False,True


In [37]:
alldata = rawdata1.merge(rawdata2,how='inner',on='domain')
alldata.head()

Unnamed: 0,domain,scrap_time,path,has_html,inline_count,avg_inline_script_block,external_count,avg_external_script_block,onclick_count,onload_count,...,avg_cyc_complexity,library_code_count,type_x,has_cert,longetivity,valid_cert,extended_validation,cert_age,multi_mtn,type_y
0,01100111011001010110010101101011.info,2017-11-20 02-21-37,C:\target site - 20K alexa\content\4e8\0110011...,True,2.0,0.0,10.0,242.4,1,0.0,...,0.6,3,True,True,1098,True,False,305,False,True
1,012global.com,2017-11-20 02-24-38,C:\target site - 20K alexa\content\bce\012glob...,True,5.0,0.0,21.0,313.666667,1,4.0,...,0.933333,14,True,True,707,True,False,266,True,True
2,09017wsa.000webhostapp.com,2017-09-23 04-34-05,C:\Phishing Project\phishtank - september 2017...,True,2.0,0.0,1.0,181.0,1,0.0,...,1.0,1,False,True,1095,True,False,478,False,False
3,0boticario.gq,2017-11-17 09-04-11,C:\Phishing Project\phishtank - november 2017\...,True,4.0,0.0,6.0,33.75,1,0.0,...,0.5,2,False,True,190,True,False,2,True,False
4,0day.kiev.ua,2017-11-20 07-37-20,C:\target site - 20K alexa\content\437\0day.ki...,True,7.0,0.0,5.0,152.0,1,1.0,...,0.5,2,True,True,89,True,False,25,False,True


In [38]:
# Why is the inner join soo much less than the number of certificate features???
alldata.shape

(3878, 24)

In [39]:
alldata = alldata.drop(columns="type_x")

In [40]:
alldata.head()

Unnamed: 0,domain,scrap_time,path,has_html,inline_count,avg_inline_script_block,external_count,avg_external_script_block,onclick_count,onload_count,...,avg_onchange_count,avg_cyc_complexity,library_code_count,has_cert,longetivity,valid_cert,extended_validation,cert_age,multi_mtn,type_y
0,01100111011001010110010101101011.info,2017-11-20 02-21-37,C:\target site - 20K alexa\content\4e8\0110011...,True,2.0,0.0,10.0,242.4,1,0.0,...,5,0.6,3,True,1098,True,False,305,False,True
1,012global.com,2017-11-20 02-24-38,C:\target site - 20K alexa\content\bce\012glob...,True,5.0,0.0,21.0,313.666667,1,4.0,...,15,0.933333,14,True,707,True,False,266,True,True
2,09017wsa.000webhostapp.com,2017-09-23 04-34-05,C:\Phishing Project\phishtank - september 2017...,True,2.0,0.0,1.0,181.0,1,0.0,...,1,1.0,1,True,1095,True,False,478,False,False
3,0boticario.gq,2017-11-17 09-04-11,C:\Phishing Project\phishtank - november 2017\...,True,4.0,0.0,6.0,33.75,1,0.0,...,4,0.5,2,True,190,True,False,2,True,False
4,0day.kiev.ua,2017-11-20 07-37-20,C:\target site - 20K alexa\content\437\0day.ki...,True,7.0,0.0,5.0,152.0,1,1.0,...,4,0.5,2,True,89,True,False,25,False,True


In [41]:
alldata.shape

(3878, 23)

In [37]:
# Selecting the desired rows (excluding site, scrape_time, and path).
X = alldata.iloc[:,3:22].values  # Dependent variables. 
y = alldata.iloc[:,22:].values    # Response variable (aleax, or not phish, and phish). 

# Reshaping into a flat array to allow processing.
y.reshape((3878,))

array([ True,  True, False, ...,  True,  True,  True])

In [253]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [254]:
# Fitting and transforming the train and the test data to the min max scaler. 
# I chose the min max scalar because I want to maintain the distribution of the data. 
# https://towardsdatascience.com/scale-standardize-or-normalize-with-scikit-learn-6ccc7d176a02
from sklearn.preprocessing import MinMaxScaler

scaler_minmax = MinMaxScaler()

scaler_minmax.fit(X_train)
scaler_minmax.fit(X_test)

X_train_MinMaxScaled = scaler_minmax.transform(X_train)
X_test_MinMaxScaled = scaler_minmax.transform(X_test)

In [255]:
# Fitting the model. 
model = XGBClassifier()
model.fit(X_train_MinMaxScaled, y_train.ravel())

# Making predictions for the test data. 
y_pred = model.predict(X_test_MinMaxScaled)
predictions = [round(value) for value in y_pred.ravel()]

# Evaluating predictions for the test data. 
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Remember, this is the accuracy for ALL available Code Complexity and Certificate features WITHOUT PCA. 

Accuracy: 91.95%


In [256]:
# Setting the PCA to a 99% variance.  Accounting for 95% variance only provided a roughly 66% fit. 
from sklearn.decomposition import PCA
pca = PCA(0.99)

In [257]:
pca.fit(X_train_MinMaxScaled)

PCA(copy=True, iterated_power='auto', n_components=0.99, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [258]:
X_train_pca_MinMaxScaled = pca.transform(X_train_MinMaxScaled)
X_test_pca_MinMaxScaled = pca.transform(X_test_MinMaxScaled)

In [259]:
pca.n_components_

12

In [260]:
# Fitting the model. 
model = XGBClassifier()
model.fit(X_train_pca_MinMaxScaled, y_train.ravel())

# Making predictions for the test data. 
y_pred = model.predict(X_test_pca_MinMaxScaled)
predictions = [round(value) for value in y_pred.ravel()]

# Evaluating predictions for the test data. 
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Remember, this is the accuracy for ALL available Code Complexity and Certificate features WITH PCA. 

Accuracy: 89.84%


In [261]:
# Just testing this feature. 
pca.explained_variance_ratio_

array([0.37377405, 0.27712174, 0.10134865, 0.09233773, 0.07554605,
       0.01849   , 0.01319813, 0.0098642 , 0.00884437, 0.00808284,
       0.00611076, 0.0055601 ])

In [262]:
# Just testing this feature. 
df = pd.DataFrame(pca.components_)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,-0.0,0.054329,-0.001497,0.018261,0.017362,0.023191,0.007265,0.004841,0.061456,0.018683,0.113381,0.478554,0.128519,1.2325950000000001e-32,-0.087974,0.006412,-0.020152,-0.054393,0.849788
1,-0.0,-0.054664,-0.003458,-0.014899,-0.031003,-0.030152,-0.003458,-0.002397,-0.073784,-0.018872,-0.094416,-0.819572,-0.114293,-0.0,-0.242974,-0.005532,-0.069471,-0.108488,0.468776
2,0.0,0.197314,0.003024,0.101198,-0.0036,0.089206,0.02824,0.011736,0.17675,0.167587,0.563065,-0.087734,0.431225,0.0,-0.530216,0.000878,0.014545,-0.229923,-0.194124
3,-0.0,0.099675,0.002901,0.057279,-0.004061,0.056867,0.031948,-0.001829,0.100916,0.099155,0.32599,-0.254149,0.204187,0.0,0.546461,0.004247,0.61675,0.229086,0.135769
4,0.0,0.08434,0.008198,0.047542,-0.010395,0.02826,0.035949,0.001489,0.072272,0.068105,0.275199,-0.140807,0.188354,-1.3234890000000002e-23,0.437273,0.000202,-0.783072,0.203623,0.039823
5,0.0,0.249838,0.054076,-0.010606,0.028396,-0.019133,0.950321,0.007976,0.091907,-0.013484,-0.107258,-0.000945,-0.09043,-2.168404e-19,-0.002035,0.005624,0.005023,-0.042491,-0.004563
6,0.0,0.073629,0.024945,-0.05747,0.273292,0.203336,-0.141364,0.006846,0.872562,-0.197623,-0.205631,-0.058864,-0.077028,0.0,-0.0204,-0.003895,-0.015198,0.042709,0.000375
7,0.0,-0.287088,0.005241,-0.077279,0.027523,-0.641834,0.084872,0.000347,0.092994,-0.615868,0.139275,-0.014979,0.28764,-3.469447e-18,0.009534,-0.016756,0.019605,-0.02246,-0.010443
8,-0.0,0.834154,-0.073488,-0.008121,-0.020724,-0.450104,-0.23753,0.020816,-0.043616,-0.002751,-0.179961,-0.015932,-0.002996,1.595946e-16,-0.011277,-0.026344,0.005358,0.058594,0.000895
9,-0.0,-0.264003,0.01467,0.108049,0.08845,-0.562961,0.014947,-0.012987,0.283486,0.67245,0.000214,0.019043,-0.166168,2.220446e-16,0.065441,0.048173,-0.006508,-0.161525,0.003096


In [263]:
# Repeating the above for the Code Complexity Features. 

# Selecting the desired rows (excluding site, scrape_time, and path) for rawdata1 (Code Complexity Features).
X = rawdata1.iloc[:,3:16].values  # Dependent variables. 
y = rawdata1.iloc[:,16:].values    # Response variable (aleax, or not phish, and phish). 

# Reshaping into a flat array to allow processing.
y.reshape((24232,))

array([False, False, False, ...,  True,  True,  True])

In [264]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [265]:
# Fitting and transforming the train and the test data to the min max scaler. 
# I chose the min max scalar because I want to maintain the distribution of the data. 
# https://towardsdatascience.com/scale-standardize-or-normalize-with-scikit-learn-6ccc7d176a02
from sklearn.preprocessing import MinMaxScaler

scaler_minmax = MinMaxScaler()

scaler_minmax.fit(X_train)
scaler_minmax.fit(X_test)

X_train_MinMaxScaled = scaler_minmax.transform(X_train)
X_test_MinMaxScaled = scaler_minmax.transform(X_test)

In [266]:
# Fitting the model. 
model = XGBClassifier()
model.fit(X_train_MinMaxScaled, y_train.ravel())

# Making predictions for the test data. 
y_pred = model.predict(X_test_MinMaxScaled)
predictions = [round(value) for value in y_pred.ravel()]

# Evaluating predictions for the test data. 
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Remember, this is the accuracy for all available Code Complexity features. 

Accuracy: 87.50%


In [267]:
# Setting the PCA to a 99% variance.  Accounting for 95% variance only provided a roughly 66% fit. 
from sklearn.decomposition import PCA
pca = PCA(0.99)

In [268]:
pca.fit(X_train_MinMaxScaled)

PCA(copy=True, iterated_power='auto', n_components=0.99, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [269]:
X_train_pca_MinMaxScaled = pca.transform(X_train_MinMaxScaled)
X_test_pca_MinMaxScaled = pca.transform(X_test_MinMaxScaled)

In [270]:
pca.n_components_

7

In [271]:
# Fitting the model. 
model = XGBClassifier()
model.fit(X_train_pca_MinMaxScaled, y_train.ravel())

# Making predictions for the test data. 
y_pred = model.predict(X_test_pca_MinMaxScaled)
predictions = [round(value) for value in y_pred.ravel()]

# Evaluating predictions for the test data. 
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Remember, this is the accuracy for all available Code Complexity features. 

Accuracy: 87.32%


In [272]:
# Just testing this feature. 
pca.explained_variance_ratio_

array([0.84821948, 0.09013461, 0.02447365, 0.01367003, 0.00666745,
       0.00498873, 0.0046049 ])

In [273]:
# Just testing this feature. 
df = pd.DataFrame(pca.components_)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,-0.013143,-0.000294,-0.062132,-0.066867,-0.01637,-0.000819,-0.005106,-0.061237,-0.024427,-0.043694,-0.988505,-0.08833
1,0.0,0.073815,0.004895,0.660033,-0.019914,0.07101,0.004883,0.011442,0.284292,0.33134,0.389792,-0.124865,0.441762
2,-0.0,0.010811,-0.001803,-0.194249,0.695237,0.032688,9.6e-05,-0.00346,0.649469,-0.220087,-0.003974,-0.074065,0.044299
3,-0.0,-0.018784,0.000972,0.243593,0.444129,-0.012366,0.002816,-0.013188,-0.1521,0.639004,-0.248231,0.004415,-0.499624
4,0.0,0.008396,0.008745,0.509693,0.372084,-0.274608,-0.001585,-0.001106,-0.424822,-0.57947,0.075978,-0.009781,-0.063856
5,0.0,0.013115,0.004048,-0.116143,0.280165,0.8176,0.014158,0.155265,-0.397596,-0.021447,0.135727,-0.024322,0.19364
6,-0.0,0.009012,0.010156,0.395874,-0.310121,0.480303,0.002387,-0.001159,0.357326,-0.310166,-0.191579,0.027134,-0.504879


In [207]:
# Repeating the above for the Certificate Features. 

# Selecting the desired rows (excluding site, scrape_time, and path) for rawdata2 (Certificate Features).
X = rawdata2.iloc[:,1:7].values  # Dependent variables. 
y = rawdata2.iloc[:,7:].values    # Response variable (aleax, or not phish, and phish). 

# Reshaping into a flat array to allow processing.
y.reshape((8398,))

array([ True,  True, False, ...,  True,  True,  True])

In [208]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [209]:
# Fitting and transforming the train and the test data to the min max scaler. 
# I chose the min max scalar because I want to maintain the distribution of the data. 
# https://towardsdatascience.com/scale-standardize-or-normalize-with-scikit-learn-6ccc7d176a02
from sklearn.preprocessing import MinMaxScaler

scaler_minmax = MinMaxScaler()

scaler_minmax.fit(X_train)
scaler_minmax.fit(X_test)

X_train_MinMaxScaled = scaler_minmax.transform(X_train)
X_test_MinMaxScaled = scaler_minmax.transform(X_test)

In [210]:
# Fitting the model. 
model = XGBClassifier()
model.fit(X_train_MinMaxScaled, y_train.ravel())

# Making predictions for the test data. 
y_pred = model.predict(X_test_MinMaxScaled)
predictions = [round(value) for value in y_pred.ravel()]

# Evaluating predictions for the test data. 
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Remember, this is the accuracy for all available Certificate features. 

Accuracy: 92.24%


In [211]:
# Setting the PCA to a 99% variance.  Accounting for 95% variance only provided a roughly 66% fit. 
from sklearn.decomposition import PCA
pca = PCA(0.99)

In [212]:
pca.fit(X_train_MinMaxScaled)

PCA(copy=True, iterated_power='auto', n_components=0.99, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [213]:
X_train_pca_MinMaxScaled = pca.transform(X_train_MinMaxScaled)
X_test_pca_MinMaxScaled = pca.transform(X_test_MinMaxScaled)

In [214]:
pca.n_components_

4

In [215]:
# Fitting the model. 
model = XGBClassifier()
model.fit(X_train_MinMaxScaled, y_train.ravel())

# Making predictions for the test data. 
y_pred = model.predict(X_test_MinMaxScaled)
predictions = [round(value) for value in y_pred.ravel()]

# Evaluating predictions for the test data. 
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Remember, this is the accuracy for all available Certificate features. 

Accuracy: 92.24%


In [216]:
# Now Moving on to the standard scalar. 

In [217]:
# Starting with all data (both certificate features and code complexity features).

# Selecting the desired rows (excluding site, scrape_time, and path).
X = alldata.iloc[:,3:22].values  # Dependent variables. 
y = alldata.iloc[:,22:].values   # Response variable (aleax, or not phish, and phish). 

# Reshaping into a flat array to allow processing.
y.reshape((3878,))

array([ True,  True, False, ...,  True,  True,  True])

In [218]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [219]:
# Using the standard scalar here for comparision. 
from sklearn.preprocessing import StandardScaler

scaler_standard = StandardScaler()

scaler_standard.fit(X_train)
scaler_standard.fit(X_test)

X_train_StandardScaled = scaler_standard.transform(X_train)
X_test_StandardScaled = scaler_standard.transform(X_test)

In [220]:
# Fitting the model. 
model = XGBClassifier()
model.fit(X_train_StandardScaled, y_train.ravel())

# Making predictions for the test data. 
y_pred = model.predict(X_test_StandardScaled)
predictions = [round(value) for value in y_pred.ravel()]

# Evaluating predictions for the test data. 
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 91.95%


In [221]:
pca.fit(X_train_StandardScaled)

PCA(copy=True, iterated_power='auto', n_components=0.99, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [222]:
X_train_pca_StandardScaled = pca.transform(X_train_StandardScaled)
X_test_pca_StandardScaled = pca.transform(X_test_StandardScaled)

In [223]:
pca.n_components_

15

In [224]:
# Fitting the model. 
model = XGBClassifier()
model.fit(X_train_pca_StandardScaled, y_train.ravel())

# Making predictions for the test data. 
y_pred = model.predict(X_test_pca_StandardScaled)
predictions = [round(value) for value in y_pred.ravel()]

# Evaluating predictions for the test data. 
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 90.23%


In [225]:
# Repeating the above for the Code Complexity Features. 

# Selecting the desired rows (excluding site, scrape_time, and path) for rawdata1 (Code Complexity Features).
X = rawdata1.iloc[:,3:16].values  # Dependent variables. 
y = rawdata1.iloc[:,16:].values    # Response variable (aleax, or not phish, and phish). 

# Reshaping into a flat array to allow processing.
y.reshape((24232,))

array([False, False, False, ...,  True,  True,  True])

In [226]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [227]:
# Using the standard scalar here for comparision. 
from sklearn.preprocessing import StandardScaler

scaler_standard = StandardScaler()

scaler_standard.fit(X_train)
scaler_standard.fit(X_test)

X_train_StandardScaled = scaler_standard.transform(X_train)
X_test_StandardScaled = scaler_standard.transform(X_test)

In [228]:
# Fitting the model. 
model = XGBClassifier()
model.fit(X_train_StandardScaled, y_train.ravel())

# Making predictions for the test data. 
y_pred = model.predict(X_test_StandardScaled)
predictions = [round(value) for value in y_pred.ravel()]

# Evaluating predictions for the test data. 
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 87.50%


In [229]:
pca.fit(X_train_StandardScaled)

PCA(copy=True, iterated_power='auto', n_components=0.99, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [230]:
X_train_pca_StandardScaled = pca.transform(X_train_StandardScaled)
X_test_pca_StandardScaled = pca.transform(X_test_StandardScaled)

In [231]:
pca.n_components_

11

In [232]:
# Fitting the model. 
model = XGBClassifier()
model.fit(X_train_pca_StandardScaled, y_train.ravel())

# Making predictions for the test data. 
y_pred = model.predict(X_test_pca_StandardScaled)
predictions = [round(value) for value in y_pred.ravel()]

# Evaluating predictions for the test data. 
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 87.48%


In [233]:
# Repeating the above for the Certificate Features. 

# Selecting the desired rows (excluding site, scrape_time, and path) for rawdata2 (Certificate Features).
X = rawdata2.iloc[:,1:7].values  # Dependent variables. 
y = rawdata2.iloc[:,7:].values    # Response variable (aleax, or not phish, and phish). 

# Reshaping into a flat array to allow processing.
y.reshape((8398,))

array([ True,  True, False, ...,  True,  True,  True])

In [234]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [235]:
# Using the standard scalar here for comparision. 
from sklearn.preprocessing import StandardScaler

scaler_standard = StandardScaler()

scaler_standard.fit(X_train)
scaler_standard.fit(X_test)

X_train_StandardScaled = scaler_standard.transform(X_train)
X_test_StandardScaled = scaler_standard.transform(X_test)

In [236]:
# Fitting the model. 
model = XGBClassifier()
model.fit(X_train_StandardScaled, y_train.ravel())

# Making predictions for the test data. 
y_pred = model.predict(X_test_StandardScaled)
predictions = [round(value) for value in y_pred.ravel()]

# Evaluating predictions for the test data. 
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 92.24%


In [237]:
pca.fit(X_train_StandardScaled)

PCA(copy=True, iterated_power='auto', n_components=0.99, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [238]:
X_train_pca_StandardScaled = pca.transform(X_train_StandardScaled)
X_test_pca_StandardScaled = pca.transform(X_test_StandardScaled)

In [239]:
pca.n_components_

5

In [240]:
# Fitting the model. 
model = XGBClassifier()
model.fit(X_train_pca_StandardScaled, y_train.ravel())

# Making predictions for the test data. 
y_pred = model.predict(X_test_pca_StandardScaled)
predictions = [round(value) for value in y_pred.ravel()]

# Evaluating predictions for the test data. 
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 89.07%


In [42]:
# Moving to dominance analysis because I want to eliminate unnecessary features, not necessarily create new, 
# less interperable principal components. 
from dominance_analysis import Dominance_Datasets
from dominance_analysis import Dominance

In [43]:
alldata.head()

Unnamed: 0,domain,scrap_time,path,has_html,inline_count,avg_inline_script_block,external_count,avg_external_script_block,onclick_count,onload_count,...,avg_onchange_count,avg_cyc_complexity,library_code_count,has_cert,longetivity,valid_cert,extended_validation,cert_age,multi_mtn,type_y
0,01100111011001010110010101101011.info,2017-11-20 02-21-37,C:\target site - 20K alexa\content\4e8\0110011...,True,2.0,0.0,10.0,242.4,1,0.0,...,5,0.6,3,True,1098,True,False,305,False,True
1,012global.com,2017-11-20 02-24-38,C:\target site - 20K alexa\content\bce\012glob...,True,5.0,0.0,21.0,313.666667,1,4.0,...,15,0.933333,14,True,707,True,False,266,True,True
2,09017wsa.000webhostapp.com,2017-09-23 04-34-05,C:\Phishing Project\phishtank - september 2017...,True,2.0,0.0,1.0,181.0,1,0.0,...,1,1.0,1,True,1095,True,False,478,False,False
3,0boticario.gq,2017-11-17 09-04-11,C:\Phishing Project\phishtank - november 2017\...,True,4.0,0.0,6.0,33.75,1,0.0,...,4,0.5,2,True,190,True,False,2,True,False
4,0day.kiev.ua,2017-11-20 07-37-20,C:\target site - 20K alexa\content\437\0day.ki...,True,7.0,0.0,5.0,152.0,1,1.0,...,4,0.5,2,True,89,True,False,25,False,True


In [44]:
alldata = alldata.rename(columns={"type_y":"Target"})

In [45]:
alldata.head()

Unnamed: 0,domain,scrap_time,path,has_html,inline_count,avg_inline_script_block,external_count,avg_external_script_block,onclick_count,onload_count,...,avg_onchange_count,avg_cyc_complexity,library_code_count,has_cert,longetivity,valid_cert,extended_validation,cert_age,multi_mtn,Target
0,01100111011001010110010101101011.info,2017-11-20 02-21-37,C:\target site - 20K alexa\content\4e8\0110011...,True,2.0,0.0,10.0,242.4,1,0.0,...,5,0.6,3,True,1098,True,False,305,False,True
1,012global.com,2017-11-20 02-24-38,C:\target site - 20K alexa\content\bce\012glob...,True,5.0,0.0,21.0,313.666667,1,4.0,...,15,0.933333,14,True,707,True,False,266,True,True
2,09017wsa.000webhostapp.com,2017-09-23 04-34-05,C:\Phishing Project\phishtank - september 2017...,True,2.0,0.0,1.0,181.0,1,0.0,...,1,1.0,1,True,1095,True,False,478,False,False
3,0boticario.gq,2017-11-17 09-04-11,C:\Phishing Project\phishtank - november 2017\...,True,4.0,0.0,6.0,33.75,1,0.0,...,4,0.5,2,True,190,True,False,2,True,False
4,0day.kiev.ua,2017-11-20 07-37-20,C:\target site - 20K alexa\content\437\0day.ki...,True,7.0,0.0,5.0,152.0,1,1.0,...,4,0.5,2,True,89,True,False,25,False,True


In [46]:
alldata.drop(alldata.columns[[0,1,2]], axis=1, inplace=True)
alldata.head()

Unnamed: 0,has_html,inline_count,avg_inline_script_block,external_count,avg_external_script_block,onclick_count,onload_count,onchange_count,avg_onclick_count,avg_onload_count,avg_onchange_count,avg_cyc_complexity,library_code_count,has_cert,longetivity,valid_cert,extended_validation,cert_age,multi_mtn,Target
0,True,2.0,0.0,10.0,242.4,1,0.0,0.0,4186,5,5,0.6,3,True,1098,True,False,305,False,True
1,True,5.0,0.0,21.0,313.666667,1,4.0,0.0,11141,3,15,0.933333,14,True,707,True,False,266,True,True
2,True,2.0,0.0,1.0,181.0,1,0.0,0.0,426,0,1,1.0,1,True,1095,True,False,478,False,False
3,True,4.0,0.0,6.0,33.75,1,0.0,0.0,546,1,4,0.5,2,True,190,True,False,2,True,False
4,True,7.0,0.0,5.0,152.0,1,1.0,0.0,2085,1,4,0.5,2,True,89,True,False,25,False,True


In [47]:
alldata[['has_html', 'has_cert', 'valid_cert', 'extended_validation', 'multi_mtn']] = (alldata[['has_html', 'has_cert', 'valid_cert', 'extended_validation', 'multi_mtn']] == 'TRUE').astype(float)


elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



In [48]:
alldata.head()

Unnamed: 0,has_html,inline_count,avg_inline_script_block,external_count,avg_external_script_block,onclick_count,onload_count,onchange_count,avg_onclick_count,avg_onload_count,avg_onchange_count,avg_cyc_complexity,library_code_count,has_cert,longetivity,valid_cert,extended_validation,cert_age,multi_mtn,Target
0,0.0,2.0,0.0,10.0,242.4,1,0.0,0.0,4186,5,5,0.6,3,0.0,1098,0.0,0.0,305,0.0,True
1,0.0,5.0,0.0,21.0,313.666667,1,4.0,0.0,11141,3,15,0.933333,14,0.0,707,0.0,0.0,266,0.0,True
2,0.0,2.0,0.0,1.0,181.0,1,0.0,0.0,426,0,1,1.0,1,0.0,1095,0.0,0.0,478,0.0,False
3,0.0,4.0,0.0,6.0,33.75,1,0.0,0.0,546,1,4,0.5,2,0.0,190,0.0,0.0,2,0.0,False
4,0.0,7.0,0.0,5.0,152.0,1,1.0,0.0,2085,1,4,0.5,2,0.0,89,0.0,0.0,25,0.0,True


In [49]:
# objective=0 indicates that this is a classification, not a regression problem. 
# I am using the default pseudo R-Squared method, mcfadden. 
dominance_regression=Dominance(data=alldata,target='Target')

Selecting 15 Best Predictors for the Model



invalid value encountered in true_divide


invalid value encountered in greater


invalid value encountered in less


invalid value encountered in less_equal



Selected Predictors :  ['inline_count', 'avg_inline_script_block', 'external_count', 'avg_external_script_block', 'onclick_count', 'onload_count', 'onchange_count', 'avg_onclick_count', 'avg_onload_count', 'avg_onchange_count', 'avg_cyc_complexity', 'library_code_count', 'longetivity', 'cert_age', 'multi_mtn']

********************  R-Squared of Complete Model :  ********************
R Squared : 0.3243261840057308



In [50]:
incr_variable_rsquare=dominance_regression.incremental_rsquare()

Selecting 15 Best Predictors for the Model



invalid value encountered in true_divide


invalid value encountered in greater


invalid value encountered in less


invalid value encountered in less_equal



Selected Predictors :  ['inline_count', 'avg_inline_script_block', 'external_count', 'avg_external_script_block', 'onclick_count', 'onload_count', 'onchange_count', 'avg_onclick_count', 'avg_onload_count', 'avg_onchange_count', 'avg_cyc_complexity', 'library_code_count', 'longetivity', 'cert_age', 'multi_mtn']

Creating models for 32767 possible combinations of 15 features :



invalid value encountered in true_divide


invalid value encountered in greater


invalid value encountered in less


invalid value encountered in less_equal


  0%|                                                                                           | 0/15 [00:00<?, ?it/s]
 13%|███████████                                                                        | 2/15 [00:00<00:02,  4.52it/s]
 20%|████████████████▌                                                                  | 3/15 [00:02<00:07,  1.56it/s]
 27%|██████████████████████▏                                                            | 4/15 [00:07<00:21,  1.94s/it]
 33%|███████████████████████████▋                                                       | 5/15 [00:21<00:56,  5.67s/it]
 40%|█████████████████████████████████▏                                                 | 6/15 [00:46<01:43, 11.46s/it]
 47%|██████████████████████████████████████▋                                            | 7/15 [01:16<02:16, 17.04s/it]

#########################  Model Training Done!!!!!  #########################

#########################  Calculating Variable Dominances  #########################



  0%|                                                                                           | 0/14 [00:00<?, ?it/s]
 21%|█████████████████▊                                                                 | 3/14 [00:00<00:00, 16.93it/s]
 29%|███████████████████████▋                                                           | 4/14 [00:01<00:04,  2.12it/s]
 36%|█████████████████████████████▋                                                     | 5/14 [00:10<00:26,  2.92s/it]
 43%|███████████████████████████████████▌                                               | 6/14 [00:41<01:30, 11.30s/it]
 50%|█████████████████████████████████████████▌                                         | 7/14 [01:45<03:10, 27.28s/it]
 57%|███████████████████████████████████████████████▍                                   | 8/14 [03:06<04:19, 43.28s/it]
 64%|█████████████████████████████████████████████████████▎                             | 9/14 [04:06<04:01, 48.35s/it]
 71%|██████████████████████████████████

#########################  Variable Dominance Calculation Done!!!!!  #########################



In [51]:
dominance_regression.plot_incremental_rsquare()

In [52]:
dominance_regression.dominance_stats()

Unnamed: 0,Interactional Dominance,Individual Dominance,Average Partial Dominance,Total Dominance,Percentage Relative Importance
avg_cyc_complexity,0.0512634,0.1523,0.07009894,0.0743233,22.9162
avg_onchange_count,0.0141034,0.19049,0.04095888,0.0491373,15.1506
library_code_count,0.00302382,0.184212,0.03061139,0.0390123,12.0287
avg_onclick_count,0.00124355,0.14827,0.02671594,0.0331214,10.2124
external_count,5.17818e-05,0.15258,0.02502357,0.0318626,9.82424
avg_external_script_block,0.00581249,0.0607071,0.02376183,0.0250282,7.71699
inline_count,0.00129726,0.11895,0.01799319,0.0236106,7.27989
avg_onload_count,0.00467578,0.0925477,0.01813986,0.0222028,6.84582
onclick_count,0.00494993,0.0663576,0.01446456,0.0172898,5.33099
onchange_count,0.00161116,0.0123007,0.002958113,0.00349116,1.07643


In [53]:
dominance_regression.dominance_level()

Unnamed: 0,Predictors,Generally Dominating,Conditionally Dominating,Completely Dominating
0,avg_cyc_complexity,"[avg_onchange_count, library_code_count, avg_o...",,
1,avg_onchange_count,"[library_code_count, avg_onclick_count, extern...",,
2,library_code_count,"[avg_onclick_count, external_count, avg_extern...",,
3,avg_onclick_count,"[external_count, avg_external_script_block, in...",,
4,external_count,"[avg_external_script_block, inline_count, avg_...",,
5,avg_external_script_block,"[inline_count, avg_onload_count, onclick_count...",,
6,inline_count,"[avg_onload_count, onclick_count, onchange_cou...",,
7,avg_onload_count,"[onclick_count, onchange_count, onload_count, ...",,
8,onclick_count,"[onchange_count, onload_count, avg_inline_scri...","[onchange_count, onload_count, avg_inline_scri...","[onchange_count, onload_count, avg_inline_scri..."
9,onchange_count,"[onload_count, avg_inline_script_block, longet...",,


In [28]:
# The below is the example given in the documentation.  This was done to test the package. 

# This is for a CONTINUOUS response variable.  A BINARY response variable example is given later. 

In [29]:
from dominance_analysis import Dominance_Datasets
from dominance_analysis import Dominance

In [70]:
boston_dataset=Dominance_Datasets.get_boston()
boston_dataset.head()

The copy of Boston Housing Dataset is downloaded from: https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html
Internally using load_boston function from sklearn.datasets 


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,House_Price
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [56]:
dominance_regression=Dominance(data=boston_dataset,target='House_Price',objective=1)

The copy of Boston Housing Dataset is downloaded from: https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html
Internally using load_boston function from sklearn.datasets 
Selecting 13 Best Predictors for the Model
Selected Predictors :  ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']

********************  R-Squared of Complete Model :  ********************
R Squared : 0.7406426641094094



In [57]:
incr_variable_rsquare=dominance_regression.incremental_rsquare()

Selecting 13 Best Predictors for the Model
Selected Predictors :  ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']

Creating models for 8191 possible combinations of 13 features :


100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [00:27<00:00,  2.15s/it]


#########################  Model Training Done!!!!!  #########################

#########################  Calculating Variable Dominances  #########################


100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [00:21<00:00,  1.78s/it]


#########################  Variable Dominance Calculation Done!!!!!  #########################



In [None]:
# Note with the above that the dominance calculations are not displayed, as with the example. Odd. 

In [58]:
dominance_regression.plot_incremental_rsquare()

In [59]:
dominance_regression.dominance_stats()

Unnamed: 0,Interactional Dominance,Individual Dominance,Average Partial Dominance,Total Dominance,Percentage Relative Importance
LSTAT,0.0564384,0.544146,0.182193,0.200362,27.0525
RM,0.0438082,0.483525,0.173161,0.187085,25.2599
PTRATIO,0.0279573,0.257847,0.066821,0.0785255,10.6023
INDUS,5.89159e-05,0.23399,0.023374,0.0377819,5.10123
TAX,0.00567131,0.219526,0.023006,0.0367894,4.96723
NOX,0.0114044,0.182603,0.021853,0.0334146,4.51157
DIS,0.0288511,0.0624644,0.027481,0.0302771,4.08795
CRIM,0.00569384,0.15078,0.017892,0.0271756,3.6692
ZN,0.00602798,0.129921,0.016963,0.0248109,3.34991
RAD,0.0112171,0.145639,0.013666,0.0236294,3.19039


In [60]:
dominance_regression.dominance_level()

Unnamed: 0,Predictors,Generally Dominating,Conditionally Dominating,Completely Dominating
0,LSTAT,"[RM, PTRATIO, INDUS, TAX, NOX, DIS, CRIM, ZN, ...","[RM, PTRATIO, INDUS, TAX, NOX, DIS, CRIM, ZN, ...","[RM, PTRATIO, INDUS, TAX, NOX, DIS, CRIM, ZN, ..."
1,RM,"[PTRATIO, INDUS, TAX, NOX, DIS, CRIM, ZN, RAD,...","[PTRATIO, INDUS, TAX, NOX, DIS, CRIM, ZN, RAD,...","[PTRATIO, INDUS, TAX, NOX, DIS, CRIM, ZN, RAD,..."
2,PTRATIO,"[INDUS, TAX, NOX, DIS, CRIM, ZN, RAD, B, AGE, ...",,
3,INDUS,"[TAX, NOX, DIS, CRIM, ZN, RAD, B, AGE, CHAS]",,
4,TAX,"[NOX, DIS, CRIM, ZN, RAD, B, AGE, CHAS]",,
5,NOX,"[DIS, CRIM, ZN, RAD, B, AGE, CHAS]",,
6,DIS,"[CRIM, ZN, RAD, B, AGE, CHAS]",,
7,CRIM,"[ZN, RAD, B, AGE, CHAS]",,
8,ZN,"[RAD, B, AGE, CHAS]",,
9,RAD,"[B, AGE, CHAS]",,


In [None]:
# The below is the example given in the documentation.  This was done to test the package. 

# This is for a BINARY response variable.  A CONTINUOUS response variable example was given above. 

In [54]:
from dominance_analysis import Dominance_Datasets
from dominance_analysis import Dominance
breast_cancer_data=Dominance_Datasets.get_breast_cancer()
dominance_classification=Dominance(data=breast_cancer_data,target='target',objective=0,pseudo_r2="mcfadden")

The copy of UCI ML Breast Cancer Wisconsin (Diagnostic) dataset is downloaded from: https://goo.gl/U2Uwz2
Internally using load_breast_cancer function from sklearn.datasets 
Selecting 15 Best Predictors for the Model
Selected Predictors :  ['mean radius', 'mean texture', 'mean perimeter', 'mean area', 'mean concavity', 'radius error', 'perimeter error', 'area error', 'worst radius', 'worst texture', 'worst perimeter', 'worst area', 'worst compactness', 'worst concavity', 'worst concave points']

********************  Pseudo R-Squared of Complete Model :  ********************

MacFadden's R-Squared : 0.903143004700658 



In [56]:
# Note that this appears to contain both standardized and unstandardized variables. 
breast_cancer_data

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target,intercept
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0,1
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0,1
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0,1
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0,1
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0,1
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0,1
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0,1
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0,1


In [66]:
incr_variable_rsquare=dominance_classification.incremental_rsquare()

Selecting 15 Best Predictors for the Model
Selected Predictors :  ['mean radius', 'mean texture', 'mean perimeter', 'mean area', 'mean concavity', 'radius error', 'perimeter error', 'area error', 'worst radius', 'worst texture', 'worst perimeter', 'worst area', 'worst compactness', 'worst concavity', 'worst concave points']

Creating models for 32767 possible combinations of 15 features :


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [04:46<00:00, 19.10s/it]


#########################  Model Training Done!!!!!  #########################

#########################  Calculating Variable Dominances  #########################


100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [04:38<00:00, 19.89s/it]


#########################  Variable Dominance Calculation Done!!!!!  #########################



In [67]:
dominance_classification.plot_incremental_rsquare()

In [68]:
dominance_classification.dominance_stats()

Unnamed: 0,Interactional Dominance,Individual Dominance,Average Partial Dominance,Total Dominance,Percentage Relative Importance
worst concave points,0.0312533,0.666706,0.069722,0.106957,11.8427
worst perimeter,4.57937e-05,0.721229,0.045492,0.0875114,9.68965
worst area,0.000118113,0.69307,0.047056,0.0869941,9.63238
worst radius,0.000759372,0.695107,0.046161,0.0863974,9.5663
mean perimeter,4.13742e-05,0.594799,0.029577,0.0652897,7.22916
mean radius,0.000524557,0.560829,0.029433,0.062932,6.96811
mean area,0.000120274,0.566623,0.027674,0.061767,6.83911
area error,0.000414825,0.521582,0.027274,0.058437,6.4704
mean concavity,0.000328419,0.49001,0.023343,0.0529198,5.85951
worst concavity,0.00107721,0.417522,0.024757,0.0493627,5.46565


In [69]:
dominance_classification.dominance_level()

Unnamed: 0,Predictors,Generally Dominating,Conditionally Dominating,Completely Dominating
0,worst concave points,"[worst perimeter, worst area, worst radius, me...",,
1,worst perimeter,"[worst area, worst radius, mean perimeter, mea...",,
2,worst area,"[worst radius, mean perimeter, mean radius, me...",,
3,worst radius,"[mean perimeter, mean radius, mean area, area ...",,
4,mean perimeter,"[mean radius, mean area, area error, mean conc...",,
5,mean radius,"[mean area, area error, mean concavity, worst ...",,
6,mean area,"[area error, mean concavity, worst concavity, ...",,
7,area error,"[mean concavity, worst concavity, worst textur...",,
8,mean concavity,"[worst concavity, worst texture, worst compact...",,
9,worst concavity,"[worst texture, worst compactness, radius erro...",,
