# LogReg S-Uniward 0.3 vs Miranda Attributes

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import os, random, time, MirandaAttributes, cv2
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn import preprocessing
import seaborn as sns
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
print("TensorFlow version: {}".format(tf.__version__))
print("Eager execution: {}".format(tf.executing_eagerly()))

#Miranda dataset
train_dataset_url = "https://raw.githubusercontent.com/intentodemusico/StegianV2/master/DatasetMiranda/features_train_70000.csv"
test_dataset_url = "https://raw.githubusercontent.com/intentodemusico/StegianV2/master/DatasetMiranda/features_test_70000.csv"

TensorFlow version: 2.3.0
Eager execution: True


In [3]:
#%% Importing the dataset
MirandaLabels=['Kurtosis', 'Skewness', 'Std', 'Range', 'Median', 'Geometric_Mean', 'Mobility', 'Complexity']
trainDataset = pd.read_csv(train_dataset_url)
MirandaX_train = trainDataset.iloc[:, :-1].values
MirandaY_train = trainDataset.iloc[:, -1].values

testDataset = pd.read_csv(test_dataset_url)
MirandaX_test = testDataset.iloc[:, :-1].values
MirandaY_test = testDataset.iloc[:, -1].values

In [4]:
MirandaX_train

array([[1.502380e+01, 2.910000e+00, 2.747740e+02, ..., 1.604439e+02,
        1.477400e-01, 8.579000e-01],
       [2.801000e+00, 9.223200e-01, 2.357319e+02, ..., 1.453054e+02,
        1.200900e-01, 1.298700e+00],
       [3.988500e+00, 8.873200e-01, 1.745089e+02, ..., 1.826219e+02,
        2.003400e-01, 1.596500e+00],
       ...,
       [7.551800e+00, 2.032900e+00, 2.796423e+02, ..., 1.376493e+02,
        9.355000e-01, 1.760300e+00],
       [3.402200e+01, 4.804000e+00, 4.167042e+02, ..., 1.391423e+02,
        9.767900e-01, 1.521500e+00],
       [2.003410e+01, 3.686400e+00, 4.570151e+02, ..., 8.242910e+01,
        9.437100e-01, 1.696200e+00]])

In [5]:
#%% Feature Scaling
from sklearn.preprocessing import StandardScaler

In [6]:
MirandaLabels

['Kurtosis',
 'Skewness',
 'Std',
 'Range',
 'Median',
 'Geometric_Mean',
 'Mobility',
 'Complexity']

## Getting x and y

In [7]:
isStego=lambda x:1 if x[:6]=="stego_" else 0
getYFromImageList=lambda x: [isStego(element) for element in x]
folder="../../../Dataset/Mono/MonoSamples/S-Uniward_5" 
images=os.listdir(folder)
random.shuffle(images)
images=images
print(len(images))

40000


In [8]:
Garciay=getYFromImageList(images)
windowSize=128
GarciaLabels=['Kurtosis', 'Skewness', 'Std', 'Range', 'Median', 'Garcia_Gmean', 'Epsilon_Gmean', 'Mobility', 'Complexity']
t = time.time()

GarciaX=pd.DataFrame(([ MirandaAttributes.attributes( cv2.imread( os.path.abspath(os.path.join(folder, images[i])) 
                                        ) ,windowSize,True)  for i in range(len(images))] ),columns=GarciaLabels)
elapsed = time.time() - t
print(elapsed)
GarciaX_train, GarciaX_test, GarciaY_train, GarciaY_test = train_test_split(GarciaX, Garciay, test_size=0.2, 
                                                                            random_state=0)

58.70069861412048


### EDA

In [9]:
pd.DataFrame(GarciaY_train).describe()

Unnamed: 0,0
count,32000.0
mean,0.501719
std,0.500005
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [10]:
pd.DataFrame(GarciaY_train).value_counts()

1    16055
0    15945
dtype: int64

In [11]:
GarciaX_train

Unnamed: 0,Kurtosis,Skewness,Std,Range,Median,Garcia_Gmean,Epsilon_Gmean,Mobility,Complexity
21370,6.527872,0.678969,44.381516,333.0,72.0,42.105972,41.294365,0.426025,2.881165
2470,3.816413,1.440355,96.161285,347.0,6.0,34.411018,10.060097,0.116640,13.622136
13767,5.650376,1.632887,68.657555,306.0,47.5,41.688362,27.200480,0.207866,7.668610
13316,247.390869,15.601495,590.595886,9433.0,14.0,13.074867,9.008055,0.925549,1.061311
26374,28.047445,4.758531,153.528824,1140.0,16.5,21.499170,15.268115,0.280361,2.946554
...,...,...,...,...,...,...,...,...,...
20757,9.688196,2.183486,63.360821,412.0,51.0,36.563606,35.746410,0.248716,5.540549
32103,140.515915,10.444835,93.935944,1356.0,52.0,44.208027,45.243896,0.854912,1.171925
30403,58.621365,6.569932,172.944351,1907.0,7.0,31.849165,9.623127,0.790832,2.054909
21243,4.317540,1.401220,84.008652,359.0,23.5,60.635574,12.082829,0.144717,10.567527


In [12]:
GarciaX_train.describe()

Unnamed: 0,Kurtosis,Skewness,Std,Range,Median,Garcia_Gmean,Epsilon_Gmean,Mobility,Complexity
count,32000.0,32000.0,32000.0,32000.0,32000.0,32000.0,32000.0,32000.0,32000.0
mean,23.94464,2.936126,109.358551,857.559469,32.234141,53.161411,26.293143,0.324207,5.919156
std,48.732212,3.248193,98.174597,1542.599636,20.927298,424.742675,14.564544,0.217895,3.04295
min,1.238583,-1.234521,14.475194,84.0,0.0,1.857177,1.038634,0.085735,0.894159
25%,3.145552,1.021595,58.872959,250.0,14.0,27.462421,14.357998,0.184116,3.539029
50%,6.361457,1.884496,81.451889,392.0,32.5,37.12981,25.055899,0.252372,5.630137
75%,16.713444,3.422678,121.025984,749.0,49.5,46.183062,37.225324,0.375313,8.020814
max,254.003906,15.906097,1021.998047,16384.0,90.0,16384.0,63.118073,1.41976,16.138912


##### Dataset looks homogeneous

In [13]:
trainDataset

Unnamed: 0,Kurtosis,Skewness,Std,Range,Median,Geometric_Mean,Mobility,Complexity,Tag
0,15.0238,2.91000,274.7740,1789,228.0,160.4439,0.147740,0.8579,0
1,2.8010,0.92232,235.7319,917,179.0,145.3054,0.120090,1.2987,0
2,3.9885,0.88732,174.5089,817,260.0,182.6219,0.200340,1.5965,0
3,5.4531,1.83640,343.9752,1336,69.5,120.5608,0.083738,1.2973,0
4,15.8010,3.38050,443.4495,2785,118.0,113.9388,0.231590,1.2115,0
...,...,...,...,...,...,...,...,...,...
55994,5.0543,1.22740,172.0252,981,238.0,205.0903,0.945960,1.7109,1
55995,89.9023,8.30370,614.5032,7673,122.5,100.0387,0.988840,1.5964,1
55996,7.5518,2.03290,279.6423,1520,156.0,137.6493,0.935500,1.7603,1
55997,34.0220,4.80400,416.7042,3971,116.5,139.1423,0.976790,1.5215,1


In [14]:
trainDataset.iloc[:,:-1].describe()

Unnamed: 0,Kurtosis,Skewness,Std,Range,Median,Geometric_Mean,Mobility,Complexity
count,55999.0,55999.0,55999.0,55999.0,55999.0,55999.0,55999.0,55999.0
mean,23.861412,3.041558,367.443301,2756.949963,148.46774,137.751112,0.448144,1.510563
std,43.134687,2.957414,249.044479,3567.278219,72.227963,69.236967,0.30113,0.333588
min,1.1379,-1.5404,59.8841,338.0,0.0,4.8267,0.048408,0.086309
25%,4.01075,1.23515,221.2457,1055.0,96.0,99.4586,0.18399,1.303
50%,7.9529,2.1234,299.0715,1646.0,156.0,138.097,0.38407,1.5777
75%,19.8208,3.64425,427.83185,2924.0,203.0,174.0896,0.723175,1.78025
max,253.9424,15.9032,3624.7304,58017.0,358.0,3488.3973,1.4081,1.9601


In [15]:
trainDataset.describe()

Unnamed: 0,Kurtosis,Skewness,Std,Range,Median,Geometric_Mean,Mobility,Complexity,Tag
count,55999.0,55999.0,55999.0,55999.0,55999.0,55999.0,55999.0,55999.0,55999.0
mean,23.861412,3.041558,367.443301,2756.949963,148.46774,137.751112,0.448144,1.510563,0.500009
std,43.134687,2.957414,249.044479,3567.278219,72.227963,69.236967,0.30113,0.333588,0.500004
min,1.1379,-1.5404,59.8841,338.0,0.0,4.8267,0.048408,0.086309,0.0
25%,4.01075,1.23515,221.2457,1055.0,96.0,99.4586,0.18399,1.303,0.0
50%,7.9529,2.1234,299.0715,1646.0,156.0,138.097,0.38407,1.5777,1.0
75%,19.8208,3.64425,427.83185,2924.0,203.0,174.0896,0.723175,1.78025,1.0
max,253.9424,15.9032,3624.7304,58017.0,358.0,3488.3973,1.4081,1.9601,1.0


In [16]:
MirandaX_train

array([[1.502380e+01, 2.910000e+00, 2.747740e+02, ..., 1.604439e+02,
        1.477400e-01, 8.579000e-01],
       [2.801000e+00, 9.223200e-01, 2.357319e+02, ..., 1.453054e+02,
        1.200900e-01, 1.298700e+00],
       [3.988500e+00, 8.873200e-01, 1.745089e+02, ..., 1.826219e+02,
        2.003400e-01, 1.596500e+00],
       ...,
       [7.551800e+00, 2.032900e+00, 2.796423e+02, ..., 1.376493e+02,
        9.355000e-01, 1.760300e+00],
       [3.402200e+01, 4.804000e+00, 4.167042e+02, ..., 1.391423e+02,
        9.767900e-01, 1.521500e+00],
       [2.003410e+01, 3.686400e+00, 4.570151e+02, ..., 8.242910e+01,
        9.437100e-01, 1.696200e+00]])

#### Standard scalling

In [17]:
scG=StandardScaler()
sc_GarciaX_train=pd.DataFrame(scG.fit_transform(GarciaX_train),columns=GarciaLabels)
sc_GarciaX_test=pd.DataFrame(scG.transform(GarciaX_test),columns=GarciaLabels)

scM=StandardScaler()
sc_MirandaX_train=pd.DataFrame(scM.fit_transform(MirandaX_train),columns=MirandaLabels)
sc_MirandaX_test=pd.DataFrame(scM.transform(MirandaX_test),columns=MirandaLabels)

#### Recursive Feature Elimination

In [18]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
logregRFEG = LogisticRegression()
rfe = RFE(logregRFEG, n_features_to_select=1)
rfe = rfe.fit(sc_GarciaX_train,GarciaY_train)
print(rfe.ranking_)
print("García's Dataset")
print("Ordered from most to less desired variables:")
desiredVariables=GarciaLabels
sortOrder=rfe.ranking_
sorted(desiredVariables,key=lambda pos: sortOrder[desiredVariables.index(pos)])

[5 4 1 2 8 6 7 3 9]
García's Dataset
Ordered from most to less desired variables:


['Std',
 'Range',
 'Mobility',
 'Skewness',
 'Kurtosis',
 'Garcia_Gmean',
 'Epsilon_Gmean',
 'Median',
 'Complexity']

In [19]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
logregRFEM = LogisticRegression()
rfe = RFE(logregRFEM, n_features_to_select=1)
rfe = rfe.fit(sc_MirandaX_train,MirandaY_train)
print(rfe.ranking_)
print("Miranda's Dataset")
print("Ordered from most to less desired variables:")
desiredVariables=MirandaLabels
sortOrder=rfe.ranking_
sorted(desiredVariables,key=lambda pos: sortOrder[desiredVariables.index(pos)])

[4 3 5 6 7 8 2 1]
Miranda's Dataset
Ordered from most to less desired variables:


['Complexity',
 'Mobility',
 'Skewness',
 'Kurtosis',
 'Std',
 'Range',
 'Median',
 'Geometric_Mean']

RFE Shows prefferences for the variables to be predictors, meaning it to be ordered depending on de predictor-likability of a variable. Proving the anterior analysis, 

### Modelling

In [20]:
import statsmodels.api as sm
logit_model=sm.Logit(MirandaY_train,sc_MirandaX_train)
result=logit_model.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.264509
         Iterations 8
                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.618     
Dependent Variable: y                AIC:              29640.4288
Date:               2021-09-07 08:23 BIC:              29711.8935
No. Observations:   55999            Log-Likelihood:   -14812.   
Df Model:           7                LL-Null:          -38816.   
Df Residuals:       55991            LLR p-value:      0.0000    
Converged:          1.0000           Scale:            1.0000    
No. Iterations:     8.0000                                       
-----------------------------------------------------------------
                  Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
-----------------------------------------------------------------
Kurtosis         -0.7686   0.0737 -10.4343 0.0000 -0.9129 -0.6242
Skewness          1.0767   0.0673  15.9897 0.0000  0.9447  1.2087


In [21]:
sc_GarciaX_train

Unnamed: 0,Kurtosis,Skewness,Std,Range,Median,Garcia_Gmean,Epsilon_Gmean,Mobility,Complexity
0,-0.357403,-0.694907,-0.661862,-0.340054,1.900221,-0.026029,1.029998,0.467288,-0.998386
1,-0.413044,-0.460500,-0.134429,-0.330979,-1.253604,-0.044146,-1.114577,-0.952620,2.531459
2,-0.375410,-0.401226,-0.414584,-0.357558,0.729483,-0.027012,0.062299,-0.533940,0.574930
3,4.585257,3.899265,4.901929,5.559171,-0.871323,-0.094380,-1.186811,2.759827,-1.596451
4,0.084192,0.561061,0.449923,0.183097,-0.751859,-0.074546,-0.756989,-0.201231,-0.976897
...,...,...,...,...,...,...,...,...,...
31995,-0.292551,-0.231714,-0.468537,-0.288841,0.896731,-0.039078,0.649070,-0.346464,-0.124423
31996,2.392116,2.311693,-0.157096,0.323122,0.944516,-0.021080,1.301177,2.435644,-1.560100
31997,0.711588,1.118733,0.647691,0.680317,-1.205819,-0.050178,-1.144579,2.141552,-1.269921
31998,-0.402760,-0.472549,-0.258216,-0.323199,-0.417363,0.017597,-0.975694,-0.823762,1.527611


In [22]:
import statsmodels.api as sm
logit_modelG=sm.Logit(GarciaY_train,sc_GarciaX_train)
result=logit_modelG.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.692882
         Iterations 4
                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.000     
Dependent Variable: y                AIC:              44362.4316
Date:               2021-09-07 08:23 BIC:              44437.7931
No. Observations:   32000            Log-Likelihood:   -22172.   
Df Model:           8                LL-Null:          -22181.   
Df Residuals:       31991            LLR p-value:      0.034439  
Converged:          1.0000           Scale:            1.0000    
No. Iterations:     4.0000                                       
------------------------------------------------------------------
                Coef.   Std.Err.     z     P>|z|    [0.025  0.975]
------------------------------------------------------------------
Kurtosis        0.0381    0.0748   0.5092  0.6106  -0.1085  0.1847
Skewness       -0.0670    0.0679  -0.9861  0.3241  -0.2000  0.

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
#X_train, X_test, y_train, y_test = train_test_split(sc.fit_transform(newX), y, test_size=0.2, random_state=0)
logregMiranda = LogisticRegression()
logregMiranda.fit(sc_MirandaX_train, MirandaY_train)
y_predMiranda = logregMiranda.predict(sc_MirandaX_test)
print("Miranda")
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logregMiranda.score(sc_MirandaX_test, MirandaY_test)))
print(metrics.f1_score(MirandaY_test,y_predMiranda))

Miranda
Accuracy of logistic regression classifier on test set: 0.90
0.9026283923356364


In [30]:
logregGarcia = LogisticRegression()
logregGarcia.fit(sc_GarciaX_train, GarciaY_train)
y_predGarcia = logregGarcia.predict(sc_GarciaX_test)
print("García")
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logregGarcia.score(sc_GarciaX_test, GarciaY_test)))
print(metrics.f1_score(GarciaY_test,y_predGarcia))

García
Accuracy of logistic regression classifier on test set: 0.52
0.4806659505907626


In [None]:
from itertools import chain, combinations
from sklearn.model_selection import cross_val_score
import copy
def best_subset_cv(estimator, X, Y, labels,cv=3):
    n_features = X.shape[1]
    subsets = chain.from_iterable([combinations(labels, k + 1) for k in range(n_features)])
    best_score = -np.inf
    best_subset = None
    for subset in subsets:
        estimatorCopy=copy.copy(estimator)
        score = cross_val_score(estimatorCopy, X.loc[:, list(subset)], Y, cv=cv).mean()
        if score > best_score:
            best_score, best_subset = score, subset

    return best_subset, best_score

In [None]:
logregBSM = LogisticRegression()
subM, scoM=best_subset_cv(logregBSM,sc_MirandaX_train,MirandaY_train,MirandaLabels)
print("Miranda")

In [None]:
subM, scoM

In [None]:
logregBSG = LogisticRegression()
subG, scoG=best_subset_cv(logregBSG,sc_GarciaX_train,GarciaY_train,GarciaLabels)
print("García")

In [None]:
subG, scoG

In [None]:
import seaborn as sns
sns.set_theme(style="dark")
corr = sc_MirandaX_train.corr()
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
ax.tick_params(axis='x', rotation=90)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}).set_title("Miranda's correlation matrix")
plt.show()

In [None]:
sns.set_theme(style="dark")
corr = sc_GarciaX_train.corr()
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
ax.tick_params(axis='x', rotation=90)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}).set_title("Garcías's correlation matrix")
plt.show()

In [31]:
logregGarcia.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}