In [7]:
#base packages
import math, os, scipy, h5py
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from scipy import ndimage
import pandas as pd
from dotenv import load_dotenv 
from tldextract import extract


#tensorflow
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from keras import preprocessing
import tensorflow_datasets as tfds
import tensorflow_text as text

import matplotlib.pyplot as plt
%matplotlib inline
  
    


In [8]:
#!pip install tensorflow_datasets
#!pip install tensorflow-text
#!pip install tldextract

### global configurations

In [9]:
# Query whether TensorFlow runs in eager mode. The returned boolean is not
# shown because this call is not the last expression of the cell.
tf.executing_eagerly()

# Used further down both as the width of the embedding vectors
# (layers.Embedding) and as the padded token-sequence length (pad_sequences
# maxlen), which is why each URL ends up as an EMBED_SIZE x EMBED_SIZE matrix.
EMBED_SIZE=20

### load data from database

In [10]:
from urllib.parse import quote_plus

from sqlalchemy import create_engine

# Read the MySQL credentials from the environment (load_dotenv() adds the
# entries of a local .env file to it).
load_dotenv()
usrname = os.getenv('MYSQL_USER')
passwd = os.getenv('MYSQL_PASSWORD')
dbname = os.getenv('MYSQL_DB')

# Fail early with a readable message; otherwise the string concatenation
# below dies with "TypeError: can only concatenate str (not NoneType)".
missing_settings = [
    env_name
    for env_name, env_value in (('MYSQL_USER', usrname),
                                ('MYSQL_PASSWORD', passwd),
                                ('MYSQL_DB', dbname))
    if env_value is None
]
if missing_settings:
    raise RuntimeError("Missing environment variable(s): " + ", ".join(missing_settings))

# URL-escape user and password so characters such as '@', ':' or '/' in the
# credentials cannot corrupt the connection URL.
connectstring = ("mysql://" + quote_plus(usrname) + ":" + quote_plus(passwd)
                 + "@localhost/" + dbname + "?charset=utf8")

# The former `encoding="utf8"` keyword is omitted: it is deprecated in
# SQLAlchemy 1.4 and rejected by 2.0, and the charset is already requested
# through the URL query string above.
dbengine = create_engine(connectstring)


In [11]:
#Load #phishnet database
#use the DB/ This is only for you.

def load_url_data():
    """Fetch the labelled URL table from MySQL as a pandas DataFrame.

    Runs one fixed query through the notebook-level ``dbengine``. The query
    unions three URL sources (two labelled ``y = 1``, one labelled ``y = 0``),
    left-joins the click table on the MD5 of the URL, reports ``y = 2``
    wherever a ``clicked_dt`` exists, and returns the rows in random order.

    Returns:
        DataFrame with the columns ``url`` and ``y``.
    """
    labelled_urls_sql = '''select tmp1.url,if(click.clicked_dt is null, tmp1.y, 2) as y  from 
                (SELECT url, 1 as y FROM ternary_fund.redditdata
                 union select url, 1 as y from RecNet.reddit_urls
                 union select url, 0 as y from RecNet.phishing_data) as tmp1

                left join RecNet.click_data as click on
                MD5(tmp1.url) = MD5(click.url)
                
                order by rand();
                '''
    return pd.read_sql_query(labelled_urls_sql, dbengine)

# Load the labelled URLs into a pandas DataFrame.
df = load_url_data()
print("Number of observations " +str(len(df.index)))

# Class balance of the raw query result (i.e. before duplicates are removed).
grouped_df=df.groupby(['y'])['y']
print(grouped_df.describe())

# Drop exact duplicate rows, then renumber the index 0..n-1. Without the
# reset the index keeps gaps, so a label lookup such as df[col][1] no longer
# refers to the same row as position 1 of arrays derived from the frame.
df = df.drop_duplicates().reset_index(drop=True)
y_data = df['y'].to_numpy()


Number of observations 17522
     count  mean  std  min  25%  50%  75%  max
y                                             
0  15963.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0
1   1448.0   1.0  0.0  1.0  1.0  1.0  1.0  1.0
2    111.0   2.0  0.0  2.0  2.0  2.0  2.0  2.0


### Tokenize the URL


In [12]:
#URL tokenizer
#INPUT  : A URL as str (UTF-8 encoded bytes are accepted as well)
#OUTPUT : A list of unique tokens, in arbitrary order

def getTokens(urlinput):
    """Split a URL into a de-duplicated list of tokens.

    The URL is cut at '/', each piece is cut at '-', and every dash piece is
    kept both as-is and cut at '.'. The token 'com' is removed from the result.

    The text of the URL is tokenised directly. Tokenising ``str(url.encode())``
    instead would tokenise the bytes *repr* and leak its ``b'`` prefix and
    closing quote into the first and last token.

    Args:
        urlinput: URL as ``str``; ``bytes`` are decoded as UTF-8 first.

    Returns:
        list[str]: the unique tokens. Order is not defined because the
        de-duplication goes through a set. Empty tokens (e.g. from '//') are kept.
    """
    if isinstance(urlinput, bytes):
        urlinput = urlinput.decode('utf-8', errors='replace')

    unique_tokens = set()
    for path_segment in urlinput.split('/'):
        for dash_piece in path_segment.split('-'):
            # Keep the dash-level piece and its dot-level parts.
            unique_tokens.add(dash_piece)
            unique_tokens.update(dash_piece.split('.'))

    unique_tokens.discard('com')
    return list(unique_tokens)

In [13]:
def getTokens_into_characters(urlinput):
    """Split a URL into its individual characters, without the separators.

    The separators '/', '-' and '.' as well as whitespace are dropped; every
    other character becomes one token, in its original order.

    The URL text is used directly. Going through ``str(url.encode())`` plus
    ``.strip("b'")`` would remove genuine leading/trailing 'b' characters
    (strip takes a character set, not a prefix) and would turn non-ASCII
    characters into ``\\x..`` escape sequences.

    Args:
        urlinput: URL as ``str``; ``bytes`` are decoded as UTF-8 first.

    Returns:
        list[str]: one single-character string per kept character.
    """
    if isinstance(urlinput, bytes):
        urlinput = urlinput.decode('utf-8', errors='replace')

    separators = '/-.'
    return [char for char in urlinput
            if char not in separators and not char.isspace()]

In [14]:
def getTokens_into_words(urlinput):
    """Split a URL into word tokens at '/', '-', '.' and whitespace.

    Empty tokens are dropped and the original order (including repeats) is
    preserved.

    The URL text is used directly. Going through ``str(url.encode())`` plus
    ``.strip("b'")`` would remove genuine leading/trailing 'b' characters
    (strip takes a character set, not a prefix) and would turn non-ASCII
    characters into ``\\x..`` escape sequences.

    Args:
        urlinput: URL as ``str``; ``bytes`` are decoded as UTF-8 first.

    Returns:
        list[str]: the word tokens in order of appearance.
    """
    if isinstance(urlinput, bytes):
        urlinput = urlinput.decode('utf-8', errors='replace')

    # Turn every separator into a blank, then let split() cut at runs of
    # whitespace and discard the empty pieces.
    for separator in ('/', '-', '.'):
        urlinput = urlinput.replace(separator, ' ')
    return urlinput.split()

In [15]:
def getURLInfo(url):
    """Return the ``(domain, suffix)`` pair that tldextract reports for ``url``.

    The URL is parsed once and both fields are read from that single result.
    """
    parsed = extract(url)
    return parsed.domain, parsed.suffix

In [16]:
# Derive the token columns from the raw URL. Plain column assignment is used
# instead of df.insert so the cell can be re-run: insert raises ValueError as
# soon as the column already exists.
tokenized_url_word = df['url'].map(getTokens_into_words)
df['tokenized_url_word'] = tokenized_url_word

tokenized_url_char = df['url'].map(getTokens_into_characters)
df['tokenized_url_char'] = tokenized_url_char

# Run tldextract once per URL (getURLInfo returns (domain, suffix)) and read
# both columns from that single result.
url_info = df['url'].map(getURLInfo)

tld = url_info.map(lambda info: info[1])
df['tld'] = tld

domain = url_info.map(lambda info: info[0])
df['domain'] = domain



In [17]:
#This is for testing the tokenization effort  ; commented out since it works
#df[['tokenized_url_word','url','domain','tld']].to_csv('tokenized-result.csv')   
#what does that really measure?
#val= df.tokenized_url_word.map(len).max()
#print("Longest vector with words " + str(val))

#focus embeddings!

### Encode the URL from text sequences to integers per vocabulary

In [18]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Build the vocabulary from the word tokens and map every URL to a sequence
# of integer token ids.
tf_keras_tokenizer = Tokenizer()
tf_keras_tokenizer.fit_on_texts(df['tokenized_url_word'])
tf_keras_encoded = tf_keras_tokenizer.texts_to_sequences(df['tokenized_url_word'])

# Bring every sequence to exactly EMBED_SIZE ids; shorter ones are filled
# with 0 at the end (padding="post").
# NOTE(review): `truncating` is left at its default, so for URLs with more
# than EMBED_SIZE tokens check which end gets cut - confirm this is intended.
tf_keras_encoded = preprocessing.sequence.pad_sequences(tf_keras_encoded, padding="post", maxlen=EMBED_SIZE)
print(tf_keras_encoded.shape)

print("*** sample check ***")
# Compare by position (.iloc): tf_keras_encoded rows are positional, while
# df[...][i] looks up the index *label*, which can differ from the position
# once rows have been dropped from the frame.
for sample_pos in (1, 2):
    print(df['tokenized_url_word'].iloc[sample_pos])
    print(tf_keras_encoded[sample_pos])

(17515, 20)
*** sample check ***
['http:', '92', '119', '115', '94', 'xmlrpc', 'php']
[   3 1136 1363 2443  843 8674    7    0    0    0    0    0    0    0
    0    0    0    0    0    0]
['http:', 'rangelferreira', 'adv', 'br']
[   3 8675 2444   35    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0]


In [19]:
# Reverse lookup: show the token that the tokenizer mapped to integer id 5000.
tf_keras_tokenizer.index_word[5000]  

'betasus6'

In [20]:
#Apply the Q-learning approach
#We have now a tokenized representation of the url
#Next step token to integer
#then integer to sense vector (the embedding)

In [21]:
#print(max(encoded_urls))
# index_word is keyed by integer token id, so max() over it yields the
# largest id in the vocabulary.
print(max(tf_keras_tokenizer.index_word ) )



27348


### Embedding layers
An embedding is a dense vector of floating point values (the length of the vector is a parameter you specify). Instead of specifying the values for the embedding manually, they are trainable parameters 
(weights learned by the model during training, in the same way a model learns weights for a dense layer). 
It is common to see word embeddings that are 8-dimensional (for small datasets),
up to 1024-dimensions when working with large datasets. A higher dimensional embedding can capture fine-grained
relationships between words, but takes more data to learn.

The Embedding layer can be understood as a lookup table that maps from integer indices 
(which stand for specific words) to dense vectors (their embeddings).
The dimensionality (or width) of the embedding is a parameter you can experiment with to see what works well for your problem, much in the same way you would experiment with the number of neurons in a Dense layer.


In [23]:

# Largest token id + 1, so the embedding table has a row for every id the
# tokenizer produced as well as for id 0, which pad_sequences used as filler.
VOCAB_LEN = max(tf_keras_tokenizer.index_word ) +1
print(VOCAB_LEN)

# Lookup table with VOCAB_LEN rows of EMBED_SIZE floats each.
embedding_layer = layers.Embedding(VOCAB_LEN, EMBED_SIZE)

# Replace every token id by its embedding vector:
# (n_urls, EMBED_SIZE) ids -> (n_urls, EMBED_SIZE, EMBED_SIZE) floats.
# NOTE(review): the layer is called here outside of any model and its output
# is later fed to the classifier as plain input data, so no visible cell ever
# trains these embedding weights - confirm that this is intended.
result = embedding_layer(tf_keras_encoded)
#result.numpy()

#Voila, now I have a list of tokenized, vocabbed, and embedded words. 
#Happiness arises

print(result.shape)
print(type(result))
# Row 5000 is the URL at position 5000: one EMBED_SIZE-wide vector per padded
# token position. It is not the embedding of the token whose id is 5000.
print(result[5000])

27349
(17515, 20, 20)
<class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor(
[[ 0.0235973   0.00458556  0.03265947  0.02491995  0.01176801  0.03566331
   0.02945307  0.04119417  0.01167879 -0.00541974 -0.01030164  0.04945549
   0.03076721 -0.01643503  0.04844477 -0.04776831 -0.04961973 -0.01994123
  -0.04586731  0.03668361]
 [-0.04161888  0.00960376  0.04958213  0.01719692  0.04383865 -0.03821521
   0.00884803  0.01316024  0.00709238 -0.04462513  0.0085342  -0.02788415
  -0.0020823   0.02606661 -0.01360477  0.02249715  0.03385003  0.04130704
   0.04059443  0.01198201]
 [ 0.04853312  0.04253754  0.04314449 -0.01307096  0.03372278 -0.04000518
   0.0305846   0.03284589  0.01849871 -0.04812187  0.00838912  0.0123531
  -0.02486642 -0.01130117  0.00907809  0.01639544 -0.00414705  0.00522097
   0.01436586 -0.0228098 ]
 [-0.01335535 -0.04597292 -0.01674479  0.00064097  0.02425278 -0.04278617
   0.0218132  -0.03670419  0.04429739  0.02804651 -0.03418125 -0.01292949
   0.04862865 -0.01

When you create an Embedding layer, the weights for the embedding are randomly initialized (just like any other layer). During training, they are gradually adjusted via backpropagation. Once trained, the learned word embeddings will roughly encode similarities between words (as they were learned for the specific problem your model is trained on).

If you pass an integer to an embedding layer, the result replaces each integer with the vector from the embedding table:

For text or sequence problems, the Embedding layer takes a 2D tensor of integers, of shape (samples, sequence_length), where each entry is a sequence of integers. It can embed sequences of variable lengths. You could feed into the embedding layer above batches with shapes (32, 10) (batch of 32 sequences of length 10) or (64, 15) (batch of 64 sequences of length 15).

The returned tensor has one more axis than the input, the embedding vectors are aligned along the new last axis. Pass it a (2, 3) input batch and the output is (2, 3, N)

In [24]:
# Pair every embedded URL with its label and prepare the tensors for the CNN.
#DATASET_SIZE = result.shape[0]


dataset = tf.data.Dataset.from_tensor_slices((result, y_data))
# shuffle() and batch() return new Dataset objects instead of modifying the
# existing one, so the result has to be assigned back; as a bare statement
# the call has no effect.
dataset = dataset.shuffle(buffer_size=1024).batch(32)

# Conv2D expects a trailing channel axis: (n, EMBED_SIZE, EMBED_SIZE, 1).
result2 = tf.reshape(result, (-1, EMBED_SIZE, EMBED_SIZE,1))
# Labels as a column vector: (n, 1).
y_data2 = tf.reshape(y_data, (-1, 1))

print(result.shape)
print(result2.shape)
print(y_data.shape)
print(y_data2.shape)

print(type(result))
print(type(result2))

print(y_data2)

(17515, 20, 20)
(17515, 20, 20, 1)
(17515,)
(17515, 1)
<class 'tensorflow.python.framework.ops.EagerTensor'>
<class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor(
[[0]
 [0]
 [0]
 ...
 [1]
 [0]
 [1]], shape=(17515, 1), dtype=int64)


In [25]:
# Hold out the last VAL_SIZE rows for evaluation and train on the rest.
# The rows arrive in random order from the SQL query (order by rand()), so
# taking the tail is a random hold-out rather than a systematic one.
VAL_SIZE = 3000
assert result2.shape[0] > VAL_SIZE, "dataset is too small for the requested hold-out size"

x_val = result2[-VAL_SIZE:]
y_val = y_data2[-VAL_SIZE:]
x_train = result2[:-VAL_SIZE]
y_train = y_data2[:-VAL_SIZE]

print(x_train.shape)
print(y_train.shape)
print(x_val.shape)
print(y_val.shape)

(14515, 20, 20, 1)
(14515, 1)
(3000, 20, 20, 1)
(3000, 1)


### Word based model

In [26]:
#256 h-length convolutional filters 3,4,5,6 - for WORD level
# NOTE(review): only one convolution with a 3x3 kernel is built here; the
# kernel sizes 4, 5 and 6 mentioned above are not implemented in this cell.
wordmodel = models.Sequential()
# Input: one EMBED_SIZE x EMBED_SIZE single-channel "image" per URL.
wordmodel.add(layers.Conv2D(filters=256,kernel_size=(3, 3), strides= (1, 1), padding='same', activation='relu', input_shape=(EMBED_SIZE, EMBED_SIZE, 1)))
wordmodel.add(layers.MaxPooling2D((2, 2)))
# NOTE(review): this Dense layer is added before Flatten, so it acts on the
# channel axis of every spatial position (the summary shows (None, 10, 10, 512))
# and the flattened vector grows to 51200 values - confirm this is intended.
wordmodel.add(layers.Dense(512, activation='relu'))


In [27]:
# Classifier head: flatten the feature maps and narrow them down through
# three fully connected layers.
wordmodel.add(layers.Flatten())
wordmodel.add(layers.Dense(512, activation='relu'))
wordmodel.add(layers.Dense(256, activation='relu'))
wordmodel.add(layers.Dense(128, activation='relu'))
# One output per class (labels 0, 1, 2). Softmax turns the outputs into
# probabilities, which is what the loss configured in the compile cell
# (SparseCategoricalCrossentropy with from_logits=False) expects; without an
# activation the layer would emit raw logits and the loss would be wrong.
wordmodel.add(layers.Dense(3, activation='softmax'))

In [28]:
# The labels are integer class ids, hence the sparse categorical cross-entropy.
# from_logits=False tells the loss that the model's last layer outputs
# probabilities (e.g. via softmax) rather than raw logits.
wordmodel.compile(optimizer='adam',loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),metrics=['accuracy'])

# Earlier two-class variant, kept for reference:
#wordmodel.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



In [29]:
wordmodel.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 20, 20, 256)       2560      
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 10, 10, 256)       0         
_________________________________________________________________
dense (Dense)                (None, 10, 10, 512)       131584    
_________________________________________________________________
flatten (Flatten)            (None, 51200)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               26214912  
_________________________________________________________________
dense_2 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_3 (Dense)              (None, 128)               3

In [None]:
# Train for 2 epochs (this used to be 10 - raise it again if needed).
# validation_split=0.1 makes Keras set aside 10% of x_train/y_train for
# per-epoch validation metrics; x_val/y_val are not touched here.

history = wordmodel.fit(x_train, y_train, epochs=2, validation_split=0.1, verbose=1)



Epoch 1/2

In [None]:
# Evaluate the model on the held-out rows (x_val / y_val) using `evaluate`.
# `results` holds the loss followed by the metric passed to compile (accuracy).
print("Evaluate on test data")
results = wordmodel.evaluate(x_val,y_val, batch_size=128)
print("test loss, test acc:", results)

# Generate predictions (the outputs of the model's last layer, one value per
# class) for the first three held-out rows using `predict`.
print("Generate predictions for 3 samples")
predictions = wordmodel.predict(x_val[:3])
print("predictions shape:", predictions.shape)

print(predictions)

### Compiling and fitting the models

In [None]:
# Names of the values returned by wordmodel.evaluate(), in the same order.
wordmodel.metrics_names

In [None]:
# Training curves. Accuracy and loss get separate axes: the loss is not an
# accuracy and does not fit an axis labelled "Accuracy" clipped to [0.5, 1].
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

# The val_* series only exist when fit() was given validation data, so plot
# whatever the history actually contains.
for metric_key in ('accuracy', 'val_accuracy'):
    if metric_key in history.history:
        ax_acc.plot(history.history[metric_key], label=metric_key)
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Accuracy')
ax_acc.set_ylim([0.5, 1])
ax_acc.legend(loc='lower right')

for metric_key in ('loss', 'val_loss'):
    if metric_key in history.history:
        ax_loss.plot(history.history[metric_key], label=metric_key)
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.legend(loc='upper right')

plt.show()

# Report test metrics on the held-out rows only. Evaluating on the complete
# data set (result2 / y_data2) would include the rows the model was trained
# on and overstate the accuracy.
test_loss, test_acc = wordmodel.evaluate(x_val, y_val, verbose=2)


In [None]:
# Accuracy returned by the wordmodel.evaluate() call in the previous cell.
print(test_acc)
