In [1]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m73.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m95.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.15.1 tokenizers-0.13.3 transformers-4.29.2


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification
import tensorflow as tf

In [3]:
# Load the raw dataset from a CSV in the working directory; later cells read
# its 'Subject', 'Text' and 'Type' columns.
df = pd.read_csv('phishing_data_by_type.csv')

In [4]:
# Replace missing values with empty strings in both text columns so the
# string concatenation that follows never meets a NaN.
for text_column in ('Subject', 'Text'):
    df[text_column] = df[text_column].fillna('')

In [5]:
# Join subject and body into one string per row, separated by a single space;
# this combined column is what gets split and tokenized further down.
df['full_text'] = df['Subject'] + ' ' + df['Text']

def check_label(x):
    """Map a value of the 'Type' column to a binary target.

    Returns 1 when ``x`` is exactly 'Phishing' or 'Fraud' (case-sensitive)
    and 0 for any other value.
    """
    return 1 if x in ('Phishing', 'Fraud') else 0

# Derive the binary target from 'Type' via check_label
# (1 = 'Phishing' or 'Fraud', 0 = anything else).
df['label'] = df['Type'].apply(check_label)

In [6]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split, and therefore every metric reported further down, repeatable across
# "Restart & Run All"; without it each run evaluates on a different subset.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['full_text'].tolist(),
    df['label'].tolist(),
    test_size=0.2,
    random_state=42,
)

In [7]:
# Tokenizer and classifier are both loaded from the same pretrained
# checkpoint; the classifier is configured with two output labels.
checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint)
model = TFDistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/363M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'activation_13', 'vocab_projector', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_19', 'pre_classifier', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

In [8]:
# Encode both splits with identical tokenizer settings: truncation and
# padding turned on, with a 128-token length cap.
encode_options = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(train_texts, **encode_options)
test_encodings = tokenizer(test_texts, **encode_options)

In [9]:
# Training pipeline: pair each encoded example with its label, shuffle, batch.
# The shuffle buffer spans the whole training set; a fixed buffer of 1000 only
# mixes examples within a sliding window once the data grows past 1000 rows.
# max(..., 1) keeps the buffer size positive, as shuffle() requires.
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
)).shuffle(max(len(train_labels), 1)).batch(16)

# Evaluation pipeline: same pairing and batch size but no shuffling, so the
# predictions line up with test_labels when the metrics are computed.
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels
)).batch(16)

In [10]:
# Adam at a 5e-5 learning rate; the loss is configured for raw logits
# (from_logits=True) and accuracy is tracked as the metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

In [11]:
# Fine-tune for 3 epochs. Note: the held-out test split doubles as the
# validation set here, so there is no separate validation data.
model.fit(train_dataset, epochs=3, validation_data=test_dataset)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f1804623b20>

In [12]:
# Run the trained model over the batched test set; the result's .logits
# attribute is consumed when the predicted classes are derived.
predictions = model.predict(test_dataset)



In [13]:
# Predicted class id per example = index of the largest logit along axis 1.
y_pred = tf.argmax(predictions.logits, axis=1)

In [14]:
# Report each score for the held-out split, one line per metric.
for caption, scorer in (
    ('Accuracy:', accuracy_score),
    ('Recall:', recall_score),
    ('Precision:', precision_score),
    ('F1 score:', f1_score),
):
    print(caption, scorer(test_labels, y_pred))

Accuracy: 0.875
Recall: 0.75
Precision: 1.0
F1 score: 0.8571428571428571


In [15]:
def predict_fraud(text, model, tokenizer):
    """Classify a single piece of text as "Fraud" or "Not Fraud".

    Args:
        text: The string to classify.
        model: Object whose ``predict`` result exposes a ``.logits`` attribute.
        tokenizer: Callable used to encode ``text``; invoked with truncation
            and padding enabled and ``max_length=128``.

    Returns:
        "Fraud" when the highest-scoring class index is 1, otherwise
        "Not Fraud".
    """
    # Wrap the text in a list so the tokenizer receives a batch of one.
    encoded = tokenizer([text], truncation=True, padding=True, max_length=128)

    # Present the encoded features to the model as a single one-element batch.
    single_batch = tf.data.Dataset.from_tensor_slices(dict(encoded)).batch(1)
    output = model.predict(single_batch)

    # Index of the largest logit for the only example in the batch.
    predicted_class = tf.argmax(output.logits, axis=1).numpy()[0]
    if predicted_class == 1:
        return "Fraud"
    return "Not Fraud"

# Manual smoke test of predict_fraud. The sample below contains only blank
# lines; put the message to classify between the triple quotes.
text = """

"""
print(predict_fraud(text, model, tokenizer))

Not Fraud


In [16]:
# Save the fine-tuned model under ./my_model (relative to the working directory).
model.save_pretrained('./my_model')

In [17]:
# Load the model back from ./my_model, rebinding `model` to the loaded copy.
model = TFDistilBertForSequenceClassification.from_pretrained('./my_model')

Some layers from the model checkpoint at ./my_model were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at ./my_model and are newly initialized: ['dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Save the tokenizer files under ./my_tokenizer; the call returns the written paths.
tokenizer.save_pretrained('./my_tokenizer')

('./my_tokenizer/tokenizer_config.json',
 './my_tokenizer/special_tokens_map.json',
 './my_tokenizer/vocab.txt',
 './my_tokenizer/added_tokens.json',
 './my_tokenizer/tokenizer.json')

In [19]:
# Load the tokenizer back from ./my_tokenizer, rebinding `tokenizer` to the loaded copy.
tokenizer = DistilBertTokenizerFast.from_pretrained('./my_tokenizer')

In [20]:
!pip install huggingface_hub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [21]:
# Interactive step: prompts for a Hugging Face access token and whether to
# store it as a git credential. Never paste the token into the notebook itself.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) y
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the '

In [30]:
# Save the model and tokenizer again. The tokenizer call comes last, so the
# tuple of file paths it returns is what this cell displays.
model.save_pretrained('./my_model')
tokenizer.save_pretrained('./my_tokenizer')

('./my_tokenizer/tokenizer_config.json',
 './my_tokenizer/special_tokens_map.json',
 './my_tokenizer/vocab.txt',
 './my_tokenizer/added_tokens.json',
 './my_tokenizer/tokenizer.json')

In [23]:
# Create a model repository named phishNet on the Hugging Face Hub
# (interactive: the command asks for confirmation before creating it).
!huggingface-cli repo create phishNet --type model

[90mgit version 2.25.1[0m
[90mgit-lfs/2.9.2 (GitHub; linux amd64; go 1.13.5)[0m

You are about to create [1mJagannath/phishNet[0m
Proceed? [Y/n] y

Your repo now lives at:
  [1mhttps://huggingface.co/Jagannath/phishNet[0m

You can clone it locally with the command below, and commit/push as usual.

  git clone https://huggingface.co/Jagannath/phishNet



In [25]:
# Clone the Hub repository into a local phishNet directory.
!git clone https://huggingface.co/Jagannath/phishNet

Cloning into 'phishNet'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (3/3), 411 bytes | 411.00 KiB/s, done.


In [26]:
# Copy the saved model files, then the saved tokenizer files, into the root
# of the cloned repository.
!cp -r ./my_model/* ./phishNet/
!cp -r ./my_tokenizer/* ./phishNet/

In [28]:
%cd phishNet

# Configure git with your email and username
!git config --global user.email "jagpra87@gmail.com"
!git config --global user.name "Jagannath"

# Add all files and commit the changes
!git add .
!git commit -m "Initial commit with phishing detection model"

# Push the changes
!git push

/content/phishNet
[main d9a47c0] Initial commit with phishing detection model
 6 files changed, 61252 insertions(+)
 create mode 100644 config.json
 create mode 100644 special_tokens_map.json
 create mode 100644 tf_model.h5
 create mode 100644 tokenizer.json
 create mode 100644 tokenizer_config.json
 create mode 100644 vocab.txt
fatal: could not read Username for 'https://huggingface.co': No such device or address


In [31]:
# Upload the model, then the tokenizer, to the Jagannath/phishNet repository
# through the library API (no local git checkout involved in these calls).
model.push_to_hub("Jagannath/phishNet")
tokenizer.push_to_hub("Jagannath/phishNet")

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

tf_model.h5:   0%|          | 0.00/268M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Jagannath/phishNet/commit/cd565785e3cd60379c89d4a5f6125a85bcc7fc1b', commit_message='Upload tokenizer', commit_description='', oid='cd565785e3cd60379c89d4a5f6125a85bcc7fc1b', pr_url=None, pr_revision=None, pr_num=None)

In [32]:
# Pushes the model to Jagannath/phishNet once more; this is the same
# model.push_to_hub call that the preceding cell already makes.
model.push_to_hub("Jagannath/phishNet")