In [2]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict, load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
torch.cuda.is_available()

True

In [5]:
import torch
print(torch.__version__)
print(torch.version.cuda)
print(torch.cuda.is_available())


2.1.1+cu118
11.8
True


In [6]:
df = pd.read_csv("./text_only.csv",encoding='ISO-8859-1')
df.head()

Unnamed: 0.1,Unnamed: 0,clean_text,spam
0,0,Dumb,0
1,1,As u should ser,0
2,2,"Congratulations, you just became exit liquidity",0
3,3,"Been in the space for years, have never used ...",0
4,4,Me to,0


In [7]:
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,clean_text,spam
0,Dumb,0
1,As u should ser,0
2,"Congratulations, you just became exit liquidity",0
3,"Been in the space for years, have never used ...",0
4,Me to,0


In [8]:
df['spam'].value_counts()

spam
0    629
1    151
Name: count, dtype: int64

In [9]:
df['clean_text'].isna().sum()

20

In [10]:
df = df.dropna()
df.isna().sum()

clean_text    0
spam          0
dtype: int64

In [11]:
len(df)

760

In [12]:
df = df[df['clean_text'] != 'N/A']
len(df)

760

In [13]:
train = df.sample(frac = .8, random_state =42)
valid = df.drop(train.index)

len(train), len(valid)

(608, 152)

In [14]:
train.spam.value_counts()

spam
0    489
1    119
Name: count, dtype: int64

In [15]:
valid.spam.value_counts()

spam
0    120
1     32
Name: count, dtype: int64

In [16]:
train_ds = Dataset.from_pandas(train)
valid_ds = Dataset.from_pandas(train)

train_ds, valid_ds

(Dataset({
     features: ['clean_text', 'spam', '__index_level_0__'],
     num_rows: 608
 }),
 Dataset({
     features: ['clean_text', 'spam', '__index_level_0__'],
     num_rows: 608
 }))

In [17]:
train_ds = train_ds.rename_column('spam','label')
valid_ds = valid_ds.rename_column('spam','label')

train_ds, valid_ds

(Dataset({
     features: ['clean_text', 'label', '__index_level_0__'],
     num_rows: 608
 }),
 Dataset({
     features: ['clean_text', 'label', '__index_level_0__'],
     num_rows: 608
 }))

In [18]:
model = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(model)
model = AutoModelForSequenceClassification.from_pretrained(model)
metric = load_metric("accuracy")

  metric = load_metric("accuracy")


In [19]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [20]:
def tokenize_fn(example):
    return tokenizer(example['clean_text'], truncation=True)

train_tok_ds = train_ds.map(tokenize_fn, batched=True)
valid_tok_ds = valid_ds.map(tokenize_fn, batched=True)

Map: 100%|██████████| 608/608 [00:00<00:00, 17878.38 examples/s]
Map: 100%|██████████| 608/608 [00:00<00:00, 46757.18 examples/s]


In [21]:
train_tok_ds

Dataset({
    features: ['clean_text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 608
})

In [22]:

# Define training arguments
training_args = TrainingArguments(
    output_dir='../HuggingFace-Spam-Detection/',
    num_train_epochs=1,
    per_device_eval_batch_size=1,
    per_device_train_batch_size=1,
    evaluation_strategy='steps',
    eval_steps=100,
    logging_steps=100,
    report_to='none'
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok_ds,
    eval_dataset=valid_tok_ds,
    compute_metrics=compute_metrics
)


In [23]:
trainer.train()

 16%|█▋        | 100/608 [00:06<00:15, 33.10it/s]

{'loss': 1.1039, 'learning_rate': 4.177631578947369e-05, 'epoch': 0.16}


                                                 
 17%|█▋        | 104/608 [00:09<02:12,  3.79it/s] 

{'eval_loss': 0.5244176983833313, 'eval_accuracy': 0.850328947368421, 'eval_runtime': 3.1, 'eval_samples_per_second': 196.127, 'eval_steps_per_second': 196.127, 'epoch': 0.16}


 33%|███▎      | 200/608 [00:12<00:12, 33.96it/s]

{'loss': 0.8274, 'learning_rate': 3.355263157894737e-05, 'epoch': 0.33}


                                                 
 34%|███▎      | 204/608 [00:15<01:40,  4.03it/s] 

{'eval_loss': 0.24453695118427277, 'eval_accuracy': 0.9424342105263158, 'eval_runtime': 2.9069, 'eval_samples_per_second': 209.154, 'eval_steps_per_second': 209.154, 'epoch': 0.33}


 49%|████▉     | 300/608 [00:18<00:09, 33.56it/s]

{'loss': 0.5375, 'learning_rate': 2.5328947368421052e-05, 'epoch': 0.49}


                                                 
 50%|█████     | 304/608 [00:21<01:16,  3.99it/s] 

{'eval_loss': 0.2520639896392822, 'eval_accuracy': 0.9457236842105263, 'eval_runtime': 2.9271, 'eval_samples_per_second': 207.715, 'eval_steps_per_second': 207.715, 'epoch': 0.49}


 66%|██████▌   | 400/608 [00:24<00:05, 34.80it/s]

{'loss': 0.3277, 'learning_rate': 1.7105263157894737e-05, 'epoch': 0.66}


                                                 
 66%|██████▋   | 404/608 [00:27<00:50,  4.05it/s] 

{'eval_loss': 0.11951925605535507, 'eval_accuracy': 0.975328947368421, 'eval_runtime': 2.901, 'eval_samples_per_second': 209.581, 'eval_steps_per_second': 209.581, 'epoch': 0.66}


 82%|████████▏ | 500/608 [00:30<00:02, 36.17it/s]

{'loss': 0.2167, 'learning_rate': 8.881578947368421e-06, 'epoch': 0.82}


                                                 
 82%|████████▏ | 500/608 [00:33<00:02, 36.17it/s] 

{'eval_loss': 0.11314544826745987, 'eval_accuracy': 0.975328947368421, 'eval_runtime': 2.8916, 'eval_samples_per_second': 210.261, 'eval_steps_per_second': 210.261, 'epoch': 0.82}


 99%|█████████▊| 600/608 [00:36<00:00, 34.80it/s]

{'loss': 0.2194, 'learning_rate': 6.578947368421053e-07, 'epoch': 0.99}


                                                 
 99%|█████████▉| 604/608 [00:39<00:01,  3.97it/s] 

{'eval_loss': 0.07573617249727249, 'eval_accuracy': 0.9868421052631579, 'eval_runtime': 2.9683, 'eval_samples_per_second': 204.828, 'eval_steps_per_second': 204.828, 'epoch': 0.99}


100%|██████████| 608/608 [00:39<00:00, 15.25it/s]

{'train_runtime': 39.8662, 'train_samples_per_second': 15.251, 'train_steps_per_second': 15.251, 'train_loss': 0.5317206664874806, 'epoch': 1.0}





TrainOutput(global_step=608, training_loss=0.5317206664874806, metrics={'train_runtime': 39.8662, 'train_samples_per_second': 15.251, 'train_steps_per_second': 15.251, 'train_loss': 0.5317206664874806, 'epoch': 1.0})

In [55]:
test = "When it comes to successful and educative trading transactions, I recommend @Educrypttofollow Him & learn how to earn automatically without your coins depreciating or liquidating.I just made 2500% profits under His guidance. Follow Him @Educryptto"
test2= "I love cryptocurrency"
inputs = tokenizer(test2, return_tensors="pt")
inputs

{'input_ids': tensor([[  101,  1045,  2293, 19888, 10085,  3126,  7389,  5666,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [56]:
model = model.to('cuda')

inputs = {k: v.to('cuda') for k, v in inputs.items()}

with torch.no_grad():
    logits = model(**inputs).logits
    probs = torch.softmax(logits, dim = 1)
    idx = int(probs[0].argmax().cpu().numpy())
predicted_class_id = logits.argmax().item()

predicted_class_id, probs


(0, tensor([[9.9986e-01, 1.4228e-04]], device='cuda:0'))

In [57]:
def spam(class_id):
    if class_id == 1:
        print(f"Tweet is spam. Certainty Level: {probs[0][idx]*100:.2f}%")
    else:
        print(f"Tweet is not spam. Certainty Levl: {probs[0][idx]*100:.2f}%")

spam(predicted_class_id)

Tweet is not spam. Certainty Levl: 99.99%
