In [1]:
from fastai.text.all import *
import matplotlib.pyplot as plt 
import pandas as pd 
import numpy as np

In [3]:
# Load the labelled replies and discard the stray index column ('Unnamed: 0')
# that was written into the CSV, then preview the first rows.
df = pd.read_csv("./nlp_data.csv").drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,clean_text,spam
0,"Wow, what a view",0
1,Buy the Horizen (Zen) It’s the good one,0
2,Not if only I knew which were the good ones been holding $MINA for a while which I thought was a good one but quite frankly who knows now!!,0
3,how you feel about $KAS?,0
4,$vet?,0


In [4]:
# Create the language-model DataLoaders used to fine-tune on the reply corpus.

# The DataBlock gets its own name so that `dls_lm` only ever refers to the
# resulting DataLoaders (it previously held the DataBlock first).
lm_block = DataBlock(
    blocks=TextBlock.from_df('clean_text', is_lm=True),
    # TextBlock.from_df tokenizes 'clean_text'; the tokenized text is read back
    # from the 'text' column (fastai's default tokenizer output column).
    get_x=ColReader('text'),
    # Hold out only 15% of the comments for validation so that most of this
    # small corpus is available for training. The seed pins the split so a
    # Restart & Run All reproduces the same train/validation partition.
    splitter=RandomSplitter(0.15, seed=42),
)
dls_lm = lm_block.dataloaders(df, bs=32, seq_len=64)

Due to IPython and Windows limitation, python multiprocessing isn't available now.
So `n_workers` has to be changed to 0 to avoid getting stuck


In [5]:
# Show a single row of one batch as a sanity check on tokenization: the
# `text_` column (target) is the `text` column (input) shifted by one token.
dls_lm.show_batch(max_n=1)

Unnamed: 0,text,text_
0,"xxbos gang gang . xxwrep 3 yes . gang gang . 1 xxrep 3 0 xxunk leverage so good ! gang gang xxunk long ! xxbos xxmaj still waiting for a xxunk xxup xxunk of the xxunk which will xxunk xxunk life xxbos xxmaj xxunk xxunk more xxunk than the xxunk makes is bad , xxrep 4 m xxunk . xxbos xxmaj long xxup","gang gang . xxwrep 3 yes . gang gang . 1 xxrep 3 0 xxunk leverage so good ! gang gang xxunk long ! xxbos xxmaj still waiting for a xxunk xxup xxunk of the xxunk which will xxunk xxunk life xxbos xxmaj xxunk xxunk more xxunk than the xxunk makes is bad , xxrep 4 m xxunk . xxbos xxmaj long xxup bnb"


In [6]:
# Build an AWD-LSTM language-model learner on the reply corpus so it picks up
# the diction used on Twitter, tracking next-token accuracy and perplexity.
lm_metrics = [accuracy, Perplexity()]
lm_learn = language_model_learner(dls_lm, AWD_LSTM, metrics=lm_metrics)
lm_learn = lm_learn.to_fp16()  # mixed-precision training

# Initial pass: a single one-cycle epoch at a peak learning rate of 2e-2.
lm_learn.fit_one_cycle(n_epoch=1, lr_max=2e-2)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,5.232421,4.798075,0.203125,121.276703,00:03


In [7]:
# Unfreeze every layer of the language model and fine-tune for ten epochs
# at a peak learning rate of 2e-3.
lm_learn.unfreeze()
lm_learn.fit_one_cycle(n_epoch=10, lr_max=2e-3)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,4.695661,4.404264,0.244792,81.798912,00:00
1,4.458885,3.996288,0.271701,54.395847,00:00
2,4.243706,3.671005,0.295139,39.291382,00:00
3,4.053185,3.54638,0.314236,34.687523,00:00
4,3.886508,3.497365,0.322049,33.028309,00:00
5,3.734357,3.453647,0.330729,31.615496,00:00
6,3.603235,3.428527,0.330729,30.831184,00:00
7,3.482838,3.42591,0.334201,30.750629,00:00
8,3.383778,3.426374,0.331597,30.7649,00:00
9,3.301838,3.426922,0.335069,30.781761,00:00


In [8]:
# Save the fine-tuned language model's encoder under the name 'finetuned';
# the classifier further down restores it with load_encoder('finetuned').
lm_learn.save_encoder('finetuned')

### We can now test the language model and see whether the text it generates reads like a spam reply:
* It's certainly not perfect, but we're working with very little data.

In [15]:
# Prompt the fine-tuned language model and sample continuations from it.
text = "Does someone have the link"
N_WORDS = 10      # number of words to generate after the prompt
N_SENTENCES = 1   # number of independent samples to draw

preds = [
    lm_learn.predict(text, N_WORDS, temperature=.75)
    for _ in range(N_SENTENCES)
]

# One generated sentence per line.
print(*preds, sep="\n")

Does someone have the link to your wallet and send wallet for us ?


### Now we use the fine-tuned language model and turn it into a text classifier:

In [16]:
# Build the classification DataLoaders. The text block reuses the language
# model's vocab and sequence length so the fine-tuned encoder can be loaded
# into the classifier below.
blocks = (TextBlock.from_df('clean_text', seq_len=dls_lm.seq_len, vocab=dls_lm.vocab), CategoryBlock())

# The DataBlock gets its own name so that `dls` only ever refers to the
# resulting DataLoaders (it previously held the DataBlock first).
clas_block = DataBlock(
    blocks=blocks,
    get_x=ColReader('text'),
    get_y=ColReader('spam'),
    # 80/20 split, seeded so the validation metrics are comparable between runs.
    splitter=RandomSplitter(0.2, seed=42),
)
dls = clas_block.dataloaders(df, bs=32)

# classifier learner, load finetuned model
learn_clas = text_classifier_learner(dls, AWD_LSTM, metrics=[accuracy, FBeta(beta=1)]).to_fp16()
# load_encoder returns the learner itself; binding the result keeps the cell
# from echoing the learner's repr as its output.
learn_clas = learn_clas.load_encoder('finetuned')

Due to IPython and Windows limitation, python multiprocessing isn't available now.
So `n_workers` has to be changed to 0 to avoid getting stuck


<fastai.text.learner.TextLearner at 0x20096e53f48>

In [18]:
# Initial one-epoch pass over the classifier, run before any of the
# freeze_to()/unfreeze() steps that follow.
learn_clas.fit_one_cycle(n_epoch=1, lr_max=2e-2)

epoch,train_loss,valid_loss,accuracy,fbeta_score,time
0,0.715456,0.590197,0.862069,0.25,00:00


In [19]:
# Gradual unfreezing, step 1: leave only the last two parameter groups
# trainable, then train one epoch. The slice gives a learning-rate range whose
# lower bound is the upper bound divided by 2.6**4.
learn_clas.freeze_to(-2)
learn_clas.fit_one_cycle(n_epoch=1, lr_max=slice(1e-2/(2.6**4), 1e-2))

epoch,train_loss,valid_loss,accuracy,fbeta_score,time
0,0.614273,0.425988,0.91954,0.758621,00:00


In [20]:
# Gradual unfreezing, step 2: leave the last three parameter groups trainable
# and train one more epoch with the peak learning rate halved to 5e-3.
learn_clas.freeze_to(-3)
learn_clas.fit_one_cycle(n_epoch=1, lr_max=slice(5e-3/(2.6**4), 5e-3))

epoch,train_loss,valid_loss,accuracy,fbeta_score,time
0,0.475148,0.415734,0.942529,0.827586,00:00


In [50]:
# Final stage: unfreeze all layers and fine-tune the whole classifier for
# three epochs with a peak learning rate of 1e-3.
learn_clas.unfreeze()
learn_clas.fit_one_cycle(n_epoch=3, lr_max=slice(1e-3/(2.6**4), 1e-3))

epoch,train_loss,valid_loss,accuracy,fbeta_score,time
0,0.262995,0.262827,0.908046,0.764706,00:00
1,0.250027,0.240356,0.942529,0.83871,00:00
2,0.24343,0.222317,0.942529,0.83871,00:00


### We can now make predictions on some common replies:

In [51]:
def make_pred(learner, text: str):
    """Classify `text` with `learner` and print the verdict with its certainty.

    Parameters
    ----------
    learner
        Object exposing `predict(text)`. The result is indexed as
        `result[0]` (predicted label, convertible with `int`; 1 means spam)
        and `result[2]` (class probabilities, exposing `.max()`).
    text : str
        The reply to classify.

    Returns
    -------
    None
        The verdict is printed as "SPAM" or "NOT SPAM", followed by the
        largest class probability as a percentage rounded to two decimals.
    """
    pred = learner.predict(text)
    verdict = "SPAM" if int(pred[0]) == 1 else "NOT SPAM"
    # The largest class probability is reported as the model's certainty.
    certainty = round(float(pred[2].max()) * 100, 2)
    print(f"{verdict} --- Certainty: {certainty}%")

In [52]:
# Market commentary that ends by promoting an account to follow.
make_pred(
    learn_clas,
    "On longer periods, #Bitcoin doesn't appear to be in poor shape. A healthy correction has occurred, and as long as #Bitcoin holds above $30,000, it's probable that the surge will continue until $35,000 Y'all follow @btchill_ for best predictions  on crypto",
)

SPAM --- Certainty: 60.72%


In [53]:
# Reply asking for a link to a "money making" method.
make_pred(
    learn_clas,
    'Huh? Could someone link me to that defi money making" method people have been using? Saw an article about it but cannot find the  full tutorial?',
)

SPAM --- Certainty: 98.63%


In [54]:
# Reply advertising paid spaces in a portfolio/fund section.
make_pred(
    learn_clas,
    "..before I forget - I have opened a few more spaces in my portfolio & fund section for only $30 - so don't miss out again!",
)

NOT SPAM --- Certainty: 59.64%


In [55]:
# Short reply promoting an account to follow for trades.
make_pred(
    learn_clas,
    "follow @justin for trades that make money",
)

SPAM --- Certainty: 64.7%
