In [1]:
from fastai2.text.all import *
from fastai2.tabular.all import *
from lazylabel.text.all import *

In [2]:
# Dataset root, resolved from the current user's home directory rather than a
# hardcoded, user-specific absolute path, so the notebook runs on other machines.
source = Path.home()/'.data'/'youtube_spam'

In [3]:
# Every `.csv` file that `get_files` finds under `source`.
fns = get_files(source, extensions='.csv')
# One DataFrame per file, kept in the same order as `fns`.
dfs = L(pd.read_csv(fn) for fn in fns)

In [4]:
# Stack the per-file frames into a single frame. `ignore_index=True` gives the
# result one fresh 0..n-1 index; without it each file keeps its own index and
# the combined frame carries duplicate index labels.
# `original` is a copy of `CONTENT` (hack for the tokenizer cache): a later cell
# reads the raw text back through ColReader('original').
df = pd.concat(dfs, ignore_index=True).assign(original=lambda d: d['CONTENT'])
print(df.shape); df.head(2)

(1956, 6)


Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS,original
0,z12pgdhovmrktzm3i23es5d5junftft3f,lekanaVEVO1,2014-07-22T15:27:50,i love this so much. AND also I Generate Free Leads on Auto Pilot &amp; You Can Too! http://www.MyLeaderGate.com/moretraffic﻿,1,i love this so much. AND also I Generate Free Leads on Auto Pilot &amp; You Can Too! http://www.MyLeaderGate.com/moretraffic﻿
1,z13yx345uxepetggz04ci5rjcxeohzlrtf4,Pyunghee,2014-07-27T01:57:16,http://www.billboard.com/articles/columns/pop-shop/6174122/fan-army-face-off-round-3 Vote for SONES please....we're against vips....please help us.. &gt;.&lt;﻿,1,http://www.billboard.com/articles/columns/pop-shop/6174122/fan-army-face-off-round-3 Vote for SONES please....we're against vips....please help us.. &gt;.&lt;﻿


## Artificially divide into labelled and unlabelled df

In [5]:
# Pretend only half of the data is labelled: draw a random 50/50 split of the
# row positions and slice `df` with it. The seed is fixed so the same rows land
# in each half on every Restart & Run All; unseeded, each run gave a different
# split (and therefore different numbers in every cell below).
splits = RandomSplitter(.5, seed=42)(df)
df_lbl, df_ulbl = [df.iloc[i] for i in splits]
len(df_lbl), len(df_ulbl)

(978, 978)

## Create labellers

In [6]:
# Values a labelling function can return. ABSTAIN is what the labellers below
# return when their rule does not fire.
ABSTAIN = 'abstain'
SPAM = 'spam'
HAM = 'ham'
# Full label vocabulary, passed to Categorize and tasks_labels in later cells.
vocab = [ABSTAIN, SPAM, HAM]

Create the train/validation split and the input and label pipelines for the labelled subset.

In [7]:
# Train/validation split of the labelled subset, using RandomSplitter's default
# validation fraction. Seeded so the split (and the summary statistics computed
# from it further down) is reproducible across kernel restarts.
splits = RandomSplitter(seed=42)(df_lbl)

In [8]:
# The three text transforms are kept under their own names because the
# labelling functions below are registered against `cder` and `tkzer`.
cder = mk_transform(ColReader('original'))
tkzer = mk_transform(Tokenizer.from_df('CONTENT'))
nmzer = mk_transform(Numericalize())
# Input pipeline over the labelled subset: the three transforms, in this order.
x_tls = TfmdLists(df_lbl, [cder, tkzer, nmzer], splits=splits)

In [9]:
# Maps the raw `CLASS` values to label names: 0 -> HAM, 1 -> SPAM.
lbl_dict = {0: HAM, 1: SPAM}
# Target pipeline: read `CLASS`, translate it through `lbl_dict`, then
# categorize against `vocab`.
y_tls = TfmdLists(
    df_lbl,
    [ColReader('CLASS'), lbl_dict.get, Categorize(vocab)],
    splits=splits,
)

In [10]:
# Pair the text pipeline with the target pipeline.
# NOTE(review): `dset` is reassigned in a later cell (built from `tasks`) and is
# not read in between in the visible cells - this cell may be redundant; confirm.
dset = Datasets(tls=[x_tls, y_tls])

In [11]:
# Shared registry: the labelling functions below are attached to it through
# `labeller.register_func(...)` or the `@labeller(...)` decorator.
labeller = Labeller()

Helper function for keyword matches

In [12]:
def keyword_match(ws, label=SPAM):
    """Create and register a keyword-based labelling function.

    ws: a keyword or a collection of keywords (wrapped with `L`).
    label: value the labelling function returns on a hit; defaults to SPAM.

    The generated function returns `label` as soon as any keyword passes a
    plain `in` test against its input (no case folding is applied) and
    ABSTAIN otherwise. It is named `kw_<first keyword>` and registered through
    `labeller.register_func(cder, ...)`. `keyword_match` itself returns nothing.
    """
    keywords = L(ws)
    def _inner(x):
        return label if any(kw in x for kw in keywords) else ABSTAIN
    _inner.__name__ = f'kw_{keywords[0]}'
    labeller.register_func(cder, _inner)

In [13]:
# Keyword labellers, registered in the order listed: (keywords, label on a hit).
for _kws, _lbl in [
    ('my', SPAM),
    ('subscribe', SPAM),
    (['http', 'https'], SPAM),
    (['please', 'plz', 'pliz'], SPAM),
    (['song'], HAM),
]:
    keyword_match(_kws, label=_lbl)

In [14]:
@labeller(cder)
def regex_check(x):
    """Return SPAM when `x` contains a "check ..." call to action, else ABSTAIN.

    The case-insensitive pattern accepts either `check` + whitespace + `my`/`this`,
    or `check` + whitespace + an optional word + optional whitespace + `out`.
    """
    hit = re.search(r'(check\s(my|this))|check\s(\w+)?\s?out', x, flags=re.I)
    if hit: return SPAM
    return ABSTAIN

In [15]:
@labeller(tkzer)
def short_comment(x):
    "Return HAM when `x` has fewer than 6 elements, else ABSTAIN. Registered against `tkzer`, so `x` is presumably the tokenized comment (confirm)."
    if len(x) >= 6: return ABSTAIN
    return HAM

TextBlob

In [16]:
from textblob import TextBlob
@labeller(cder, TextBlob)
def tb_polarity(x, tb):
    "Return HAM when `tb.polarity` is above 0.9, else ABSTAIN. `x` is accepted but not used."
    if tb.polarity > 0.9: return HAM
    return ABSTAIN
@labeller(cder, TextBlob)
def tb_sentiment(x, tb):
    "Return HAM when `tb.subjectivity` is at least 0.5, else ABSTAIN. Despite the name, the check is on subjectivity; `x` is accepted but not used."
    if tb.subjectivity >= 0.5: return HAM
    return ABSTAIN

Spacy

In [17]:
import spacy
# Loaded once at cell level; `spacy_doc` reuses this pipeline on every call.
nlp = spacy.load('en_core_web_sm')
def spacy_doc(x):
    "Run the shared `nlp` pipeline on `str(x)` and return its result."
    text = str(x)
    return nlp(text)

In [18]:
@labeller(cder, spacy_doc)
def has_person(x, sdoc):
    "Return HAM when `sdoc` is shorter than 20 and has an entity labelled 'PERSON', else ABSTAIN. `x` is accepted but not used."
    if len(sdoc) >= 20: return ABSTAIN
    if any(ent.label_ == 'PERSON' for ent in sdoc.ents): return HAM
    return ABSTAIN

In [19]:
# Build the inputs from the registered labelling functions, the text items and
# the vocabulary (what `tasks_labels` returns is defined in lazylabel, not here).
tasks = tasks_labels(labeller, x_tls, vocab)
# Pair them with the ground-truth targets; this replaces the `dset` assigned in
# an earlier cell.
dset = Datasets(tls=[tasks, y_tls])
dls = dset.dataloaders()

In [20]:
# Per-labeller statistics on the training DataLoader; the rendered table below
# lists Coverage, Polarity, Accuracy, Correct and Incorrect for each function.
labeller.summary(dls.train)

Unnamed: 0,Coverage,Polarity,Accuracy,Correct,Incorrect
kw_my,14.06% (108),1,0.851852,92,16
kw_subscribe,8.07% (62),1,0.967742,60,2
kw_http,10.03% (77),1,0.961039,74,3
kw_please,5.99% (46),1,1.0,46,0
kw_song,13.80% (106),1,0.726415,77,29
regex_check,25.00% (192),1,1.0,192,0
tb_polarity,5.21% (40),1,0.8,32,8
tb_sentiment,37.11% (285),1,0.554386,158,127
has_person,5.86% (45),1,0.622222,28,17
short_comment,11.98% (92),1,0.902174,83,9
