In [8]:
from utils import load_spam_dataset
import pandas as pd

# Load Data

In [2]:
# Fetch both splits from the project loader (called with load_train_labels=True).
df_train, df_test = load_spam_dataset(load_train_labels=True)

# Extract the 'label' column of each split as an array, train first, then test.
Y_train, Y_test = (split['label'].values for split in (df_train, df_test))

df_train.head()

Unnamed: 0,author,date,text,label,video
0,Alessandro leite,2014-11-05T22:21:36,pls http://www10.vakinha.com.br/VaquinhaE.aspx...,1,1
1,Salim Tayara,2014-11-02T14:33:30,"if your like drones, plz subscribe to Kamal Ta...",1,1
2,Phuc Ly,2014-01-20T15:27:47,go here to check the views :3﻿,0,1
3,DropShotSk8r,2014-01-19T04:27:18,"Came here to check the views, goodbye.﻿",0,1
4,css403,2014-11-07T14:25:48,"i am 2,126,492,636 viewer :D﻿",0,1


# Build Transformation Functions (TFs) and Preview Them

In [5]:
# Import each transformation function by name instead of using `*`, so the
# origin of every entry in `tfs` is explicit and no unrelated names leak into
# the notebook namespace.
from SpamTransformationFunctions import (
    change_person,
    replace_adj_with_synonym,
    replace_noun_with_synonym,
    replace_verb_with_synonym,
    swap_adjectives,
)

# Keep this order stable: the sampling probabilities given to MeanFieldPolicy
# later in the notebook are a plain positional list of the same length.
tfs = [
    change_person,
    swap_adjectives,
    replace_verb_with_synonym,
    replace_noun_with_synonym,
    replace_adj_with_synonym,
]

In [9]:
from utils import preview_tfs

# Lift the column-width limit so long text values are displayed untruncated.
pd.options.display.max_colwidth = None
preview_tfs(df_train, tfs)

Unnamed: 0,TF Name,Original Text,Transformed Text
0,change_person,"""eye of the tiger"" ""i am the champion"" seems like katy perry is using titles of old rock songs for lyrics..﻿","""eye of the tiger"" ""i am the champion"" look like katy perry is using titles of old rock songs for lyrics..Ernest Morris"
1,swap_adjectives,hey guys look im aware im spamming and it pisses people off but please take a moment to check out my music. im a young rapper and i love to do it and i just wanna share my music with more people just click my picture and then see if you like my stuff,hey guys look im young im spamming and it pisses people off but please take a moment to check out my music. im a aware rapper and i love to do it and i just wanna share my music with more people just click my picture and then see if you like my stuff
2,replace_verb_with_synonym,"""eye of the tiger"" ""i am the champion"" seems like katy perry is using titles of old rock songs for lyrics..﻿","""eye of the tiger"" ""i am the champion"" seems like katy perry is use titles of old rock songs for lyrics..﻿"
3,replace_noun_with_synonym,Οh my god ... Roar is the most liked video at Vevo .. while 2 months ago was Justin's Baby.. congrats Katy . Applause &lt;3 ﻿,Οh my god ... Roar is the most liked video at Vevo .. while 2 calendar month ago was Justin's Baby.. congrats Katy . Applause &lt;3 ﻿
4,replace_adj_with_synonym,You gotta say its funny. well not 2 billion worth funny but still. It clicked and everything went uphill. At least you don't have JB's shit on #1.﻿,You gotta say its amusing . well not 2 billion worth funny but still. It clicked and everything went uphill. At least you don't have JB's shit on #1.﻿


# Applying Transformation Functions

In [10]:
from snorkel.augmentation import RandomPolicy, MeanFieldPolicy

# Settings shared by both policies, named once so they cannot drift apart.
SEQUENCE_LENGTH = 2
N_PER_ORIGINAL = 2

# One sampling probability per entry of `tfs`, in the same order as that list.
TF_PROBABILITIES = [0.05, 0.05, 0.3, 0.3, 0.3]

# Fail here, not later while the policy is sampling, if `tfs` is edited without
# updating the probabilities to match.
assert len(TF_PROBABILITIES) == len(tfs), "need exactly one probability per TF"
assert abs(sum(TF_PROBABILITIES) - 1.0) < 1e-9, "TF probabilities must sum to 1"

# NOTE(review): random_policy is not used by any cell visible in this notebook
# (the applier is built from mean_field_policy); kept in case it is used elsewhere.
random_policy = RandomPolicy(
    len(tfs),
    sequence_length=SEQUENCE_LENGTH,
    n_per_original=N_PER_ORIGINAL,
    keep_original=True,
)
mean_field_policy = MeanFieldPolicy(
    len(tfs),
    sequence_length=SEQUENCE_LENGTH,
    n_per_original=N_PER_ORIGINAL,
    keep_original=True,
    p=TF_PROBABILITIES,
)

In [13]:
import random

import numpy as np
from snorkel.augmentation import PandasTFApplier

# Reset the global RNGs immediately before augmenting so that re-running this
# cell produces the same augmented set.
# NOTE(review): this assumes the policy and the TFs draw from NumPy's global
# generator and/or the stdlib `random` module; confirm against Snorkel and
# SpamTransformationFunctions.
AUGMENTATION_SEED = 0
random.seed(AUGMENTATION_SEED)
np.random.seed(AUGMENTATION_SEED)

tf_applier = PandasTFApplier(tfs, mean_field_policy)
df_train_augmented = tf_applier.apply(df_train)
Y_train_augmented = df_train_augmented['label'].values

print(f'Original training set size: {len(df_train)}')
print(f'Augmented training set size: {len(df_train_augmented)}')

100%|█████████████████████████████████████████| 1586/1586 [00:18<00:00, 85.30it/s]


Original training set size: 1586
Augmented training set size: 2456


# Training a Model 

In [16]:
from utils import featurize_df_tokens, get_keras_lstm

# Run the same featurizer over each frame: original train, augmented train, test.
X_train, X_train_augmented, X_test = map(
    featurize_df_tokens,
    (df_train, df_train_augmented, df_test),
)

In [17]:
def train_and_test(
    X_train,
    Y_train,
    X_test=X_test,
    Y_test=Y_test,
    num_buckets=30000,
    epochs=5,
):
    """Train a fresh Keras LSTM on the given data and return its test accuracy.

    Args:
        X_train: Training features passed to ``fit``.
        Y_train: Training labels passed to ``fit``.
        X_test: Features to predict on. The default is the notebook-level
            ``X_test`` as it existed when this cell was executed (Python binds
            defaults at definition time), so re-run this cell after rebuilding
            the test features.
        Y_test: Labels compared against the thresholded predictions. Same
            definition-time caveat as ``X_test``.
        num_buckets: Forwarded unchanged to ``get_keras_lstm``.
        epochs: Number of training epochs. Defaults to 5, the value that was
            previously hard-coded.

    Returns:
        The fraction of test examples whose thresholded prediction equals the
        corresponding label.

    Raises:
        ValueError: If the predictions and labels differ in shape. Without
            this check the comparison could broadcast and silently produce a
            meaningless accuracy.
    """
    import numpy as np

    lstm_model = get_keras_lstm(num_buckets)
    lstm_model.fit(X_train, Y_train, epochs=epochs, verbose=0)

    # Column 0 of the model output is thresholded at 0.5 to get boolean predictions.
    preds_test = lstm_model.predict(X_test)[:, 0] > 0.5

    labels = np.asarray(Y_test)
    if preds_test.shape != labels.shape:
        raise ValueError(
            f"prediction shape {preds_test.shape} does not match label shape {labels.shape}"
        )
    return (preds_test == labels).mean()

In [18]:
# Train and score once per training set: augmented data first, then original.
# Each accuracy below comes from a single training run.
acc_augmented = train_and_test(X_train=X_train_augmented, Y_train=Y_train_augmented)
acc_original = train_and_test(X_train=X_train, Y_train=Y_train)

# Report the original baseline first, then the augmented result.
print(f"Test Accuracy (original training data): {100 * acc_original:.1f}%")
print(f"Test Accuracy (augmented training data): {100 * acc_augmented:.1f}%")

2021-08-30 14:48:16.599377: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-08-30 14:48:16.730217: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Test Accuracy (original training data): 47.2%
Test Accuracy (augmented training data): 71.6%
