# Adversarial Classification

In [59]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import recall_score, confusion_matrix

In [None]:
# Punctuation that the cleaning functions replace with a single space.
# - Raw string, so `\[`, `\]` and `\-` are regex escapes rather than invalid
#   Python string escapes.
# - The hyphen is escaped. Unescaped, `+-_` is a character RANGE (U+002B..U+005F)
#   that also stripped digits, uppercase letters, '=', '@' and '\'.
# - The apostrophe is not part of the set (it was not matched before either).
# NOTE(review): term-count CSVs produced with the earlier pattern may differ
# slightly from counts produced with this one -- regenerate if exactness matters.
REPLACE_BY_SPACE_RE = re.compile(r'[+\-_/(){}!^?<>"\[\]|,;:.]')
# data cleaning
def clean_text(text):
  """Tokenise an email body into its distinct terms.

  Newlines and every character matched by REPLACE_BY_SPACE_RE are turned into
  spaces, then the text is split on single spaces. Case is left untouched.

  Returns:
    A list holding each token once. The order is unspecified because the
    tokens pass through a set; an empty-string token is present whenever the
    text contains consecutive (or leading/trailing) spaces.
  """
  flattened = text.replace('\n', ' ')
  spaced = REPLACE_BY_SPACE_RE.sub(' ', flattened)
  unique_tokens = set(spaced.split(' '))
  return list(unique_tokens)

def clean_text1(text):
  """Tokenise an email body, keeping repeated terms.

  Newlines and every character matched by REPLACE_BY_SPACE_RE are turned into
  spaces, then the text is split on single spaces. Case is left untouched.

  Returns:
    The tokens in document order, duplicates included. Empty-string tokens
    appear wherever the text contains consecutive (or leading/trailing) spaces.
  """
  flattened = text.replace('\n', ' ')
  spaced = REPLACE_BY_SPACE_RE.sub(' ', flattened)
  return spaced.split(' ')

In [None]:
# Corpus sizes used for the class priors of the classifier.
n_spam, n_legit = 432, 2170    # spam emails, legit emails
n_email = n_spam + n_legit     # all emails

In [None]:
# Load the occurrence table of the 1000 terms. The first CSV column (read by
# pandas as 'Unnamed: 0') holds the terms and becomes the index.
df0 = pd.read_csv('term1000_occur.csv').set_index('Unnamed: 0')

# Total number of terms in each class (hard-coded here).
nterm_s = 454430      # spam
nterm_l = 1488668     # legit
print('Total number of terms in spam:', nterm_s)
print('Total number of terms in legit:', nterm_l)

# The first 10 rows of the table are the features of the classifier.
df1 = df0.head(10)
df1

Total number of terms in spam: 454430
Total number of terms in legit: 1488668


Unnamed: 0_level_0,#term in spam,#term in legit
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1
language,19,7593
remove,415,30
free,905,196
university,39,5252
money,903,75
click,232,22
market,454,58
our,1090,623
business,734,90
today,243,104


## Baseline classifier

In [None]:
# use multinomial NB with binary features as the baseline algorithm
def Multinomial_B(text, df, n_spam=n_spam, n_legit=n_legit, n_email=n_email, nterm_s=nterm_s, nterm_l=nterm_l):
  """Return the spam-to-legit odds of an email under the baseline classifier.

  Only the feature terms of `df` that occur in `text` contribute; each one
  multiplies the odds by ((spam count + 1) / (nterm_s + 2)) divided by
  ((legit count + 1) / (nterm_l + 2)). The class priors are n_spam / n_email
  and n_legit / n_email.

  The product is accumulated in log space. Multiplying many tiny probabilities
  directly underflows to 0 once enough terms match, which turns the final
  ratio into nan and silently scores the email as "not spam".

  Args:
    text: collection of tokens; membership is tested with `in`.
    df: DataFrame indexed by term whose first column is the term count in
      spam and whose second column is the term count in legit.
    n_spam, n_legit, n_email: email counts used for the class priors.
    nterm_s, nterm_l: total term counts of spam and legit.

  Returns:
    The odds P(spam | text) / P(legit | text); callers treat a value > 1 as spam.

  Note:
    The defaults are bound when this `def` is executed. Rebinding the
    module-level `nterm_s` / `nterm_l` afterwards does NOT change them;
    pass the new values explicitly.
  """
  # log prior odds
  log_odds = np.log(n_spam / n_email) - np.log(n_legit / n_email)

  for i, term in enumerate(df.index):
    if term in text:
      # .iloc[i, col] is purely positional; `.iloc[i][col]` relied on the
      # deprecated positional fallback of Series.__getitem__.
      log_odds += np.log((df.iloc[i, 0] + 1) / (nterm_s + 2))
      log_odds -= np.log((df.iloc[i, 1] + 1) / (nterm_l + 2))

  # Very large odds saturate to inf, which still compares as > 1.
  with np.errstate(over='ignore'):
    return np.exp(log_odds)

In [None]:
# test: score the baseline classifier on the emails of part10
result = []         # predicted labels (1 = spam, 0 = legit)
result_true = []    # true labels, taken from the file name

# read the test data and split them into spam and legit
root = '/content/drive/My Drive/Colab Notebooks/data/lemm_stop/part10'
# os.listdir returns entries in arbitrary order; sort for a reproducible order
file_names = sorted(os.listdir(root))

for file in file_names:
  # the context manager closes the file even if reading or cleaning fails
  with open(os.path.join(root, file), 'r') as f:
    text = clean_text(f.read())     # unique terms

  # a file name starting with 's' is labelled spam (1), anything else legit (0)
  result_true.append(1 if file[0] == 's' else 0)

  # prediction: odds above 1 mean spam
  pred = Multinomial_B(text, df1)
  result.append(1 if pred > 1 else 0)

print('False Negative Rate:', 1-recall_score(result_true, result))

False Negative Rate: 0.08163265306122447


The false-negative rate is low (about 8%): the baseline classifier mislabels only a small fraction of the spam emails as ham.



## Adversary's strategy

The adversary uses the **ADD-WORDS** strategy. Adding a term to an email incurs unit cost.

In [None]:
# compute the reduction obtained by adding/removing each term in the 10 features
#
# For every feature term:
#   LO_1 = log-odds contribution (spam vs. legit) when the term is present
#   LO_0 = log-odds contribution when the term is absent
# weight = max(LO_0 - LO_1, 0) is positive only for terms whose presence pushes
# an email towards "legit", i.e. the terms worth adding to a spam email.
# NOTE(review): Multinomial_B applies no factor for absent terms, so the drop it
# actually sees is -LO_1; LO_0 is presumably negligible here -- confirm.
weight = {}
for i in range(len(df1)):
  term = df1.index[i]
  # .iloc[i, col] is purely positional; `.iloc[i][col]` relied on the
  # deprecated positional fallback of Series.__getitem__.
  Pr_is = (df1.iloc[i, 0]+1)/(nterm_s+2)
  Pr_il = (df1.iloc[i, 1]+1)/(nterm_l+2)

  LO_1 = np.log(Pr_is/Pr_il)
  LO_0 = np.log((1-Pr_is)/(1-Pr_il))

  delta = max(LO_0-LO_1, 0)
  weight[term] = delta

In [None]:
# One row per feature term, ordered from the largest weight to the smallest.
df_w = pd.DataFrame(
  list(weight.values()), index=list(weight.keys()), columns=['weight']
).sort_values(by=['weight'], ascending=False)

# Only terms with a strictly positive weight are candidates for the attack.
print('terms to be add:')
df_add = df_w.loc[df_w['weight'] > 0]
df_add

terms to be add:


Unnamed: 0,weight
language,4.757862
university,3.694532


In [62]:
# test: apply the ADD-WORDS attack to the spam emails of part10 and re-score
# them with the unchanged baseline classifier
result = []         # predicted labels (1 = spam, 0 = legit)
result_true = []    # true labels, taken from the file name
cost = 0            # number of terms added over all spam emails

# read the test data and split them into spam and legit
root = '/content/drive/My Drive/Colab Notebooks/data/lemm_stop/part10'
# os.listdir returns entries in arbitrary order; sort for a reproducible order
file_names = sorted(os.listdir(root))
n_spam_test = 0    # total number of spam emails

for file in file_names:
  # the context manager closes the file even if reading fails
  with open(os.path.join(root, file), 'r') as f:
    text = clean_text(f.read())

  if file[0] != 's':
    result_true.append(0)
  else:              # modify spam emails
    result_true.append(1)
    n_spam_test += 1

    # Add candidate terms in decreasing weight order while the email is still
    # classified as spam. Features are binary, so a term already present is
    # neither added again nor charged. Looping over df_add.index also works
    # when fewer (or more) than two terms have a positive weight.
    for term in df_add.index:
      if not Multinomial_B(text, df1) > 1:
        break
      if term not in text:
        text.append(term)
        cost += 1

  # prediction
  pred = Multinomial_B(text, df1)
  result.append(1 if pred > 1 else 0)

cm = confusion_matrix(result_true, result)   # kept for interactive inspection
print('After the attacker\'s modifications to test emails')
print('False Negative Rate:', 1-recall_score(result_true, result))
# guard against a test folder without any spam email
print('Average cost per spam:', cost/n_spam_test if n_spam_test else float('nan'))

After the attacker's modifications to test emails
False Negative Rate: 0.7959183673469388
Average cost per spam: 1.469387755102041


After the adversary adds ham-indicative terms to the spam emails in the test set, the false-negative rate rises sharply (from about 8% to about 80%), which means that most spam emails are now classified as ham.

This is the highest false-negative rate achievable with this strategy: both ham-indicative features have already been added to every email that was still classified as spam, and because the classifier uses binary features, adding the same term more than once has no further effect.

In [45]:
# Dump the predicted labels, then the true labels, one nested list per line.
print([result], [result_true], sep='\n')

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1]]
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

## Classifier response

In [52]:
############## adversary's modification on train data ###################
# DISABLED: the code below is wrapped in a string literal, so nothing in this
# cell is executed. When enabled, it walks the folders part1 .. part9, tokenises
# every email with clean_text1 (duplicates kept) and collects the token lists
# in `legit` and `spam`. Spam emails that Multinomial_B scores above 1 first
# receive df_add.index[0] and, if still above 1, df_add.index[1] (each only
# when not already present). The two printed values are the list lengths.
'''
legit = []
spam = []

# read the test data and split them into spam and legit
for i in range(1, 10):
  root = '/content/drive/My Drive/Colab Notebooks/data/lemm_stop/part' + str(i)
  file_names = os.listdir(root)

  for file in file_names:
    f = open(root + '/' + file, 'r')
    text = f.read()
    text = clean_text1(text)
    
    if file[0] != 's':
      legit.append(text)
    else:                   # add terms in spam
      pred = Multinomial_B(text, df1)

      if pred > 1:
        # add terms if not exist
        term = df_add.index[0]   # only adding the first term(with highest weight)
        if term not in text:
          text.append(term)
        pred = Multinomial_B(text, df1)

        if pred > 1:
          term = df_add.index[1]   # add the second term(binary feature)
          if term not in text:
            text.append(term)  
      spam.append(text)
    f.close()

print(len(spam))
print(len(legit))
'''

432
2170


In [53]:
# DISABLED: the code below is wrapped in a string literal, so nothing in this
# cell is executed. When enabled, it builds `occur`, mapping every term of
# df1.index to [spam_occ, legit_occ]. Each count is the number of tokens equal
# to the term over all token lists in `spam` / `legit`, so a term repeated
# inside one email is counted once per repetition.
'''
occur = {}
for term in df1.index:
  #term = term.replace("'", "")
  spam_occ = 0    # number of spam email with term
  legit_occ = 0    # number of legit email with term
  # compute the number of legit and spam with each term in corpus
  for i in range(len(spam)):
    for j in range(len(spam[i])):
      if term == spam[i][j]:
        spam_occ += 1
  for i in range(len(legit)):
    for j in range(len(legit[i])):
      if term == legit[i][j]:
        legit_occ += 1
  occur[term] = [spam_occ, legit_occ]
'''

In [54]:
# DISABLED: the code below is wrapped in a string literal, so nothing in this
# cell is executed. When enabled, it turns `occur` into a two-column DataFrame
# and writes it to 'term10_occur.csv'. That file must already exist for the
# notebook to run from a fresh kernel, because a later cell reads it.
'''
df0 = pd.DataFrame(data=occur.values(), index=occur.keys(), columns=['#term in spam', '#term in legit'])
df0.to_csv('term10_occur.csv')
'''

In [55]:
# Read the occurrence counts of the 10 feature terms from 'term10_occur.csv'
# (not the 1000-term table). The first CSV column (read by pandas as
# 'Unnamed: 0') holds the terms and becomes the index.
df0 = pd.read_csv('term10_occur.csv').set_index('Unnamed: 0')

# Total number of terms per class for the retrained classifier, hard-coded.
# They were previously obtained by summing len() over the token lists in
# `spam` and `legit`; that code is no longer run.
# NOTE: Multinomial_B bound its default nterm_s / nterm_l when it was defined,
# so these new values only take effect when passed to it explicitly.
nterm_s = 455076
nterm_l = 1488668
print('Total number of terms in spam:', nterm_s)
print('Total number of terms in legit:', nterm_l)

# feature table of the retrained classifier (10 terms)
df0

Total number of terms in spam: 455076
Total number of terms in legit: 1488668


Unnamed: 0_level_0,#term in spam,#term in legit
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1
language,404,7593
remove,415,30
free,905,196
university,300,5252
money,903,75
click,232,22
market,454,58
our,1090,623
business,734,90
today,243,104


In [64]:
# test: the adversary still attacks the ORIGINAL classifier (df1), while the
# final prediction is made by the RETRAINED classifier (df0)
result = []         # predicted labels (1 = spam, 0 = legit)
result_true = []    # true labels, taken from the file name

# read the test data and split them into spam and legit
root = '/content/drive/My Drive/Colab Notebooks/data/lemm_stop/part10'
# os.listdir returns entries in arbitrary order; sort for a reproducible order
file_names = sorted(os.listdir(root))

for file in file_names:
  # the context manager closes the file even if reading fails
  with open(os.path.join(root, file), 'r') as f:
    text = clean_text(f.read())

  if file[0] != 's':
    result_true.append(0)
  else:              # modify spam emails
    result_true.append(1)

    # Add candidate terms in decreasing weight order while the original
    # classifier still says spam. These calls deliberately use Multinomial_B's
    # defaults. Looping over df_add.index also works when fewer (or more) than
    # two terms have a positive weight.
    for term in df_add.index:
      if not Multinomial_B(text, df1) > 1:
        break
      if term not in text:
        text.append(term)

  # prediction with the retrained classifier. The term totals are passed
  # explicitly: Multinomial_B's defaults were bound when it was defined and
  # do not follow the later reassignment of nterm_s / nterm_l.
  pred = Multinomial_B(text, df0, nterm_s=nterm_s, nterm_l=nterm_l)
  result.append(1 if pred > 1 else 0)

print('After the response of classifier')
print('False Negative Rate:', 1-recall_score(result_true, result))
# labels=[0, 1] keeps the matrix 2x2 even if one class never occurs
cm = confusion_matrix(result_true, result, labels=[0, 1])
print('False Positive Rate:', cm[0][1]/(cm[0][0]+cm[0][1]))

After the response of classifier
False Negative Rate: 0.30612244897959184
False Positive Rate: 0.15289256198347106


In [65]:
# Dump the predicted labels, then the true labels, one nested list per line.
print([result], [result_true], sep='\n')

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1]]
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

**Report**

* Baseline result:

False Negative Rate: 0.08163265306122447

* After the attacker's modifications to test emails:

False Negative Rate: 0.7959183673469388

Average cost per spam: 1.469387755102041

* After the response of classifier:

False Negative Rate: 0.30612244897959184

False Positive Rate: 0.15289256198347106