In [25]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [26]:
import numpy as np
import pandas as pd
import sys

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set(style = "whitegrid", 
        color_codes = True,
        font_scale = 1.5)

from datetime import datetime
from IPython.display import display, HTML

In [27]:
!pip install pandas --upgrade



In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

In [29]:
original_training_data = pd.read_csv('emails.csv')

original_training_data['subject'] = original_training_data['text'].apply(lambda x: x.split('  ', maxsplit=1)[0])
original_training_data['body'] = original_training_data['text'].apply(lambda x: x.split('  ', maxsplit=1)[1] if len(x.split('  ', maxsplit=1)) > 1 else '')

In [30]:
original_training_data = original_training_data.fillna('')
original_training_data = original_training_data.drop('text', axis=1)
original_training_data = original_training_data.reset_index()
original_training_data

Unnamed: 0,index,spam,subject,body
0,0,1,Subject: naturally irresistible your corporate...,lt is really hard to recollect a company : the...
1,1,1,Subject: the stock trading gunslinger,fanny is merrill but muzo not colza attainder ...
2,2,1,Subject: unbelievable new homes made easy,im wanting to show you this homeowner you ha...
3,3,1,Subject: 4 color printing special,request additional information now ! click her...
4,4,1,"Subject: do not have money , get software cds ...",software compatibility . . . . ain ' t it grea...
...,...,...,...,...
5723,5723,0,Subject: re : research and development charges...,here it is ! - - - - - - - - - - - - - - - - ...
5724,5724,0,Subject: re : receipts from visit,"jim , thanks again for the invitation to visi..."
5725,5725,0,Subject: re : enron case study update,wow ! all on the same day . that ' s super . t...
5726,5726,0,Subject: re : interest,"david , please , call shirley crenshaw ( my a..."


In [31]:
X = original_training_data.drop(['spam'], axis=1)
y = original_training_data['spam']

In [32]:
tfidf_vectorizer_subject = TfidfVectorizer()
tfidf_vectorizer_body = TfidfVectorizer()

In [33]:
X_subject_tfidf = tfidf_vectorizer_subject.fit_transform(X['subject'])
X_body_tfidf = tfidf_vectorizer_body.fit_transform(X['body'])

In [34]:
pickle.dump(X_subject_tfdif, open('tfidf_vectorizer_subject.pkl', 'wb'))
pickle.dump(X_body_tfidf, open('tfidf_vectorizer_body.pkl', 'wb'))

NameError: name 'X_subject_tfdif' is not defined

In [None]:
X_combined = hstack([X_subject_tfidf, X_body_tfidf])

In [None]:
import re

def words_in_texts(words, texts):
    indicator_array = []
    for i in texts:
        arr = []
        for j in words:
            if j in i:
                arr.append(1)
            else:
                arr.append(0)
        indicator_array.append(arr)
    return np.asarray(indicator_array)

def num_words(text):
    return len(text.split())

def re_or_fw(text):
    match = re.search(r"(fw :|re :)", text)
    return int(match is not None)

def special_char(text):
    match = re.findall(r"([^\w ])", text)
    return len(match)

In [None]:
words = ['offer', 'help', 'win', 'price', 'card']
words_df = pd.DataFrame(words_in_texts(words, X['subject']))
words_df['spam'] = y
words_df_melt = words_df.melt('spam')
sns.barplot(x=words_df_melt['variable'], y=words_df_melt['value'], hue=words_df_melt['spam']).set(xticklabels=words)
plt.xlabel('Words')
plt.ylabel('Proportion of emails')
plt.title('Words and the proportion of emails they are found in')

In [None]:
body_num_words = np.array(original_training_data['body'].apply(num_words)).reshape(-1, 1)
subject_num_words = np.array(original_training_data['subject'].apply(num_words)).reshape(-1, 1)
re_or_fw_feature = np.array(original_training_data['subject'].astype(str).apply(re_or_fw)).reshape(-1, 1)
subject_char = np.array(original_training_data['subject'].apply(len)).reshape(-1, 1)
body_char = np.array(original_training_data['body'].apply(len)).reshape(-1, 1)
subject_special = np.array(original_training_data['subject'].astype(str).apply(special_char)).reshape(-1, 1)
body_special = np.array(original_training_data['body'].apply(special_char)).reshape(-1, 1)

In [None]:
words = ['offer', 'help', 'win', 'price', 'card']
subject_words_in_texts = words_in_texts(words, original_training_data['subject'])
body_words_in_texts = words_in_texts(words, original_training_data['body'])

In [None]:
from scipy.sparse import csr_matrix
subject_words_in_texts_sparse = csr_matrix(subject_words_in_texts)
body_words_in_texts_sparse = csr_matrix(body_words_in_texts)

In [None]:
X_combined_new = hstack([X_combined, body_num_words, subject_num_words, re_or_fw_feature, subject_char, body_char, subject_special, body_special, subject_words_in_texts_sparse, body_words_in_texts_sparse])

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_combined_new, y, test_size = 0.2, random_state = 42)

In [None]:
Y_train

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_curve, confusion_matrix
import re
from collections import Counter

In [None]:
model = LogisticRegression(max_iter=10000)

In [35]:
model.fit(X_train, Y_train)

In [36]:
y_pred = model.predict(X_test)
print(accuracy_score(Y_test, y_pred))

0.9659685863874345


In [37]:
import pickle

pickle.dump(model, open('email_spam_predictor_new.sav', 'wb'))