In [1]:
SAMPLE_SIZE = None

# %pip install nltk sklearn
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize   
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import re
import nltk
import scipy
from scipy.sparse import hstack
from sklearn.preprocessing import Normalizer
import numpy as np

wordnet_lemma  = WordNetLemmatizer()

clf = MultinomialNB()

def lemmatize(word, pos='v'):
    return wordnet_lemma.lemmatize(word, pos=pos)

stop = stopwords.words('english')

def pre_process(text):
    text = re.sub('<.*?>', ' ', text) # Remove html tags. Added flag if "<div" is found later on process
    text = re.sub(r'=\d\w',' ',text) # Remove encoded symbols like =2E, =3A....    
    return text

def tokenize_lemmatize(text):
    out = word_tokenize(text)
    out = [lemmatize(word,
#                      pos="v"
                    ) for word in out]
    out = [word for word in out if word not in stop]
    return out

def process_data(df, vectorizer, normalizer, fit=False):
    x_text = df['text'].apply(pre_process)
    x_len = np.array(df['text'].apply(len)).reshape(-1, 1)
    if fit:
        vectorizer.fit(x_text)
        normalizer.fit(x_len)
    x_text = vectorizer.transform(x_text)
    x_len = normalizer.transform(x_len)
    
    x_html = np.array(df['text'].str.contains('<div', case=False)).reshape(-1, 1)
    X = hstack((x_text, x_len, x_html))
    if 'label' in df:
        Y = df['label']
    else:
        Y = None
    return X, Y
               
    
    
data = pd.read_csv("data/kg_train.csv",encoding='latin-1')#.sample(100)


# Transform Data
# Change vectorizer for testing if required
vectorizer = TfidfVectorizer(ngram_range=(1,4), tokenizer=tokenize_lemmatize)
# vectorizer = CountVectorizer(tokenizer=tokenize_lemmatize, ngram_range=(1,4))

# Normalizer for the email lenght
normalizer = Normalizer()

X, Y = process_data(data, vectorizer, normalizer, fit=True)


# Evaluate
print(cross_validate(clf, X, Y, n_jobs=4, cv=5, scoring='f1')['test_score'])

# Train
clf.fit(X, Y)

test_data = pd.read_csv("data/kg_test.csv",encoding='latin-1')

X_test, Y_test = process_data(test_data, vectorizer, normalizer)

result = clf.predict(X_test)

df = pd.DataFrame(result, columns=['Category'])
df['Id'] = df.index
df = df[['Id','Category']]
df.to_csv('prediction.csv', index=False)
print('Done!')



[nltk_data] Downloading package stopwords to /home/jano/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jano/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jano/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


[0.98643411 0.98926829 0.98627451 0.98527969 0.98729228]
Done!


In [2]:
# y_pred = clf.predict(X)
# (Y==y_pred).all()
# clf.score(X, Y)

0.9983232729711603

In [3]:
# test_data = pd.read_csv("data/kg_test.csv",encoding='latin-1')

# X_test, Y_test = process_data(test_data, vectorizer, normalizer)

In [4]:
# result = clf.predict(X_test)

# df = pd.DataFrame(result, columns=['Category'])
# df['Id'] = df.index
# df = df[['Id','Category']]
# df.to_csv('prediction.csv', index=False)



In [2]:
# Uncomment for submission
!kaggle competitions submit -c dsub-fraudulentemails -f prediction.csv -m "Message"


100%|██████████████████████████████████████| 39.7k/39.7k [00:02<00:00, 14.9kB/s]
Successfully submitted to Fraudulent E-mails: Spam or Ham?