# E-mail classification
Service departments often provide a central e-mail address to their (internal or external) customers for reporting issues. Often, different topics are handled by different people or departments. Wouldn’t it be great to have an automatic e-mail classifier that forwards e-mails to the right person? Let’s try to make this.  
  
We start from a set of 9,820 real e-mails originating from several e-mail services like monster.com, nieuwsblad.be, datanews.be, etc. The e-mails belong to four categories: _advertisements_, _job offerings_, _news_ and _ICT_. We’d like to create a classifier that can be used to automatically classify e-mails into one of the four categories ADS, JOB, NEWS and ICT. 
We start by importing and exploring the data. 

In a previous exercise we used word embeddings and a neural network for document classification, but do we really need these advanced and compute-intensive techniques?

Create a model based on TfidfVectorizer instead of word embeddings and use a voting classifier.   

Compare the accuracy of this model to the accuracy of the neural network you created above. 


In [1]:
# Importing the necessary packages
import numpy as np                                  # "Scientific computing"
import scipy.stats as stats                         # Statistical tests

import pandas as pd                                 # Data Frame
from pandas.api.types import CategoricalDtype

import matplotlib.pyplot as plt                     # Basic visualisation

import seaborn as sns                               # Advanced data visualisation

from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# Helper libraries
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
import pandas as pd

# fix random seed for reproducibility
# (this seeds NumPy's global random generator; `seed` itself is not passed to
# any estimator in the cells shown here, which hard-code random_state=42)
seed = 2020
np.random.seed(seed)  

import sklearn as sk
from sklearn.model_selection import train_test_split


import nltk


In [2]:
# Detect the runtime once: the IPython shell object's repr mentions
# 'google.colab' when the notebook is executed on Google Colab.
colab = 'google.colab' in str(get_ipython())

if colab:
    print ('You are running on Google Colab')
    # Mount Google Drive so that files stored there are reachable from the notebook
    from google.colab import drive
    drive.mount('/content/gdrive')
else:
    print ('You are not running on Google Colab')

You are not running on Google Colab


Read the files

In [3]:
import numpy as np
import pandas as pd

def _read_emails(remote_url, local_path):
    """Read one e-mail CSV: from GitHub when on Colab, from the local folder otherwise."""
    source = remote_url if colab else local_path
    return pd.read_csv(source, encoding = "ISO-8859-1")


# One frame per e-mail category
ads = _read_emails(
    'https://raw.githubusercontent.com/jdecorte/machinelearning/main/datasets/emailADS.csv',
    'datasets/emailADS.csv',
)
ict = _read_emails(
    'https://raw.githubusercontent.com/jdecorte/machinelearning/main/datasets/emailICT.csv',
    'datasets/emailICT.csv',
)
job = _read_emails(
    'https://raw.githubusercontent.com/jdecorte/machinelearning/main/datasets/emailJOB.csv',
    'datasets/emailJOB.csv',
)
news = _read_emails(
    'https://raw.githubusercontent.com/jdecorte/machinelearning/main/datasets/emailNEWS.csv',
    'datasets/emailNEWS.csv',
)



In [4]:
# Stack the four category frames into one data set.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported replacement. ignore_index is left at its default,
# so every frame keeps its own row labels, exactly as append() did.
data = pd.concat([ads, ict, job, news])

  data = ads.append([ict,job,news])


In [5]:
# Keep only Subject, Body and Category, merge subject and body into a single
# 'Text' column, then drop the two source columns.
data = (
    data[['Subject','Body','Category']]
    .assign(Text=lambda frame: frame['Subject'] + ' ' + frame['Body'])
    .drop(columns=['Subject','Body'])
)
data.head()

Unnamed: 0,Category,Text
0,ADS,Alles halve prijs - of nog veel goedkoper! <h...
1,ADS,Alles halve prijs - of nog veel goedkoper! <h...
2,ADS,Wat krijgen we nu?! Weer EXTRA korting? <http...
3,ADS,Wat krijgen we nu?! Weer EXTRA korting? <http...
4,ADS,"Armband met activiteitstracker, Apple iPhone 6..."


In [6]:
# language detection - this can take a while
from langdetect import DetectorFactory, detect

# langdetect's algorithm is non-deterministic by default: short or ambiguous
# texts can get a different label on every run. Fixing the factory seed before
# the first detection makes the 'Lang' column (and every later filter that
# depends on it) reproducible.
DetectorFactory.seed = 0

data['Lang'] = data['Text'].apply(detect)

In [7]:
# Number of e-mails per (category, detected language); combinations that do not occur show as 0
data.pivot_table(
    values='Text',
    index=['Category'],
    columns=['Lang'],
    aggfunc='count',
).fillna(0)

Lang,ca,de,en,fr,nl
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ADS,61.0,112.0,751.0,2.0,1558.0
ICT,0.0,0.0,1140.0,0.0,223.0
JOB,0.0,6.0,960.0,5.0,611.0
NEWS,0.0,8.0,8.0,0.0,4375.0


In [8]:
# Using only the Dutch and English mails gives a better result (see Exploration 3)
kept_languages = ["nl", "en"]
data = data[data['Lang'].isin(kept_languages)]
data.head()

Unnamed: 0,Category,Text,Lang
0,ADS,Alles halve prijs - of nog veel goedkoper! <h...,nl
1,ADS,Alles halve prijs - of nog veel goedkoper! <h...,nl
2,ADS,Wat krijgen we nu?! Weer EXTRA korting? <http...,nl
3,ADS,Wat krijgen we nu?! Weer EXTRA korting? <http...,nl
4,ADS,"Armband met activiteitstracker, Apple iPhone 6...",nl


In [9]:
# remove stopwords and punctuations
import nltk

# word_tokenize() below loads the Punkt tokenizer models, so they have to be
# downloaded together with the stop-word lists; otherwise a fresh environment
# fails with a LookupError. Recent NLTK releases look for 'punkt_tab', older
# ones for 'punkt', so both are requested.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import string
    
# Dutch stop-word list from the NLTK corpus, stored as a set for fast membership tests
stop_words_nl = set(stopwords.words('dutch')) 

from nltk.stem.snowball import SnowballStemmer

# Snowball stemmer configured for Dutch; used by remove_stopwords_nl below
dutchStemmer=SnowballStemmer("dutch")

# Characters treated as punctuation tokens. The backslash is escaped explicitly:
# the former "\+" was an invalid escape sequence (a warning today, an error in a
# future Python). The resulting string value is unchanged.
punctuations = "?:!.,;<>/\\+-"

# define a function that removes stopwords and punctuation tokens from a string and stems the remaining words
# I know we are copy-pasting code, but this is for now the simplest way

def remove_stopwords_nl(s):
    """Lower-case, tokenize and stem a text, dropping Dutch stop words and punctuation tokens.

    Parameters
    ----------
    s : str
        Raw text of one e-mail.

    Returns
    -------
    str
        The stemmed tokens that were kept, joined by single spaces.
    """
    # Split the lower-cased text into word/punctuation tokens
    word_tokens = word_tokenize(s.lower())
    # `punctuations` is a plain string, so `token not in punctuations` is a
    # substring test: a token is dropped when it occurs anywhere in that string.
    stemmed_tokens = [
        dutchStemmer.stem(token)
        for token in word_tokens
        if token not in stop_words_nl and token not in punctuations
    ]
    return ' '.join(stemmed_tokens)

# Replace every e-mail text by its stop-word-free, stemmed version
data['Text'] = data['Text'].map(remove_stopwords_nl)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jcor864\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
import re

# One or more of the listed punctuation characters, plus any spaces that follow them
_PUNCTUATION_RUN = re.compile(r"[,.;@#?!&$/=]+\ *")


def clean_pieces_urls(piece):
    """Replace every run of , . ; @ # ? ! & $ / = (and trailing spaces) by one space.

    Parameters
    ----------
    piece : str
        Text to clean.

    Returns
    -------
    str
        The text with each matched run collapsed into a single space.
    """
    return _PUNCTUATION_RUN.sub(" ", piece)

# Strip the remaining punctuation/URL separator characters from every text
data['Text'] = data['Text'].map(clean_pieces_urls)

In [11]:
# Class distribution of the remaining e-mails
data['Category'].value_counts()

NEWS    4383
ADS     2309
JOB     1571
ICT     1363
Name: Category, dtype: int64

In [12]:
# Integer code for each e-mail category
dict_map = {'NEWS': 0, 'ADS': 1, 'JOB': 2, 'ICT': 3}

# Series.map turns every value that is not a key of dict_map into NaN, so
# running this cell a second time used to wipe the already-encoded labels.
# Skipping the mapping once the column only holds the integer codes makes the
# cell safe to re-run; the first run behaves exactly as before.
already_encoded = data['Category'].isin(list(dict_map.values())).all()
if not already_encoded:
    data['Category'] = data['Category'].map(dict_map)

In [13]:
# Hold out 20% of the e-mails as a test set; random_state is fixed so the split is repeatable
# NOTE(review): the data.head() preview shows rows that look identical; if exact
# duplicates exist they can end up on both sides of the split and inflate the
# test accuracy - TODO confirm and consider de-duplicating first.
from sklearn.model_selection import train_test_split

X, y = data['Text'], data['Category']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [14]:
# Report the size of every part of the split
shape_report = [
    ("Shape of x_train:", X_train),
    ("Shape of x_test:", X_test),
    ("Shape of y_train:", y_train),
    ("Shape of y_test:", y_test),
]
for caption, part in shape_report:
    print(caption, part.shape)


Shape of x_train: (7700,)
Shape of x_test: (1926,)
Shape of y_train: (7700,)
Shape of y_test: (1926,)


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC

# Three different base learners, each with a fixed random_state
log_clf = LogisticRegression(solver='lbfgs', random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(gamma='scale', random_state=42)

# Combine them in one ensemble; no `voting` argument is given, so the
# VotingClassifier default is used.
base_estimators = [('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)]
voting_clf = VotingClassifier(estimators=base_estimators)

# TF-IDF features feed the ensemble, so the raw text series can be passed in directly
model = Pipeline(steps=[('tfid', TfidfVectorizer()), ('voting', voting_clf)])

model.fit(X_train, y_train)
categories = model.predict(X_test)

In [16]:
from sklearn.metrics import accuracy_score

# Fraction of test e-mails whose predicted category equals the true one
test_accuracy = accuracy_score(y_true=y_test, y_pred=categories)
print(test_accuracy)

0.9984423676012462
