### 1. Importing Libraries


In [None]:
import pandas as pd
from string import punctuation
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import time
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

### 2. Loading and Exploring the Data

In [None]:
# Read the training and test splits from the local `data/` directory.
# low_memory=False makes pandas parse each file in one pass instead of in
# chunks.
train_path = 'data/fraud_email_train.csv'
test_path = 'data/fraud_email_test.csv'
train = pd.read_csv(train_path, low_memory=False)
test = pd.read_csv(test_path, low_memory=False)

In [None]:
# Quick look at the training frame.
# NOTE: a notebook cell only renders its last expression, so the result of
# `train.head()` below is computed but not displayed; only `train.columns` is.
train.head()
# Alternative inspections, left disabled:
# train.describe()
# train.info()
train.columns

In [None]:
# Print the class distribution of the target column, first for the training
# split and then for the test split.
for split_frame in (train, test):
    print(split_frame['Label'].value_counts())

### 3. Data Cleaning and Preparation

#### Add a `data_ID` column marking each row as `train` or `test`

In [None]:
# Tag every row with the name of the split it came from, so the two frames can
# be told apart after they are combined.
for split_frame, split_name in ((train, 'train'), (test, 'test')):
    split_frame['data_ID'] = split_name

#### Concat `train` and `test` into `data`

In [None]:
# Stack both splits into a single frame so each cleaning step is applied to
# them identically. ignore_index=True gives the result a fresh 0..n-1 index;
# without it the row labels of `train` and `test` are both kept, so labels
# repeat in `data`.
data = pd.concat([train, test], ignore_index=True)

#### Dropping columns from `data`

In [None]:
# Columns removed from `data` before any further cleaning.
columns_to_drop = {
    'Folder-User',
    'Folder-Name',
    'Message-ID',
    'Mime-Version',
    'Content-Type',
    'Content-Transfer-Encoding',
    'Contains-Reply-Forwards',
    'X-FileName',
    'X-Folder',
    'X-From',
    'X-Origin',
    'Low-Comm',
    'X-To',
    'X-bcc',
    'X-cc',
    'Date',
    'Suspicious-Folders',
    'Mail-ID',
    'Source',
    'Cc',
    'Time',
    'Attendees',
    'Re',
    'Unique-Mails-From-Sender',
}

# Hand pandas an explicit list of labels; the frame is modified in place.
data.drop(columns=list(columns_to_drop), inplace=True)

#### Drop duplicates

In [None]:
# Report how many rows are exact repeats of an earlier row, then remove those
# repeats in place.
duplicate_count = data.duplicated().sum()
print(duplicate_count)
data.drop_duplicates(inplace=True)

#### Reformatting `From` and `To`

In [None]:
# Turn e-mail addresses into space-separated tokens: '.' and '@' become spaces
# and the substring 'com' is removed.
#
# regex=False is passed explicitly because the default of `regex` in
# `Series.str.replace` differs between pandas versions, and '.' is a regex
# metacharacter that would match every character if interpreted as a pattern.
#
# NOTE(review): 'com' is removed wherever it occurs, not only as the
# top-level domain (e.g. inside a name such as "compaq") - confirm this is
# the intended behaviour.
for address_column in ('From', 'To'):
    data[address_column] = (
        data[address_column]
        .str.replace('.', ' ', regex=False)
        .str.replace('@', ' ', regex=False)
        .str.replace('com', '', regex=False)
    )

#### Creating new column: `text`

In [None]:
# Join the free-text columns into a single `text` column, separated by single
# spaces, in this order. A missing value in any of the five columns makes the
# combined text missing for that row as well.
text_columns = ['From', 'Body', 'To', 'Bcc', 'Subject']

combined_text = data[text_columns[0]]
for column_name in text_columns[1:]:
    combined_text = combined_text + ' ' + data[column_name]
data['text'] = combined_text

# The source columns are no longer needed once `text` exists.
data.drop(columns=text_columns, inplace=True)

#### Removing punctuation from column `text`

In [None]:
# Lower-case the text, then delete every character listed in
# `string.punctuation` by matching them as one regex character class.
punctuation_pattern = f'[{punctuation}]'
lowercased_text = data['text'].str.lower()
data['text'] = lowercased_text.replace(punctuation_pattern, '', regex=True)

#### Drop rows with NaN values in `text`

In [None]:
# Discard, in place, every row whose `text` value is missing.
data.dropna(subset=['text'], inplace=True)

#### Changing `POI-Present`

In [None]:
# Encode the boolean flag as 0/1. Values that are neither False nor True are
# not in the mapping and therefore become NaN.
poi_encoding = {False: 0, True: 1}
data['POI-Present'] = data['POI-Present'].map(poi_encoding)

#### Reorder the columns

In [None]:
# Keep exactly these columns, in this order.
column_order = ['text', 'POI-Present', 'Sender-Type', 'Label', 'data_ID']
data = data[column_order]

In [None]:
# Frequency of each sender type in the combined, cleaned data.
sender_type_counts = data['Sender-Type'].value_counts()
sender_type_counts

#### Splitting into `train_clean` and `test_clean`

In [None]:
# Separate the cleaned rows back into their original splits using the
# `data_ID` tag, then drop the tag because it is not a feature.
is_train_row = data['data_ID'] == 'train'
is_test_row = data['data_ID'] == 'test'

train_clean = data[is_train_row].drop(columns='data_ID')
test_clean = data[is_test_row].drop(columns='data_ID')

### 4. Splitting and Vectorizing Data

In [None]:
# Separate the target column from the remaining columns for each split.
target_column = 'Label'

x_train, y_train = train_clean.drop(columns=target_column), train_clean[target_column]
x_test, y_test = test_clean.drop(columns=target_column), test_clean[target_column]

In [None]:
# Bag-of-words token counts with English stop words removed.
vectorizer = CountVectorizer(stop_words='english')

# The vocabulary is learned from the training text only and then reused,
# unchanged, to encode the test text.
train_text = x_train['text']
test_text = x_test['text']
x_train_text_vecto = vectorizer.fit_transform(train_text)
x_test_text_vecto = vectorizer.transform(test_text)

### 5. Model Building and Training


#### Naive Bayes (NB)

In [None]:
# Multinomial naive Bayes trained on the bag-of-words counts.
model_name = 'NB - Multinomial'
model = MultinomialNB()

# `duration` is the wall-clock time of training plus prediction, in seconds.
start_time = time.time()
predictions = model.fit(x_train_text_vecto, y_train).predict(x_test_text_vecto)
end_time = time.time()
duration = end_time - start_time

# Last expression of the cell, so the matrix is rendered as the cell output.
confusion_matrix(y_true=y_test, y_pred=predictions)

#### Decision Trees

In [None]:
# Decision tree that uses entropy as its split criterion.
model_name = 'Decision Tree - entropy'
model = DecisionTreeClassifier(criterion='entropy')

# `duration` is the wall-clock time of training plus prediction, in seconds.
start_time = time.time()
predictions = model.fit(x_train_text_vecto, y_train).predict(x_test_text_vecto)
end_time = time.time()
duration = end_time - start_time

In [None]:
# Confusion matrix for the predictions of the most recently run model cell
# (rows: true labels, columns: predicted labels).
confusion_matrix(y_true=y_test, y_pred=predictions)

In [None]:
# Decision tree that uses Gini impurity as its split criterion.
model = DecisionTreeClassifier(criterion='gini')

# `duration` is the wall-clock time of training plus prediction, in seconds.
start_time = time.time()
model.fit(x_train_text_vecto, y_train)
predictions = model.predict(x_test_text_vecto)
end_time = time.time()

duration = end_time - start_time

# The label must name the criterion actually used above; it previously read
# 'Decision Tree - entropy', copied from the entropy cell.
model_name = 'Decision Tree - gini'

In [None]:
# Confusion matrix for the predictions of the most recently run model cell
# (rows: true labels, columns: predicted labels).
confusion_matrix(y_true=y_test, y_pred=predictions)

#### SVM

In [21]:
# Support vector classifier with a sigmoid kernel.
model_name = 'SVM - kernel: sigmoid'
model = SVC(kernel='sigmoid')

# `duration` is the wall-clock time of training plus prediction, in seconds.
start_time = time.time()
predictions = model.fit(x_train_text_vecto, y_train).predict(x_test_text_vecto)
end_time = time.time()
duration = end_time - start_time

In [None]:
# Confusion matrix for the predictions of the most recently run model cell
# (rows: true labels, columns: predicted labels).
confusion_matrix(y_true=y_test, y_pred=predictions)