In [180]:
"""
The purpose of this program is to classify the emails 
based on a training dataset and predict the upcoming emails
into their corresponding categories
"""
import csv
import numpy as np
import pandas as pd

# importing email dataset from the csv
dataset = pd.read_csv('test.csv', encoding='iso-8859-1')

In [181]:
# Replacing the special characters by empty strings
dataset['Subject'] = dataset['Subject'].map(lambda x: x.replace('\xe6', ''))
dataset['Subject'] = dataset['Subject'].map(lambda x: x.replace('\r', ''))
dataset['Message'] = dataset['Message'].map(lambda x: x.replace('\xe6', ''))
dataset['Message'] = dataset['Message'].map(lambda x: x.replace('\r', ''))

# Replacing the string labels into 0,1,2. Please refer to the below table for mapping
# 'Credit and account statements' => 0
# 'Direct debit and Bacs' => 1
# 'Account closures' => 2
dataset['Class'] = dataset['Class'].map(lambda x: x.replace('Credit and account statements', '0'))
dataset['Class'] = dataset['Class'].map(lambda x: x.replace('Direct debit and Bacs', '1'))
dataset['Class'] = dataset['Class'].map(lambda x: x.replace('Account closures', '2'))

In [182]:
# Extracting the features and labels from the cleaned dataset
# Converting the 2d features dataset into 1d featrures dataset
features = dataset[['Subject','Message']].values;
one_d_features = []
for i in features:
    one_d_features.append("".join(i).split("/n"))
one_d_features = [j for i in one_d_features for j in i]

In [183]:
# Implementing cross validation
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(one_d_features, labels, 
                                                                            test_size=0.30, random_state=42)
print("Training features lenght: ",len(features_train))
print("Training labels lenght: ",len(labels_train))
print("Testing features lenght: ",len(features_test))
print("Testing labels lenght: ",len(labels_test))

Training features lenght:  42
Training labels lenght:  42
Testing features lenght:  19
Testing labels lenght:  19


In [184]:
# Converting the 2d labels dataset into 1d labels dataset
labels_train = np.array(sum(labels_train, []))
labels_test = np.array(sum(labels_test, []))

In [185]:
# Bag of words method
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(features_train)

In [186]:
# Bag of words method
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(labels_train)

['0' '2' '0' '2' '2' '1' '2' '0' '2' '1' '0' '1' '1' '1' '1' '2' '0' '1'
 '2' '2' '1' '1' '2' '0' '1' '0' '2' '1' '1' '1' '2' '0' '1' '0' '2' '1'
 '0' '2' '0' '1' '2' '1']


In [187]:
#from sklearn.naive_bayes import MultinomialNB
#clf = MultinomialNB().fit(X_train_tfidf, labels_train)
#from sklearn import svm
#clf = svm.SVC().fit(X_train_tfidf, labels_train)
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier().fit(X_train_tfidf, labels_train)

In [188]:
"""from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])
text_clf = text_clf.fit(features_train, labels_train)"""

"""from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', svm.SVC(kernel="linear"))])
text_clf = text_clf.fit(features_train, labels_train)"""

from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', RandomForestClassifier())])
text_clf = text_clf.fit(features_train, labels_train)

In [192]:
import numpy as np
predicted = text_clf.predict(features_test)
print( "Mean: ",np.mean(predicted == labels_test))

Mean:  0.631578947368


In [193]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(predicted, labels_test)
print ("Accuracy: ",acc*100)

Accuracy:  63.1578947368
