In [1]:
import IPython
import matplotlib
import numpy as np
import pandas as pd
import scipy as sp
import sklearn as skl
from sklearn import metrics
from sklearn import model_selection
from sklearn import neighbors
from sklearn import tree
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

In [2]:
# Constants
# Training split: feature file and matching label file
FEATURE_FILE_NAME_TRAIN, TARGET_FILE_NAME_TRAIN = (
    'Data/train-features.txt',
    'Data/train-labels.txt',
)

# Test split: feature file and matching label file
FEATURE_FILE_NAME_TEST, TARGET_FILE_NAME_TEST = (
    'Data/test-features.txt',
    'Data/test-labels.txt',
)

# Number of unique words in the dataset (one feature column per word)
N_COL = 2500

# Number of data entries in the (train, test) splits
N_ROW_TRAIN, N_ROW_TEST = 700, 260

In [3]:
def create_df(feature_file, target_file, n_col, n_row):
    """Build a dense document/word count frame with a trailing ``target`` column.

    Every non-blank line of ``feature_file`` is read as three
    whitespace-separated integers ``doc word_id freq``; ``freq`` is stored at
    row label ``doc`` and column label ``word_id``. Cells that the file never
    mentions are 0. If the same (doc, word_id) pair appears more than once,
    the last value wins. ``target_file`` is read as one label per line (no
    header) and attached as the ``target`` column.

    Parameters
    ----------
    feature_file : str
        Path to the feature file.
    target_file : str
        Path to the label file.
    n_col : int
        Number of word columns; columns are labelled 1..n_col.
    n_row : int
        Number of documents; rows are labelled 1..n_row.

    Returns
    -------
    pandas.DataFrame
        Frame with ``n_row`` rows, integer columns 1..n_col and a final
        ``target`` column.

    Raises
    ------
    ValueError
        If a line has fewer than three fields or non-integer fields, if a
        doc / word id lies outside 1..n_row / 1..n_col, or (raised by pandas)
        if the number of labels differs from ``n_row``.
    """
    # Fill a plain integer matrix first. Writing cell by cell through
    # ``df[word_id][doc] = freq`` is chained assignment: it is slow, and under
    # pandas Copy-on-Write it updates a temporary and leaves the frame empty.
    counts = np.zeros((n_row, n_col), dtype=np.int64)

    # Import data and populate the matrix
    with open(feature_file) as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()  # any run of whitespace separates fields
            if not fields:
                continue  # tolerate blank lines (e.g. at end of file)
            doc, word_id, freq = (int(field) for field in fields[:3])
            # Reject ids outside the frame explicitly: NumPy would otherwise
            # wrap 0 / negative ids around to the last row or column.
            if not (1 <= doc <= n_row and 1 <= word_id <= n_col):
                raise ValueError(
                    "{}:{}: doc {} / word id {} outside 1..{} / 1..{}".format(
                        feature_file, line_no, doc, word_id, n_row, n_col))
            counts[doc - 1, word_id - 1] = freq  # labels are 1-based

    df = pd.DataFrame(counts,
                      index=range(1, n_row + 1),
                      columns=range(1, n_col + 1))

    # Add target to data frame
    email_label = pd.read_csv(target_file, sep=" ", header=None, names=["target"])
    df['target'] = list(email_label['target'])
    return df

In [19]:
def naive_bayes(train_set, test_set, features):
    """Train a multinomial naive Bayes classifier and print its test error.

    Parameters
    ----------
    train_set, test_set : pandas.DataFrame
        Frames containing the ``features`` columns and a ``"target"`` column.
    features : list
        Column labels used as model inputs.

    Returns
    -------
    None
        The mislabeled count and accuracy percentage are printed.
    """
    # Instantiate the classifier
    classifier = skl.naive_bayes.MultinomialNB()
    # Train classifier
    classifier.fit(train_set[features].values, train_set["target"])
    # Predict from a plain array, the same input type fit() received, so
    # scikit-learn's feature-name check never sees a fit/predict mismatch.
    y_pred = classifier.predict(test_set[features].values)

    # Report results
    n_points = test_set.shape[0]
    inaccurates = (test_set["target"] != y_pred).sum()
    print("Naive Bayes Results: ")
    print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
          .format(n_points, inaccurates, 100*(1-inaccurates / n_points)))

In [9]:
def decision_tree(train_set, test_set, features, random_state=0):
    """Train an entropy-criterion decision tree and print its test error.

    Parameters
    ----------
    train_set, test_set : pandas.DataFrame
        Frames containing the ``features`` columns and a ``"target"`` column.
    features : list
        Column labels used as model inputs.
    random_state : int or None, optional
        Seed forwarded to ``DecisionTreeClassifier``. Defaults to 0 so that
        repeated runs build the same tree; pass ``None`` for unseeded
        behaviour.

    Returns
    -------
    None
        The mislabeled count and accuracy percentage are printed.
    """
    # Instantiate the classifier. The local is not called ``tree`` so it does
    # not shadow the ``tree`` module imported at the top of the notebook.
    classifier = skl.tree.DecisionTreeClassifier(criterion="entropy",
                                                 random_state=random_state)
    # Train classifier
    classifier.fit(train_set[features].values, train_set['target'])
    # Predict from a plain array, the same input type fit() received
    y_pred = classifier.predict(test_set[features].values)

    # Report results
    n_points = test_set.shape[0]
    inaccurates = (test_set["target"] != y_pred).sum()
    print("Decision Tree Results: ")
    print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
          .format(n_points, inaccurates, 100*(1-inaccurates / n_points)))

In [27]:
def knn(train_set, test_set, features, n_neighbors=1):
    """Train a k-nearest-neighbours classifier and print test metrics.

    Parameters
    ----------
    train_set, test_set : pandas.DataFrame
        Frames containing the ``features`` columns and a ``"target"`` column.
    features : list
        Column labels used as model inputs.
    n_neighbors : int, optional
        Number of neighbours consulted per prediction (default 1).

    Returns
    -------
    None
        The mislabeled count, accuracy, recall, precision, F1, ROC curve and
        confusion matrix are printed.

    Notes
    -----
    Label 1 is treated as the positive class, matching the default
    ``pos_label`` of the binary recall / precision / F1 calls below.
    """
    # Instantiate the classifier
    k_neighbors = skl.neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
    # Train classifier
    k_neighbors.fit(train_set[features].values, train_set['target'])
    # Predict from a plain array, the same input type fit() received
    x_test = test_set[features].values
    y_true = test_set["target"]
    y_pred = k_neighbors.predict(x_test)
    # roc_curve expects a score per sample, not a hard label: use the
    # predicted probability of class 1 (the share of neighbours voting for it).
    positive_col = list(k_neighbors.classes_).index(1)
    y_score = k_neighbors.predict_proba(x_test)[:, positive_col]

    # Report results
    n_points = test_set.shape[0]
    inaccurates = (y_true != y_pred).sum()
    print("K Neighbors Results: ")
    print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
          .format(n_points, inaccurates, 100*(1-inaccurates / n_points)))
    print('accuracy_score: ', skl.metrics.accuracy_score(y_true, y_pred, normalize=True))
    print('recall_score: ', skl.metrics.recall_score(y_true, y_pred))
    print('precision_score: ', skl.metrics.precision_score(y_true, y_pred))
    print('F_score: ', skl.metrics.f1_score(y_true, y_pred))
    print('roc_curve: ', skl.metrics.roc_curve(y_true, y_score))
    print('confusion_matrix: ', skl.metrics.confusion_matrix(y_true, y_pred))

In [28]:
def main():
    """Load the train and test frames, then run each classifier on all word columns."""
    training_frame = create_df(
        FEATURE_FILE_NAME_TRAIN, TARGET_FILE_NAME_TRAIN, N_COL, N_ROW_TRAIN)
    testing_frame = create_df(
        FEATURE_FILE_NAME_TEST, TARGET_FILE_NAME_TEST, N_COL, N_ROW_TEST)

    # Column labels 1..N_COL: every word is used as a feature
    word_columns = [word_id for word_id in range(1, N_COL + 1)]

    naive_bayes(training_frame, testing_frame, word_columns)
    decision_tree(training_frame, testing_frame, word_columns)
    knn(training_frame, testing_frame, word_columns)
    
# Run the full pipeline only when this code executes as the main module
# (not when it is imported from elsewhere).
if __name__ == "__main__":
    main()

Naive Bayes Results: 
Number of mislabeled points out of a total 260 points : 5, performance 98.08%
Decision Tree Results: 
Number of mislabeled points out of a total 260 points : 14, performance 94.62%
K Neighbors Results: 
Number of mislabeled points out of a total 260 points : 16, performance 93.85%
accuracy_score:  0.9384615384615385
recall_score:  0.9307692307692308
precision_score:  0.9453125
F_score:  0.937984496124031
roc_curve:  (array([0.        , 0.05384615, 1.        ]), array([0.        , 0.93076923, 1.        ]), array([2, 1, 0]))
confusion_matrix:  [[123   7]
 [  9 121]]


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,2492,2493,2494,2495,2496,2497,2498,2499,2500,target
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
