In [86]:
import IPython
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib
import sklearn as skl
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

In [87]:
# Constants
# Paths to the training split, relative to the current working directory.
# get_df() reads each feature-file line as three space-separated integers
# (document id, word id, frequency) and reads one label per line from the
# label file.
FEATURE_FILE_NAME_TRAIN = 'Data/train-features.txt'
TARGET_FILE_NAME_TRAIN = 'Data/train-labels.txt'

# Paths to the test split (parsed by get_df() the same way as the training files).
FEATURE_FILE_NAME_TEST = 'Data/test-features.txt'
TARGET_FILE_NAME_TEST = 'Data/test-labels.txt'

N_COL = 2500 # number of unique words from dataset; get_df() uses word ids 1..N_COL as columns

# number of data entries per split; get_df() uses document ids 1..N_ROW_* as the row index
N_ROW_TRAIN = 700 
N_ROW_TEST = 260

In [88]:
def get_df(feature_file, target_file, n_col, n_row):
    """Load a sparse word-count file and its labels into one dense DataFrame.

    Each non-blank line of ``feature_file`` holds three whitespace-separated
    integers: ``<document id> <word id> <frequency>``. Both ids are 1-based.
    Any (document, word) pair that does not appear in the file gets a count
    of 0. If the same pair appears more than once, the last line wins.

    Args:
        feature_file: Path to the sparse feature file described above.
        target_file: Path to the label file, one label per line, in document
            order.
        n_col: Number of word columns; valid word ids are ``1..n_col``.
        n_row: Number of documents; valid document ids are ``1..n_row``.

    Returns:
        pandas.DataFrame indexed ``1..n_row`` with integer columns
        ``1..n_col`` holding the counts, plus a final ``'target'`` column
        holding the labels.

    Raises:
        ValueError: If a feature line has fewer than three fields, a field is
            not an integer, an id is outside its valid range, or the number
            of labels differs from ``n_row``.
    """
    # Fill a plain integer matrix first and wrap it in a DataFrame once.
    # Writing cells through chained indexing (``df[col][row] = value``) is
    # slow and is not guaranteed to modify ``df`` at all: under pandas
    # Copy-on-Write the assignment lands on a temporary and is lost.
    counts = np.zeros((n_row, n_col), dtype=np.int64)

    # Import data and populate the count matrix
    with open(feature_file) as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines (e.g. a trailing newline)
            if len(fields) < 3:
                raise ValueError(
                    "{}:{}: expected '<doc> <word_id> <freq>', got {!r}"
                    .format(feature_file, line_no, line)
                )
            doc, word_id, freq = (int(field) for field in fields[:3])
            # Explicit range check: a zero or negative id would otherwise
            # wrap around silently with NumPy's negative indexing.
            if not (1 <= doc <= n_row and 1 <= word_id <= n_col):
                raise ValueError(
                    "{}:{}: doc {} / word_id {} outside 1..{} / 1..{}"
                    .format(feature_file, line_no, doc, word_id, n_row, n_col)
                )
            counts[doc - 1, word_id - 1] = freq  # ids are 1-based

    # Initialize the data frame with 1-based row and column labels
    df = pd.DataFrame(
        counts,
        index=range(1, n_row + 1),
        columns=range(1, n_col + 1),
    )

    # Add target to data frame (assigned positionally, so the label file must
    # list documents in id order; a length mismatch raises ValueError)
    email_label = pd.read_csv(target_file, sep=" ", header=None, names=["target"])
    df['target'] = list(email_label['target'])
    return df

In [89]:
def naive_bayes(train_set, test_set, features):
    """Fit a multinomial naive Bayes classifier and report its test accuracy.

    Args:
        train_set: DataFrame containing the ``features`` columns and a
            ``"target"`` column; used to fit the classifier.
        test_set: DataFrame with the same columns; used for evaluation.
        features: List of column labels to use as model inputs.

    Returns:
        float: Fraction of rows in ``test_set`` that were classified
        correctly (the printed percentage divided by 100).

    Side effects:
        Prints a one-line summary with the number of misclassified rows and
        the accuracy percentage.
    """
    # Instantiate the classifier
    classifier = skl.naive_bayes.MultinomialNB()

    # Train classifier. Plain arrays are passed to both fit() and predict()
    # so the estimator always sees the same input type; fitting on an array
    # and predicting on a DataFrame makes scikit-learn warn about feature
    # names when the columns are string-named.
    classifier.fit(
        train_set[features].values,
        train_set["target"]
    )
    y_pred = classifier.predict(test_set[features].values)

    # Report results
    n_points = test_set.shape[0]
    inaccurates = (test_set["target"] != y_pred).sum()
    accuracy = 1 - inaccurates / n_points
    print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
          .format(n_points, inaccurates, 100*accuracy))
    return float(accuracy)

In [90]:
def main():
    """Load both data splits and evaluate the classifier on the test split.

    Every word id from 1 to N_COL is used as an input feature.
    """
    # every word as a feature
    word_columns = list(range(1, N_COL + 1))

    training_frame = get_df(
        FEATURE_FILE_NAME_TRAIN,
        TARGET_FILE_NAME_TRAIN,
        N_COL,
        N_ROW_TRAIN,
    )
    evaluation_frame = get_df(
        FEATURE_FILE_NAME_TEST,
        TARGET_FILE_NAME_TEST,
        N_COL,
        N_ROW_TEST,
    )

    naive_bayes(training_frame, evaluation_frame, word_columns)


if __name__ == "__main__":
    main()

Number of mislabeled points out of a total 260 points : 5, performance 98.08%
