# Importing Packages

In [326]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from stemming.porter2 import stem
import glob
from random import shuffle
import pandas as pd
import re
import operator
import numpy

In [413]:
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from math import floor
import numpy as np
from collections import Counter

In [3]:
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()

# General Functions

In [698]:
def randomize_files(file_list):
    shuffle(file_list)

In [699]:
def get_training_and_testing_sets(file_list, split):

    split_index = floor(len(file_list) * split)
    training = file_list[:split_index]
    testing = file_list[split_index:]
    return training, testing

In [700]:
def reading_files_to_list(file_name_list):
    storing_list = list()
    for file in file_name_list:                      # iterate over the list getting each file 
        with open(file, encoding="latin-1") as f:
            text = f.read()
            storing_list.append(text)
    return storing_list

In [701]:
# Function to return correct positional argument as per wordnet for lemmatiser function in the next block     
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''

In [702]:
def cleaning_stemming(email_list):
    # Removing special cahracters like !,@, ., %, etc and keeping only alphabets and numbers
    clean_email = list()
    for email in email_list:
        string = email
        string = re.sub("\.(?!$\d)",'', str(string))
        string = re.sub("[^A-Za-z0-9]+", ' ', str(string))
        clean_email.append(string)

    tokenized_email = list()
    for email in clean_email:
        tokens = word_tokenize(email)
        tokens_pos = pos_tag(tokens)
        tokenized_email.append(tokens_pos)
        
    email_lemma = list()
    for token in tokenized_email:
        temp_word = list()
        for word in token:
            try:
                temp_word.append(lemmatiser.lemmatize(word[0].lower(), pos = get_wordnet_pos(word[1]))) #used the above function
            except:
                continue
        email_lemma.append(temp_word)
    return email_lemma

In [703]:
def test_email_str(test_clean):
    temp_list = []
    for item in test_clean:
        temp_str = ' '.join(item)
        temp_list.append(temp_str)
    return temp_list

In [704]:
def bag_of_words(email_lemma):
    clean_dict = {}
    for i in email_lemma:
        for word in i:
            if word not in clean_dict:
                clean_dict[word] = 1
            else:
                clean_dict[word] = clean_dict[word] + 1
    
    word_bag = {}
    for i in sorted(clean_dict.keys()):
        word_bag[i] = clean_dict[i]
    return word_bag

In [705]:
def probab_word_given_spam_or_ham(email_words, word_bag):
    probability = 1.0
    for word in email_words:
        if word in list(set(word_bag)):
            numerator = word_bag[word] + 1           # 1 is Laplace Smoothing
            denominator = sum(word_bag.values()) + len(word_bag.keys())    #Again, Laplace Smoothing
            prob_temp = float(numerator) / float(denominator)
            probability = probability * prob_temp
        else:
            numerator = 1
            denominator = sum(word_bag.values()) + len(word_bag.keys())    #Again, Laplace Smoothing
            prob_temp = float(numerator) / float(denominator)
            probability = probability * prob_temp
    return probability

In [706]:
def probab_word(email_words):
    probability = 1.0
    denominator = len(email_words)
    for word in list(set(email_words)):
        numerator = email_words.count(word)
        prob_temp = float(numerator) + 1 / float(denominator)
        probability = probability * prob_temp
    return probability
    

In [707]:
def spam_given_email(email_words, spam_word_bag, spam_train_list, ham_train_list):
    probab_spam = float(len(spam_train_list) / (len(spam_train_list) + len(ham_train_list)))
    probability = (probab_word_given_spam_or_ham(email_words, spam_word_bag) * probab_spam) / probab_word(email_words)
    return probability
    

In [708]:
def ham_given_email(email_words, ham_word_bag, spam_train_list, ham_train_list):
    probab_ham = float(len(ham_train_list) / (len(spam_train_list) + len(ham_train_list)))
    probability = (probab_word_given_spam_or_ham(email_words, ham_word_bag) * probab_ham) / probab_word(email_words)
    return probability
    

In [709]:
def classifier(email_words, spam_word_bag, ham_word_bag, spam_train_list, ham_train_list):
    p_Spam = spam_given_email(email_words, spam_word_bag, spam_train_list, ham_train_list)
    p_Ham = ham_given_email(email_words, ham_word_bag, spam_train_list, ham_train_list)
    
    if p_Spam > p_Ham:
        return 1
    else:
        return 0

In [741]:
# Reading the SPAM email files and dividing it into the train and test

path = '/Users/sumeetkotaria/Desktop/Machine Learning/HW1/Dataset/spam/*.txt'
spam_files = glob.glob(path)
randomize_files(spam_files)
spam_train_file, spam_test_file = get_training_and_testing_sets(spam_files,0.75)    #Spliting SPAM files to Train and Test

# Reading spam emails to list
spam_train_list = reading_files_to_list(spam_train_file)
spam_test_list = reading_files_to_list(spam_test_file)

# Reading the HAM email files and dividing it into the train and test
path = '/Users/sumeetkotaria/Desktop/Machine Learning/HW1/Dataset/ham/*.txt'
ham_files = glob.glob(path)
randomize_files(ham_files)
ham_train_file, ham_test_file = get_training_and_testing_sets(ham_files,0.75)    #Splitting HAM files to Train and Test

# Reading ham emails to list
ham_train_list = reading_files_to_list(ham_train_file)
ham_test_list = reading_files_to_list(ham_test_file)


# Setting test data
spam_test_clean = cleaning_stemming(spam_test_list)
ham_test_clean = cleaning_stemming(ham_test_list)

spam_test_str = test_email_str(spam_test_clean)
ham_test_str = test_email_str(ham_test_clean)

# Cleaning train_data
spam_train_clean = cleaning_stemming(spam_train_list)
ham_train_clean = cleaning_stemming(ham_train_list)


# Bag of words for train and test
spam_word_bag = bag_of_words(spam_train_clean)
ham_word_bag = bag_of_words(ham_train_clean)

df_spam_test = pd.DataFrame(spam_test_str, columns = ['Email'])
df_spam_test['Actual Spam / No Spam'] = 1
df_spam_test['predict'] = 0
df_ham_test = pd.DataFrame(ham_test_str, columns = ['Email'])
df_ham_test['Actual Spam / No Spam'] = 0
df_ham_test['predict'] = 0
   

In [711]:
test_data = pd.concat([df_spam_test,df_ham_test], ignore_index = True)

In [712]:
for index, row in test_data.iterrows():
    email_str = row['Email']
    email_words = email_str.split()
    test_data['predict'][index] = classifier(email_words, spam_word_bag, ham_word_bag, spam_train_list, ham_train_list)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [713]:
a = np.sum(test_data['Actual Spam / No Spam'] == test_data['predict'])
b = np.sum(test_data['Actual Spam / No Spam'] != test_data['predict'])
a/(a+b)

0.8739365815931941

### The accaracy of Naive Bayes Classifer is 87.39%

# Nearest Neighbour

In [742]:
def vectorize(email_list, dictionary):     #email_list is list of list of words in email
    vector = []
    for email in email_list:
        email_vector = []
        for key in dictionary:
            temp = email.count(dictionary[key])
            email_vector.append(temp)
        vector.append(email_vector)
    
    return vector
    

In [743]:
def creating_dictionary(clean_train_list):
    
    list_of_words = [item for subitem in clean_train_list for item in subitem]
    dict_temp = list(set(list_of_words))
    main_dictionary = dict(enumerate(dict_temp))
    
    return main_dictionary

In [744]:
train_list = spam_train_list + ham_train_list
clean_train_list = cleaning_stemming(train_list)

In [745]:
dictionary = creating_dictionary(clean_train_list)

In [746]:
dictionary

{0: 'undermine',
 1: 'tentative',
 2: 'bdtydnd',
 3: 'vor',
 4: 'horton',
 5: 'roose',
 6: 'quetzal',
 7: 'granada',
 8: 'agitation',
 9: 'ejya',
 10: 'bassinet',
 11: 'l',
 12: 'orebody',
 13: 'prose',
 14: 'hdr',
 15: 'thereto',
 16: 'colorate',
 17: 'polach',
 18: 'creosote',
 19: 'pieasantly',
 20: 'consummate',
 21: 'hesitate',
 22: 'spill',
 23: 'iekouqi',
 24: 'marsh',
 25: 'lesson',
 26: 'sullivan',
 27: 'fly',
 28: 'wigging',
 29: 'shake',
 30: 'dawn',
 31: 'ameliorate',
 32: 'ple',
 33: 'klement',
 34: 'pre',
 35: 'banking',
 36: 'cecilia',
 37: 'unitholders',
 38: 'colloidal',
 39: 'pull',
 40: 'sri',
 41: 'ocean',
 42: 'vega',
 43: 'passionvalhalla',
 44: 'nico',
 45: 'uscriabin',
 46: 'orig',
 47: 'potter',
 48: 'wwwkbhbbbzn',
 49: 'jepson',
 50: 'wean',
 51: 'stock',
 52: 'bring',
 53: 'palindrome',
 54: 'asteroidal',
 55: 'dvd',
 56: 'ppo',
 57: 'hdp',
 58: 'wbukxoirements',
 59: 'symbo',
 60: 'tkm',
 61: 'dissolve',
 62: 'jlnyjxfp',
 63: 'basically',
 64: 'diving',
 65:

In [747]:
spam_train_vector = vectorize(spam_train_clean, dictionary)
ham_train_vector = vectorize(ham_train_clean, dictionary)

In [748]:
print(len(spam_train_vector[0]), len(spam_train_vector), len(ham_train_vector))

33579 1125 2754


In [749]:
def tag_and_train(spam_vector, ham_vector):
    
    for vector in spam_vector:
        vector.append(1)
    for vector in ham_vector:
        vector.append(0)

    train_vector = spam_vector + ham_vector
    
    return train_vector

In [750]:
train_vector = tag_and_train(spam_train_vector, ham_train_vector)

In [751]:
train_vector1 = np.array(train_vector)

In [752]:
print(len(train_vector[0]), len(train_vector))

33580 3879


In [753]:
spam_test_vector = vectorize(spam_test_clean, dictionary)
ham_test_vector = vectorize(ham_test_clean, dictionary)

In [754]:
print(len(spam_test_vector[0]), len(spam_test_vector), len(ham_test_vector))

33579 375 918


In [755]:
test_vector = tag_and_train(spam_test_vector, ham_test_vector)

In [756]:
test_vector1 = np.array(test_vector)

In [757]:
sum(test_vector1[0])

157

In [758]:
print(len(test_vector1[0]), len(test_vector1))

33580 1293


In [759]:
def euclidean_distance(train_vector_instance, test_vector_instance, norm):
    distance = np.linalg.norm(train_vector_instance[:-1]-test_vector_instance[:-1], ord=norm)
    return distance  

In [760]:
def get_neighbors(train_vector, test_vector_instance, k, norm):
    distance = []
    norm = norm
    for x in range(len(train_vector)):
        dist = euclidean_distance(train_vector[x], test_vector_instance, norm)
        distance.append((train_vector[x], dist))
    distance.sort(key=operator.itemgetter(1))
    neighbors = []
    for x in range(k):
        neighbors.append(distance[x][0])
    return neighbors

In [761]:
def get_response(neighbors):
    class_counter = Counter()
    for x in range(len(neighbors)):
        response = neighbors[x][-1]
        class_counter[response] += 1        
        return class_counter.most_common(1)[0][0]


In [762]:
def accuracy_NN(test_vector, prediction):
    correct = 0
    for x in range(len(test_vector)):
        if test_vector[x][-1] == prediction[x]:
            correct += 1
    return ((correct / float(len(test_vector))) * 100.0)
            

In [763]:
prediction = []
k = 5
norm = 2
for x in range(len(test_vector1)):
    neighbors = get_neighbors(train_vector1, test_vector1[x], k, norm)
    result = get_response(neighbors)
    prediction.append(result)
accuracy = accuracy_NN(test_vector1, prediction)
print('The accuarcy for '+ str(k) + '-NN is ' + str(accuracy))


The accuarcy for 5-NN is 89.7138437741686


In [764]:
prediction1 = []
k = 5
norm = np.inf
for x in range(len(test_vector1)):
    neighbors = get_neighbors(train_vector1, test_vector1[x], k, norm)
    result = get_response(neighbors)
    prediction1.append(result)
accuracy_3 = accuracy_NN(test_vector1, prediction1)
print('The accuarcy for '+ str(k) + '-NN is ' + str(accuracy_3))

The accuarcy for 5-NN is 78.11291569992265


In [765]:
prediction1 = []
k = 5
norm = 1
for x in range(len(test_vector1)):
    neighbors = get_neighbors(train_vector1, test_vector1[x], k, norm)
    result = get_response(neighbors)
    prediction1.append(result)
accuracy_3 = accuracy_NN(test_vector1, prediction1)
print('The accuarcy for '+ str(k) + '-NN is ' + str(accuracy_3))

The accuarcy for 5-NN is 80.81979891724671


In [766]:
prediction = []
k = 10
norm = 2
for x in range(len(test_vector1)):
    neighbors = get_neighbors(train_vector1, test_vector1[x], k, norm)
    result = get_response(neighbors)
    prediction.append(result)
accuracy = accuracy_NN(test_vector1, prediction)
print('The accuarcy for '+ str(k) + '-NN is ' + str(accuracy))

The accuarcy for 10-NN is 89.7138437741686


In [767]:
prediction = []
k = 10
norm = 1
for x in range(len(test_vector1)):
    neighbors = get_neighbors(train_vector1, test_vector1[x], k, norm)
    result = get_response(neighbors)
    prediction.append(result)
accuracy = accuracy_NN(test_vector1, prediction)
print('The accuarcy for '+ str(k) + '-NN is ' + str(accuracy))

The accuarcy for 10-NN is 80.81979891724671


In [768]:
prediction = []
k = 10
norm = np.inf
for x in range(len(test_vector1)):
    neighbors = get_neighbors(train_vector1, test_vector1[x], k, norm)
    result = get_response(neighbors)
    prediction.append(result)
accuracy = accuracy_NN(test_vector1, prediction)
print('The accuarcy for '+ str(k) + '-NN is ' + str(accuracy))

The accuarcy for 10-NN is 78.11291569992265


# KD TREE

In [383]:
def magnitude(vector_list, norm = 2):
    magnitude_v = []
    for vector in vector_list:
        temp = [numpy.linalg.norm(vector[:-1], ord=norm), vector[-1]]
        magnitude_v.append(temp) 
    magnitude_v.sort(key=operator.itemgetter(0))
    
    return magnitude_v

In [384]:
normed_train = magnitude(train_vector, norm=2)
normed_test = magnitude(test_vector, norm=2)

In [382]:
def kdtree(train_vector_norm, test_vector,k):
    median = int(len(train_vector_norm) / 2)
    if len(train_vector_norm) == k :
        neighbor = train_vector_norm
        return neighbor
    else:
        if test_vector[0] < train_vector_norm[median][0]:
            neighbor = kdtree(train_vector_norm[:median],test_vector,k)
            return neighbor
        elif test_vector[0] >= train_vector_norm[median][0]:
            neighbor = kdtree(train_vector_norm[median:],test_vector,k)
            return neighbor
    


In [385]:
def get_response(neighbor):
    response_label = {}
    for x in range(len(neighbor)):
        response = neighbor[x][-1]
        if response in response_label:
            response_label[response] += 1
        else:
            response_label[response] = 1
    sorted_label = sorted(response_label.items(), key = operator.itemgetter(1), reverse = True)
    return sorted_label[0][0]

In [386]:
def accuracy_NN(test_vector, prediction):
    correct = 0
    for x in range(len(test_vector)):
        if test_vector[x][-1] == prediction[x]:
            correct += 1
    return ((correct / float(len(test_vector))) * 100.0)

In [397]:
prediction = []
k = 1
norm = 2
for x in normed_test:
    neighbors = kdtree(normed_train, x, k)
    result = get_response(neighbors)
    prediction.append(result)
accuracy = accuracy_NN(test_vector, prediction)
print('The accuarcy for '+ str(k) + '-NN is' + str(accuracy))

The accuarcy for 1-NN is66.3768115942029


In [392]:
normed_test[0][0]

1.4142135623730951

In [394]:
normed_train[123][0]

2.8284271247461903

In [399]:
l = np.array([1,2,5,1])
b = np.array([1,2,3,4])

a = numpy.linalg.norm(l-b)

In [401]:
l = [1,2,3,4]
l[:-1]

[1, 2, 3]

# DECISION TREE

In [667]:
def word_bag(email_lemma, most_common):
    x = most_common
    clean_dict = {}
    for i in email_lemma:
        for word in i:
            if word not in clean_dict:
                clean_dict[word] = 1
            else:
                clean_dict[word] = clean_dict[word] + 1
    
    word_bag = {}
    count_dict = Counter(clean_dict)
    count_dict = count_dict.most_common(x)
    count_dict.sort(key=operator.itemgetter(0))
    for i in count_dict:
        word_bag[i[0]] = i[1]
    return word_bag

In [668]:
'''word_bag is dictioanry of words as keys and their occurence as their value'''
word_bag = word_bag(train_clean, 10000)  #original word bag with no most_common is 36345

In [669]:
def vectorize(email_list, word_bag):     #email_list is list of list of words in meail
    vector = []
    for email in email_list:
        email_vector = []
        for key in word_bag:
            temp = email.count(key)
            email_vector.append(temp)
        vector.append(email_vector)
    
    return vector

In [670]:
spam_train_vector = vectorize(spam_train_clean, word_bag)
ham_train_vector = vectorize(ham_train_clean, word_bag)

In [671]:
def tag_and_train(spam_vector, ham_vector):
    
    for vector in spam_vector:
        vector.append(1)
    for vector in ham_vector:
        vector.append(0)

    train_vector = spam_vector + ham_vector
    
    return train_vector

In [672]:
train_vector = tag_and_train(spam_train_vector, ham_train_vector)

In [673]:
spam_test_vector = vectorize(spam_test_clean, word_bag)
ham_test_vector = vectorize(ham_test_clean, word_bag)

In [674]:
test_vector = tag_and_train(spam_test_vector, ham_test_vector)

In [675]:
features = list(word_bag.keys())

In [676]:
def unique_vals(rows, col):
    """Find the unique values for a column in a dataset."""
    return set([row[col] for row in rows])

In [677]:
def class_counts(rows):
    """Counts the number of each type of example in a dataset."""
    counts = {}  # a dictionary of label -> count.
    for row in rows:
        # in our dataset format, the label is always the last column
        label = row[-1]
        if label not in counts:
            counts[label] = 0
        counts[label] += 1
    return counts

In [678]:
class Question:
    """A Question is used to partition a dataset."""

    def __init__(self, column, value):
        self.column = column
        self.value = value

    def match(self, example):
        # Compare the feature value in an example to the
        # feature value in this question.
        val = example[self.column]
        return val >= self.value
        
    def __repr__(self):
        # This is just a helper method to print
        # the question in a readable format.
        condition = ">="
        return "Is %s %s %s?" % (
            features[self.column], condition, str(self.value))

In [679]:
t= Question(1,1)

In [680]:
t.match(train_vector[0])

False

In [681]:
def partition(rows, question):
    """Partitions a dataset.

    For each row in the dataset, check if it matches the question. If
    so, add it to 'true rows', otherwise, add it to 'false rows'.
    """
    true_rows, false_rows = [], []
    for row in rows:
        if question.match(row):
            true_rows.append(row)
        else:
            false_rows.append(row)
    return true_rows, false_rows

In [682]:
true_rows, false_rows = partition(train_vector, Question(3, 1))

In [683]:
len(true_rows)

5

In [684]:
def gini(rows):
    """Calculate the Gini Impurity for a list of rows."""
    counts = class_counts(rows)
    impurity = 1
    for lbl in counts:
        prob_of_lbl = counts[lbl] / float(len(rows))
        impurity -= prob_of_lbl**2
    return impurity

In [685]:
current_uncertainty = gini(train_vector)
current_uncertainty

0.41185481381606737

In [686]:
def info_gain(left, right, current_uncertainty):
    """Information Gain.

    The uncertainty of the starting node, minus the weighted impurity of
    two child nodes.
    """
    p = float(len(left)) / (len(left) + len(right))
    return current_uncertainty - p * gini(left) - (1 - p) * gini(right)

In [687]:
true_rows, false_rows = partition(train_vector, Question(8, 2))
info_gain(true_rows, false_rows, current_uncertainty)

0.0

In [688]:
def find_best_split(rows):
    """Find the best question to ask by iterating over every feature / value
    and calculating the information gain."""
    best_gain = 0  # keep track of the best information gain
    best_question = None  # keep train of the feature / value that produced it
    current_uncertainty = gini(rows)
    n_features = len(rows[0]) - 1  # number of columns

    for col in range(n_features):  # for each feature

        values = set([row[col] for row in rows])  # unique values in the column

        for val in values:  # for each value

            question = Question(col, val)

            # try splitting the dataset
            true_rows, false_rows = partition(rows, question)

            # Skip this split if it doesn't divide the
            # dataset.
            if len(true_rows) == 0 or len(false_rows) == 0:
                continue

            # Calculate the information gain from this split
            gain = info_gain(true_rows, false_rows, current_uncertainty)

            # You actually can use '>' instead of '>=' here
            # but I wanted the tree to look a certain way for our
            # toy dataset.
            if gain >= best_gain:
                best_gain, best_question = gain, question

    return best_gain, best_question

In [689]:
best_gain, best_question = find_best_split(train_vector)
best_question

Is enron >= 1?

In [690]:
class Leaf:
    """A Leaf node classifies data.

    This holds a dictionary of class (e.g., "Spam(1)") -> number of times
    it appears in the rows from the training data that reach this leaf.
    """

    def __init__(self, rows):
        self.predictions = class_counts(rows)

In [691]:
class Decision_Node:
    """A Decision Node asks a question.

    This holds a reference to the question, and to the two child nodes.
    """

    def __init__(self,
                 question,
                 true_branch,
                 false_branch):
        self.question = question
        self.true_branch = true_branch
        self.false_branch = false_branch

In [692]:
def build_tree(rows):
    """Builds the tree."""

    # Try partitioing the dataset on each of the unique attribute,
    # calculate the information gain,
    # and return the question that produces the highest gain.
    gain, question = find_best_split(rows)

    # Base case: no further info gain
    # Since we can ask no further questions,
    # we'll return a leaf.
    if gain == 0:
        return Leaf(rows)

    # If we reach here, we have found a useful feature / value
    # to partition on.
    true_rows, false_rows = partition(rows, question)

    # Recursively build the true branch.
    true_branch = build_tree(true_rows)

    # Recursively build the false branch.
    false_branch = build_tree(false_rows)

    # Return a Question node.
    # This records the best feature / value to ask at this point,
    # as well as the branches to follow
    # dependingo on the answer.
    return Decision_Node(question, true_branch, false_branch)

In [693]:
decision_tree = build_tree(train_vector)

In [694]:
def classify(row, node):
    """See the 'rules of recursion' above."""

    # Base case: we've reached a leaf
    if isinstance(node, Leaf):
        return node.predictions

    # Decide whether to follow the true-branch or the false-branch.
    # Compare the feature / value stored in the node,
    # to the example we're considering.
    if node.question.match(row):
        return classify(row, node.true_branch)
    else:
        return classify(row, node.false_branch)

In [695]:
classify(train_vector[5], decision_tree)

{1: 1}

In [696]:
def accuracy_DT(test_vector, prediction):
    correct = 0
    for x in range(len(test_vector)):
        if test_vector[x][-1] == prediction[x]:
            correct += 1
    return ((correct / float(len(test_vector))) * 100.0)

In [666]:
prediction = []
for row in test_vector:
    pred_dict = classify(row, decision_tree)
    prediction_value = max(pred_dict.items(), key=operator.itemgetter(1))[0]
    prediction.append(prediction_value)
accuracy = accuracy_DT(test_vector, prediction)
print('The accuarcy for Decision Tree with 10 feature size is '+ str(accuracy))
    

The accuarcy for Decision Tree with 10 feature size is 79.5169082125604


In [697]:
prediction1 = []
for row in test_vector:
    pred_dict = classify(row, decision_tree)
    prediction_value = max(pred_dict.items(), key=operator.itemgetter(1))[0]
    prediction1.append(prediction_value)
accuracy = accuracy_DT(test_vector, prediction1)
print('The accuarcy for Decision Tree for 10,000 feature is '+ str(accuracy))

The accuarcy for Decision Tree for 10,000 feature is 95.65217391304348
