# Note
In order to make the code work paste the FOLLOWING files provided in the aima-data folder attached in the zip file to your AIMA aima-data folder

* haberman.csv
* audit_risk_data.csv
* audit_risk_data_test.csv
* nursery_test.csv
* nursery.csv

In [234]:
import os
import sys

# configuring to  access the  AIMA repo and access some extras
if sys.platform == "win32":
    module_path = os.path.abspath(os.path.join('..\..'))
    ExtraFolderPath = os.path.abspath(os.path.join('..\EXTRA_DIR'))
else:
    module_path = os.path.abspath(os.path.join('../..'))
    ExtraFolderPath = os.path.abspath(os.path.join('../EXTRA_DIR'))
    
if module_path not in sys.path:
    sys.path.append(module_path)
    sys.path.append(ExtraFolderPath)
sys.path.insert(1, module_path) 
sys.path.insert(2, module_path)

In [235]:
# import the required libraries from AIMA and other sources

from probability import *
from utils import print_table
from notebook import psource, pseudocode, heatmap
from learning import *
from collections import defaultdict

# 1.BASICS

## 1.1 Probability Distribution Table 

In [236]:
P_Tip1=ProbDist(varname='Tip1',freqs={'Never':1,'Rarely':4, 'Sometimes':6, 'Often':12, 'Always':23})
P_Tip2=ProbDist(varname='Tip2',freqs={'Never':12,'Rarely':4, 'Sometimes':12, 'Often':4, 'Always':2})
P_Tip3=ProbDist(varname='Tip3',freqs={'Never':24,'Rarely':2, 'Sometimes':5, 'Often':4, 'Always':4})

In [237]:
P_Tip1.values

['Never', 'Rarely', 'Sometimes', 'Often', 'Always']

In [238]:
# variables={ 'Tip1':P_Tip1,'Tip2':P_Tip2,'Tip3':P_Tip3 }

# for variable_name,Prob_obj in variables.items():
    
#     for freq in Prob_obj.values:
        
#         print('Probability Distribution for {0} for Freq: {1} is {2}'.format(variable_name,freq,Prob_obj[freq]))
        
#     print('\n')

In [239]:
variables={ 'Tip1':P_Tip1,
           'Tip2':P_Tip2,'Tip3':P_Tip3 }
prob_dist_table=[]
for variable_name,Prob_obj in variables.items():
    prob_dist_for_this_feature=[]
    prob_dist_for_this_feature.append(variable_name)
#     print('Variable Name:',variable_name)
#     print('Variable Name from list:',prob_dist_for_this_feature[0])
#     print('Prob_obj.values:',Prob_obj.values)
    
    for freq in Prob_obj.values:        
        prob_dist_for_this_feature.append(Prob_obj[freq])
#     print('This feature:',prob_dist_for_this_feature)
    
    prob_dist_table.append(prob_dist_for_this_feature)    
print(prob_dist_table)

[['Tip1', 0.021739130434782608, 0.08695652173913043, 0.13043478260869565, 0.2608695652173913, 0.5], ['Tip2', 0.35294117647058826, 0.11764705882352941, 0.35294117647058826, 0.11764705882352941, 0.058823529411764705], ['Tip3', 0.6153846153846154, 0.05128205128205128, 0.1282051282051282, 0.10256410256410256, 0.10256410256410256]]


In [240]:
P_Tip1.values[0]='   '
P_Tip1.values

['   ', 'Rarely', 'Sometimes', 'Often', 'Always']

In [241]:

print_table(table=prob_dist_table, header=P_Tip1.values, sep='   ', numfmt='{:.2f}')

       Rarely   Sometimes   Often   Always
Tip1     0.02        0.09    0.13     0.26
Tip2     0.35        0.12    0.35     0.12
Tip3     0.62        0.05    0.13     0.10


## 1.2 Bayesian Networks 

## Rationale behind the world view 

## Visual Depiction of the Bayesian Network and associated probabilities


<img src="https://www.dropbox.com/s/s1tupltugjdulwe/BayesNet.PNG?raw=1" alt="data1" border="0">

## Generating the node objects for every random variable as in above diagram

In [242]:
AI_node        =   BayesNode('AI', '', 0.2)
employed_node =    BayesNode('Employed', '', 0.70)
fossil_fuel_node = BayesNode('FossilFuel', '', 0.70)
renewables_node =  BayesNode('Renewables', '', 0.40)


traffic_node = BayesNode('Traffic', ['AI', 'Employed'], 
                       {(True, True): 0.65,(True, False): 0.30, (False, True): 0.95, (False, False): 0.35})

global_warming_node = BayesNode('Global Warming', ['FossilFuel', 'Traffic', 'Renewables'], 
                       {(True,True, True): 0.80,(True,True, False): 0.98, (True,False, True): 0.04, (True,False, False): 0.70,
                        (False,True, True): 0.30,(False,True, False): 0.45, (False,False, True): 0.20, (False,False, False): 0.10
                       })

In [243]:
global_warming_node.p(False, {'FossilFuel':True, 'Traffic':True, 'Renewables':False, 'AI':True,'Employed':False})

0.020000000000000018

In [244]:
AI_node.p(False,{})

0.8

## Creating a Bayesian Network based on the probabilities from the parents nodes.

In [245]:
# T, F = True, False

world = BayesNet([
    ('AI', '', 0.2),
    ('Employed', '', 0.7),
    ('FossilFuel', '', 0.7),
    ('Renewables', '', 0.7),
    ('Traffic', ['AI', 'Employed'], {(True, True): 0.65,(True, False): 0.30, (False, True): 0.95, (False, False): 0.35}),
                       
    
    ('Global Warming', ['FossilFuel', 'Traffic', 'Renewables'], {(True,True, True): 0.80,(True,True, False): 0.98, (True,False, True): 0.04, (True,False, False): 0.70,
                                                                (False,True, True): 0.30,(False,True, False): 0.45, (False,False, True): 0.20, (False,False, False): 0.10
                                                               })
                                                               
])

## Below is how the world view bayesian network look like, similar to our diagram

In [246]:
print(world)

BayesNet([('AI', ''), ('Employed', ''), ('FossilFuel', ''), ('Renewables', ''), ('Traffic', 'AI Employed'), ('Global Warming', 'FossilFuel Traffic Renewables')])


## Querying the Network
* Here we are querying the network to provide the probability of `Global Warming` in an ideal world where 
    there will be no use of fossil fuels for energy generation, more renewable energy and less traffic.
* The network says it shall reduce Global warming drastically. :)


In [247]:
ans_dist = enumeration_ask('Global Warming', {'FossilFuels': False, 'Traffic': False, 'Renewables':True}, world)
ans_dist[True]

0.08800000000000002

# 2.NAIVE BAYES
## 2.1 DATA
* Below data is from the UCI Repo. This data is about the applications for Nursery Schools in United States. The attributes deal with the various factors such as Family Social and Financial background, health etc.


# Data Set 1

### Importing the Dataset

In [248]:
attribute_names=['parents','has_nurs','form','children','housing' ,'finance' ,'social','health','target']

In [249]:
nursery_dataset= DataSet(examples=None, attrs=range(9), attrnames=attribute_names, target=-1,
                 inputs=None, values=None, distance=mean_boolean_error,
                 name='nursery', source='', exclude=())


print(nursery_dataset)

<DataSet(nursery): 1000 examples, 9 attributes>


## Compute Prior  Probability of each Class:

Turning priority ==> 1 and Non Recommended ==> 0

### Removing two minority classes from the target collumn and turning the target into binary

In [250]:
nursery_dataset.remove_examples("recommend")
nursery_dataset.remove_examples("very_recom")

print('Target Classes',nursery_dataset.values[nursery_dataset.target])

print("Class of first example:",nursery_dataset.examples[14][nursery_dataset.target])
nursery_dataset.classes_to_numbers()
print("Class of first example:",nursery_dataset.examples[14][nursery_dataset.target])


Target Classes ['priority', 'not_recom']
Class of first example: priority
Class of first example: 1


### Calculating the target distribution

In [251]:
target_dictionary=defaultdict(int)
def add(o):   
    target_dictionary[o] += 1


# Finding the target Distribution
for example in nursery_dataset.examples:
    targetval = example[nursery_dataset.target]
    target_dictionary[targetval] += 1
print(target_dictionary)

# calculating the total number of observations
number_of_observations=0
for count in target_dictionary.values():
    number_of_observations+=count 
    
print('number_of_observations',number_of_observations)

defaultdict(<class 'int'>, {1: 579, 0: 333})
number_of_observations 912


## Prior Probability of Priority

In [252]:
priority=1
non_recommended=0
print(target_dictionary[priority]/number_of_observations*100,'Percent')

63.48684210526315 Percent


## Prior Probability of Not Recommended

In [253]:
print(target_dictionary[non_recommended]/number_of_observations*100,'Percent')

36.51315789473684 Percent


# Calculating Probability of Evidence

In [254]:
# This method put the values given into different buckets and returns a dictionary
def create_bucket(values):
    bucket=defaultdict(int)
    for val in values:
        bucket[val]+=1
    return bucket

# A helper function to print probabilities
def print_probabilites(target_dictionary):
    number_of_observations=0
    for count in target_dictionary.values():
        number_of_observations+=count 
    for key in target_dictionary.keys():
        print('Probability of ',key,' is ',target_dictionary[key]/number_of_observations*100,'Percent')



In [255]:
target_vals = nursery_dataset.values[0]
for attribute_name, attribute_index in zip(nursery_dataset.attrnames,nursery_dataset.inputs):
    print('Attribute Name: ',attribute_name)
    target_v =[example[attribute_index] for example in nursery_dataset.examples]
    attribute_dict=create_bucket(target_v)
    print(attribute_dict)
    print_probabilites(attribute_dict)
    print('\n\n')



Attribute Name:  parents
defaultdict(<class 'int'>, {'usual': 912})
Probability of  usual  is  100.0 Percent



Attribute Name:  has_nurs
defaultdict(<class 'int'>, {'proper': 798, 'less_proper': 114})
Probability of  proper  is  87.5 Percent
Probability of  less_proper  is  12.5 Percent



Attribute Name:  form
defaultdict(<class 'int'>, {'complete': 306, 'completed': 196, 'incomplete': 202, 'foster': 208})
Probability of  complete  is  33.55263157894737 Percent
Probability of  completed  is  21.49122807017544 Percent
Probability of  incomplete  is  22.149122807017545 Percent
Probability of  foster  is  22.807017543859647 Percent



Attribute Name:  children
defaultdict(<class 'int'>, {1: 228, 2: 242, 3: 234, 'more': 208})
Probability of  1  is  25.0 Percent
Probability of  2  is  26.535087719298247 Percent
Probability of  3  is  25.657894736842106 Percent
Probability of  more  is  22.807017543859647 Percent



Attribute Name:  housing
defaultdict(<class 'int'>, {'convenient': 290, 'l

# Probability of Likelihood of Evidence

In [256]:
target_dictionary=defaultdict(int)
def add(o):   
    target_dictionary[o] += 1
#     number_of_observations += 1

# target_dictionary=defaultdict(int)
number_of_observations=0
target_vals = nursery_dataset.values[nursery_dataset.target]

for example in nursery_dataset.examples:
    targetval = example[nursery_dataset.target]
    add(targetval)
    
print(target_dictionary)
number_of_observations=0
for count in target_dictionary.values():
    number_of_observations+=count 
    
print('number_of_observations',number_of_observations)

defaultdict(<class 'int'>, {1: 579, 0: 333})
number_of_observations 912


In [257]:
print(nursery_dataset.values[nursery_dataset.target])
print(nursery_dataset.inputs)
a=[example[1] for example in nursery_dataset.examples if example[nursery_dataset.target]==1]
print(len(a))

['priority', 'not_recom']
[0, 1, 2, 3, 4, 5, 6, 7]
579


In [258]:
# this method calculates the likelihood of evidence given the dataset object
def calculate_likekihood_ofEvidence(nursery_dataset):

    likelihood_evidence_dict=defaultdict(int)
    possible_target_class=[0, 1]

    # indexes of input features
    input_feature_indexes=nursery_dataset.inputs

    

    # For every feature index, filter the collumn based on the corresponding target class 
    # Put every filtered data point into respective bucket and return the dictionary of counts for each
    # feature class given the target class
    for i in input_feature_indexes:
        for c in possible_target_class:
            relevent_examples=[example[i] for example in nursery_dataset.examples if example[nursery_dataset.target]==c]

            for r_example in relevent_examples:
                
                likelihood_evidence_dict[(i,r_example,c)]+=1
                

    

    return likelihood_evidence_dict


In [259]:
calculate_likekihood_ofEvidence(nursery_dataset)


defaultdict(int,
            {(0, 'usual', 0): 333,
             (0, 'usual', 1): 579,
             (1, 'proper', 0): 288,
             (1, 'less_proper', 0): 45,
             (1, 'proper', 1): 510,
             (1, 'less_proper', 1): 69,
             (2, 'complete', 0): 117,
             (2, 'completed', 0): 72,
             (2, 'incomplete', 0): 72,
             (2, 'foster', 0): 72,
             (2, 'complete', 1): 189,
             (2, 'completed', 1): 124,
             (2, 'incomplete', 1): 130,
             (2, 'foster', 1): 136,
             (3, 1, 0): 90,
             (3, 2, 0): 90,
             (3, 3, 0): 81,
             (3, 'more', 0): 72,
             (3, 1, 1): 138,
             (3, 2, 1): 152,
             (3, 3, 1): 153,
             (3, 'more', 1): 136,
             (4, 'convenient', 0): 114,
             (4, 'less_conv', 0): 111,
             (4, 'critical', 0): 108,
             (4, 'convenient', 1): 176,
             (4, 'less_conv', 1): 195,
             (4, 'critic

## Time to calculate the Likelihood of evidence
### Configure the variables `input_index`  `input_class` `target_class` to find the respective likelihood

In [260]:
likelihood_dict=calculate_likekihood_ofEvidence(nursery_dataset)
target_bucket=create_bucket( [example[nursery_dataset.target] for example in nursery_dataset.examples])
print(target_bucket)

# Index number of the input feature
input_index=1

# Input feature class for the above input feature
input_class='less_proper'

# Target class [0,1]
target_class=0

print('Numerator:',likelihood_dict[(input_index,input_class,target_class)])
print('Denominator: ',target_bucket[target_class])
likelihood=likelihood_dict[(input_index,input_class,target_class)] /target_bucket[target_class]
print('The Likelihood for Input Index:{0} Input Class:{1} Target Class:{2} is {3}'.format(input_index,input_class,target_class,likelihood))

defaultdict(<class 'int'>, {1: 579, 0: 333})
Numerator: 45
Denominator:  333
The Likelihood for Input Index:1 Input Class:less_proper Target Class:0 is 0.13513513513513514


# Data Set 2

### Importing the Dataset

In [261]:
attribute_names=['Age','Year','Nodes_Detected','status']

In [262]:
haberman_dataset= DataSet(examples=None, attrs=range(4), attrnames=attribute_names, target=-1,
                 inputs=None, values=None, distance=mean_boolean_error,
                 name='haberman', source='', exclude=())


print(haberman_dataset)

<DataSet(haberman): 306 examples, 4 attributes>


## Compute Prior  Probability of each Class:

Survived More than 5 years post surgery ==> 1 
Survived less than 5 years post surgery  ==> 2

### Calculating the target distribution

In [263]:
target_dictionary=defaultdict(int)
def add(o):   
    target_dictionary[o] += 1


# Finding the target Distribution
for example in haberman_dataset.examples:
    targetval = example[haberman_dataset.target]
    target_dictionary[targetval] += 1
print(target_dictionary)

# calculating the total number of observations
number_of_observations=0
for count in target_dictionary.values():
    number_of_observations+=count 
    
print('number_of_observations',number_of_observations)

defaultdict(<class 'int'>, {1: 225, 2: 81})
number_of_observations 306


###  Prior Probability of Survived More than 5 years

In [264]:
Survived=1
Not_survived=2
print(target_dictionary[Survived]/number_of_observations*100,'Percent')

73.52941176470588 Percent


### Prior Probability of Not Recommended

In [265]:
print(target_dictionary[Not_survived]/number_of_observations*100,'Percent')

26.47058823529412 Percent


# Calculating Probability of Evidence

In [266]:
# This method put the values given into different buckets and returns a dictionary
def create_bucket(values):
    bucket=defaultdict(int)
    for val in values:
        bucket[val]+=1
    return bucket

# A helper function to print probabilities
def print_probabilites(target_dictionary):
    number_of_observations=0
    for count in target_dictionary.values():
        number_of_observations+=count 
    for key in target_dictionary.keys():
        print('Probability of ',key,' is ',target_dictionary[key]/number_of_observations*100,'Percent')



In [267]:
target_vals = [1,2]
for attribute_name, attribute_index in zip(haberman_dataset.attrnames,haberman_dataset.inputs):
    print('Attribute Name: ',attribute_name)
    target_v =[example[attribute_index] for example in haberman_dataset.examples]
    attribute_dict=create_bucket(target_v)
    print(attribute_dict)
    print_probabilites(attribute_dict)
    print('\n\n')



Attribute Name:  Age
defaultdict(<class 'int'>, {30: 3, 31: 2, 33: 2, 34: 7, 35: 2, 36: 2, 37: 6, 38: 10, 39: 6, 40: 3, 41: 10, 42: 9, 43: 11, 44: 7, 45: 9, 46: 7, 47: 11, 48: 7, 49: 10, 50: 12, 51: 6, 52: 14, 53: 11, 54: 13, 55: 10, 56: 7, 57: 11, 58: 7, 59: 8, 60: 6, 61: 9, 62: 7, 63: 8, 64: 5, 65: 10, 66: 5, 67: 6, 68: 2, 69: 4, 70: 7, 71: 1, 72: 4, 73: 2, 74: 2, 75: 1, 76: 1, 77: 1, 78: 1, 83: 1})
Probability of  30  is  0.9803921568627451 Percent
Probability of  31  is  0.6535947712418301 Percent
Probability of  33  is  0.6535947712418301 Percent
Probability of  34  is  2.287581699346405 Percent
Probability of  35  is  0.6535947712418301 Percent
Probability of  36  is  0.6535947712418301 Percent
Probability of  37  is  1.9607843137254901 Percent
Probability of  38  is  3.2679738562091507 Percent
Probability of  39  is  1.9607843137254901 Percent
Probability of  40  is  0.9803921568627451 Percent
Probability of  41  is  3.2679738562091507 Percent
Probability of  42  is  2.941176470

# Probability of Likelihood of Evidence

In [268]:
target_dictionary=defaultdict(int)
def add(o):   
    target_dictionary[o] += 1

number_of_observations=0
target_vals = haberman_dataset.values[haberman_dataset.target]

for example in haberman_dataset.examples:
    targetval = example[haberman_dataset.target]
    add(targetval)
    
print(target_dictionary)
number_of_observations=0
for count in target_dictionary.values():
    number_of_observations+=count 
    
print('number_of_observations',number_of_observations)

defaultdict(<class 'int'>, {1: 225, 2: 81})
number_of_observations 306


In [269]:
# this method calculates the likelihood of evidence given the dataset object
def calculate_likekihood_ofEvidence(nursery_dataset):

    likelihood_evidence_dict=defaultdict(int)
    
    possible_target_class = nursery_dataset.values[nursery_dataset.target]

    # indexes of input features
    input_feature_indexes=nursery_dataset.inputs

    

    # For every feature index, filter the collumn based on the corresponding target class 
    # Put every filtered data point into respective bucket and return the dictionary of counts for each
    # feature class given the target class
    for i in input_feature_indexes:
        for c in possible_target_class:
            relevent_examples=[example[i] for example in nursery_dataset.examples if example[nursery_dataset.target]==c]

            for r_example in relevent_examples:
                
                likelihood_evidence_dict[(i,r_example,c)]+=1
                

    

    return likelihood_evidence_dict


In [270]:
calculate_likekihood_ofEvidence(haberman_dataset)


defaultdict(int,
            {(0, 30, 1): 3,
             (0, 31, 1): 2,
             (0, 33, 1): 2,
             (0, 34, 1): 5,
             (0, 35, 1): 2,
             (0, 36, 1): 2,
             (0, 37, 1): 6,
             (0, 38, 1): 9,
             (0, 39, 1): 5,
             (0, 40, 1): 3,
             (0, 41, 1): 7,
             (0, 42, 1): 7,
             (0, 43, 1): 7,
             (0, 44, 1): 4,
             (0, 45, 1): 6,
             (0, 46, 1): 3,
             (0, 47, 1): 8,
             (0, 48, 1): 4,
             (0, 49, 1): 8,
             (0, 50, 1): 10,
             (0, 51, 1): 4,
             (0, 52, 1): 10,
             (0, 53, 1): 5,
             (0, 54, 1): 9,
             (0, 55, 1): 8,
             (0, 56, 1): 5,
             (0, 57, 1): 8,
             (0, 58, 1): 7,
             (0, 59, 1): 7,
             (0, 60, 1): 4,
             (0, 61, 1): 6,
             (0, 62, 1): 4,
             (0, 63, 1): 7,
             (0, 64, 1): 5,
             (0, 65, 1): 6,
 

## Time to calculate the Likelihood of evidence
### Configure the variables `input_index`  `input_class` `target_class` to find the respective likelihood

In [271]:
likelihood_dict=calculate_likekihood_ofEvidence(haberman_dataset)
target_bucket=create_bucket( [example[haberman_dataset.target] for example in haberman_dataset.examples])
print(target_bucket)

# Index number of the input feature
input_index=0

# Input feature class for the above input feature
input_class=34

# Target class [1,2]
target_class=2

print('Numerator:',likelihood_dict[(input_index,input_class,target_class)])
print('Denominator: ',target_bucket[target_class])
likelihood=likelihood_dict[(input_index,input_class,target_class)] /target_bucket[target_class]
print('The Likelihood for Input Index:{0} Input Class:{1} Target Class:{2} is {3}'.format(input_index,input_class,target_class,likelihood))

defaultdict(<class 'int'>, {1: 225, 2: 81})
Numerator: 2
Denominator:  81
The Likelihood for Input Index:0 Input Class:34 Target Class:2 is 0.024691358024691357


# 2.2 Naive Bayes Learner

* source learning.ipynb

We have a dataset with a set of classes (**C**) and we want to classify an item with a set of features (**F**). Essentially what we want to do is predict the class of an item given the features.

For a specific class, **Class**, we will find the conditional probability given the item features:

$$P(Class|F) = \dfrac{P(F|Class)*P(Class)}{P(F)}$$

We will do this for every class and we will pick the maximum. This will be the class the item is classified in.

The features though are a vector with many elements. We need to break the probabilities up using the multiplication rule. Thus the above equation becomes:

$$P(Class|F) = \dfrac{P(Class)*P(F_{1}|Class)*P(F_{2}|Class)*...*P(F_{n}|Class)}{P(F_{1})*P(F_{2})*...*P(F_{n})}$$

The calculation of the conditional probability then depends on the calculation of the following:

*a)* The probability of **Class** in the dataset.

*b)* The conditional probability of each feature occurring in an item classified in **Class**.

*c)* The probabilities of each individual feature.

# Discrete Learner

## we shall use below functions developed to compute the Likelihood of evidence and Prior Probability to find the probability given the evidence and shall choose the max probability.

In [272]:
def likelihood_ofEvidence(nursery_dataset,possible_target_class=[0, 1]):

    likelihood_evidence_dict=defaultdict(int)
    likelihood_evidence_distribution_dict=defaultdict(int)
    

    
    input_feature_indexes=nursery_dataset.inputs

    target_bucket=create_bucket( [example[nursery_dataset.target] for example in nursery_dataset.examples])
    
    

    
    # counting the feature classes given the evidence from the examples.
    for i in input_feature_indexes:
        for c in possible_target_class:
            relevent_examples=[example[i] for example in nursery_dataset.examples if example[nursery_dataset.target]==c]

            for r_example in relevent_examples:
                
                likelihood_evidence_dict[(i,r_example,c)]+=1
    
    # For every feature index, filter the collumn based on the corresponding target class 
    # Put every filtered data point into respective bucket and return the dictionary of counts for each
    # feature class given the target class
    
    for i in input_feature_indexes:
        for c in possible_target_class:
            relevent_examples=[example[i] for example in nursery_dataset.examples if example[nursery_dataset.target]==c]

            for r_example in relevent_examples:
                
                likelihood_evidence_distribution_dict[(i,r_example,c)]= likelihood_dict[(i,r_example,c)] /target_bucket[c]

    
#     print(likelihood_evidence_dict)
#     print('\n\n')
#     print(likelihood_evidence_distribution_dict)
    return likelihood_evidence_dict


In [273]:
def target_distribution(nursery_dataset,target_class_list=[0,1]):
    target_dictionary=defaultdict(int)    

    # Finding the target Distribution
    for example in nursery_dataset.examples:
        targetval = example[nursery_dataset.target]
        target_dictionary[targetval] += 1
    print(target_dictionary)

    # calculating the total number of observations
    number_of_observations=0
    for count in target_dictionary.values():
        number_of_observations+=count 

    print('number_of_observations',number_of_observations)
    
    # calculating the probability of each target class
    target_distribution_dictionary=defaultdict(int)
    
    # calculating in percentage terms
    for target_class in target_class_list:
        target_distribution_dictionary[target_class]=target_dictionary[target_class]/number_of_observations
        
    print(target_distribution_dictionary)
    return target_distribution_dictionary

# Generating Target and Attribute distribution 

In [274]:
attr_dists=likelihood_ofEvidence(nursery_dataset,possible_target_class=[0, 1])

target_dist=target_distribution(nursery_dataset,target_class_list=[0,1])

defaultdict(<class 'int'>, {1: 579, 0: 333})
number_of_observations 912
defaultdict(<class 'int'>, {0: 0.3651315789473684, 1: 0.6348684210526315})


In [275]:
 def parse_data_point(data):
    
    test_list=data.split(sep=',')
    target=test_list[-1]

    print(test_list)
    test_list.pop()
    print(test_list)
    return [test_list,target]

## Predicting the Target Class for given example

In [276]:
def predict(example):
    def class_probability(targetval):
        return (target_dist[targetval] *
                product(attr_dists[(attr,example[attr],targetval )]
                        for attr in nursery_dataset.inputs))
    return argmax(target_vals, key=class_probability)


# example1=['usual','proper','complete',1,'convenient','convenient','nonprob','priority']
# example2=['usual','proper','complete','2','critical','convenient','problematic','recommended','priority']
# example3=['usual','proper','completed','1','convenient','convenient','nonprob','not_recom']


results=[]
data_point='usual,improper,complete,3,convenient,convenient,nonprob,not_recom,not_recom'
data=parse_data_point(data_point)

# a = map(str, [usual,proper,completed,1,convenient,convenient,nonprob,not_recom,not_recom])
predicted_target_class=predict(data[0])
expected_class=data[1]
results.append([expected_class,predicted_target_class])
print('Expected:',data[1])

if predicted_target_class==1:
    print('Predicted: priority')

else:
    print('Predicted: not_recom')
    

['usual', 'improper', 'complete', '3', 'convenient', 'convenient', 'nonprob', 'not_recom', 'not_recom']
['usual', 'improper', 'complete', '3', 'convenient', 'convenient', 'nonprob', 'not_recom']
Expected: not_recom
Predicted: priority


# Testing the Learner with a test data set

`url` https://archive.ics.uci.edu/ml/machine-learning-databases/nursery/

In [277]:
attribute_names=['parents','has_nurs','form','children','housing' ,'finance' ,'social','health','target']
nursery_test_dataset= DataSet(examples=None, attrs=range(9), attrnames=attribute_names, target=-1,
                 inputs=None, values=None, distance=mean_boolean_error,
                 name='nursery_test', source='', exclude=())


print(nursery_test_dataset)
nursery_test_dataset.remove_examples("recommend")
nursery_test_dataset.remove_examples("very_recom")

print(nursery_test_dataset.values[nursery_test_dataset.target])

print("Class of first example:",nursery_test_dataset.examples[14][nursery_test_dataset.target])
nursery_test_dataset.classes_to_numbers()
print("Class of first example:",nursery_test_dataset.examples[14][nursery_test_dataset.target])
# print(nursery_test_dataset.values[nursery_test_dataset.target])
# print(len(nursery_test_dataset.values[0]))
# print()



<DataSet(nursery_test): 1000 examples, 9 attributes>
['priority', 'not_recom']
Class of first example: priority
Class of first example: 1


## Accuracy of the learner on the test data set

In [278]:
# data=parse_data_point(nursery_test_dataset.examples[0])
results=[]
for example in nursery_test_dataset.examples:

    expected_class=example[-1]
#     if expected_class==1:
#         expected_class='priority'
#     else:
#         expected_class='not_recom'
    
    predicted_target_class=predict(example)

    results.append([expected_class,predicted_target_class])
    

right_prediction_list = [result for result in results if result[0]==result[1]]
print('\n\n')


accuracy=len(right_prediction_list)/len(results)

print('Accuracy of the Model:',accuracy)

# Accuracy of the Model: 0.6392199349945829





Accuracy of the Model: 0.6392199349945829


# Continious Learner
* We shall use this data set to train and validate a Naive Bayes Continious Leaner which shall take continous value 
    and predict the target class.
* Continious Learners are different from Discrete as continous data may end up creating a huge set of likelihood  probabilities hence many distribution buckets and hence poor performance.
* We shall here assume every data to be Gaussian ( central limit theorem ) and hence use mean and standard deviation to predict the target value.

## Data set is a Audit Data set which contain information about the companies based on industries and their history.
* Attributes are used to predict if the company is a RISK or not .
* Many risk factors are examined from various areas like past records of audit office, audit-paras, environmental 
  conditions reports, firm reputation summary, on-going issues report, profit-value records, loss-value records,
  follow-up reports etc. After in-depth interview with the auditors, important risk factors are evaluated and
  their probability of existence is calculated from the present and past records.

In [279]:
attribute_names= ['Sector_score','LOCATION_ID','PARA_A','SCORE_A','PARA_B','SCORE_B','TOTAL','numbers','Marks','Money_Value','MONEY_Marks','District','Loss','LOSS_SCORE','History','History_score','Score','Risk']



print(len(attribute_names))
audit_risk_dataset= DataSet(examples=None, attrs=range(18), attrnames=attribute_names, target=-1,
                 inputs=None, values=None, distance=mean_boolean_error,
                 name='audit_risk_data', source='', exclude=())


print(audit_risk_dataset)


print(audit_risk_dataset.values[audit_risk_dataset.target])

print(audit_risk_dataset.examples[0])
# print(audit_risk_dataset.target)

# print(audit_risk_dataset.values[17])
# print(len(audit_risk_dataset.values))
# print()



18
<DataSet(audit_risk_data): 771 examples, 18 attributes>
[0, 1]
[3.89, 6, 0, 2, 4.83, 2, 4.83, 5, 2, 0.94, 2, 2, 0, 2, 0, 2, 2, 0]


In [280]:
def split_values_by_classes(iaudit_risk_dataset):
        """Split values into buckets according to their class."""
        buckets = defaultdict(list)

        for v in iaudit_risk_dataset.examples:
            target=v[-1]
            buckets[target].append(v[:-1])  # Add item to bucket of its class

        return buckets

In [281]:
def find_means_and_deviations(dataset,item_buckets):
    # Find the mean and standard deviation of each input feature class given the target class.
    
       
        target_names = dataset.values[dataset.target]
        feature_numbers = len(dataset.inputs)


        # initialize the dictionary of mean and deviations
        means = defaultdict(lambda: [0] * feature_numbers)
        deviations = defaultdict(lambda: [0] * feature_numbers)


        for t in target_names:
            # Find all the item feature values for item in class t
            features = [[] for i in range(feature_numbers)]
            for item in item_buckets[t]:
                for i in dataset.inputs:
                    features[i].append(item[i])

            # Calculate means and deviations of  the class
            for i in range(17):
                means[t][i] = mean(features[i])
                deviations[t][i] = stdev(features[i])

        return means, deviations

In [282]:
# Splitting the data set into buckets based on input feature classes
a=split_values_by_classes(audit_risk_dataset)

# Finding mean and devuation for each input class
means,deviations=find_means_and_deviations(audit_risk_dataset,a)


# This was major challlenge i faced while predicting the results
# It is important that the zero values are replaced with a minute value to allow calculations
for key,values  in means.items():
    for i,value in zip( range(len(values)) , values ):
        if value==0 or value==0.0:

            means[key][i]=0.001
for key,values  in deviations.items():
    for i,value in zip( range(len(values)) , values ):
        if value==0 or value==0.0:
            deviations[key][i]=0.001
            

print('***************means:',means)
print('\n\n')
print('***************devations:',deviations)


***************means: defaultdict(<function find_means_and_deviations.<locals>.<lambda> at 0x00000252DCF30948>, {0: [32.20174825174825, 14.265734265734265, 0.30055944055944056, 2, 0.25454895104895103, 2, 0.5551083916083916, 5, 2, 0.2635454545454545, 2, 2, 0.001, 2, 0.001, 2, 2], 1: [13.171278350515465, 15.210309278350515, 3.731238350515464, 4.412371134020619, 17.124721649484535, 3.8103092783505152, 20.805238350515463, 5.108247422680412, 2.379381443298969, 22.426329896907216, 3.4556701030927837, 2.808247422680412, 0.04742268041237113, 2.0989690721649485, 0.1670103092783505, 2.268041237113402, 3.123298969072165]})



***************devations: defaultdict(<function find_means_and_deviations.<locals>.<lambda> at 0x00000252DD0C93A8>, {0: [25.1447752255497, 9.460871254676208, 0.3088047727650804, 0.001, 0.4944437146924773, 0.001, 0.5845905933421018, 0.001, 0.001, 0.6904314006324352, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001], 1: [20.853889048450608, 10.131016573127088, 6.867803367921, 1

In [283]:
attr_dists_cont=likelihood_ofEvidence(audit_risk_dataset,possible_target_class=[0, 1])

target_dist_cont=target_distribution(audit_risk_dataset,target_class_list=[0,1])
target_vals=[0, 1]

defaultdict(<class 'int'>, {0: 286, 1: 485})
number_of_observations 771
defaultdict(<class 'int'>, {0: 0.37094682230869, 1: 0.62905317769131})


In [284]:
def predict_continious(example):
    def class_probability(targetval):
        prob = target_dist_cont[targetval]
        for attr in audit_risk_dataset.inputs:
            prob *= gaussian(means[targetval][attr], deviations[targetval][attr], example[attr])
        return prob

    return argmax(target_vals, key=class_probability)


print(predict_continious([3.89,7,1.1,4,7.41,4,8.51,5,2,44.95,6,2,0,2,0,2,3.2]))

1


In [285]:
attribute_names= ['Sector_score','LOCATION_ID','PARA_A','SCORE_A','PARA_B','SCORE_B','TOTAL','numbers','Marks','Money_Value','MONEY_Marks','District','Loss','LOSS_SCORE','History','History_score','Score','Risk']



print(len(attribute_names))
audit_risk_test_dataset= DataSet(examples=None, attrs=range(18), attrnames=attribute_names, target=-1,
                 inputs=None, values=None, distance=mean_boolean_error,
                 name='audit_risk_data_test', source='', exclude=())


print(audit_risk_test_dataset)


print(audit_risk_test_dataset.values[audit_risk_test_dataset.target])

print(audit_risk_test_dataset.examples[0])
# print(audit_risk_test_dataset.target)

# print(audit_risk_test_dataset.values[17])
# print(len(audit_risk_test_dataset.values))
# print()



18
<DataSet(audit_risk_data_test): 75 examples, 18 attributes>
[0, 1]
[1.85, 6, 3.7, 6, 0, 2, 3.7, 5, 2, 0.12, 2, 2, 0, 2, 0, 2, 2.4, 1]


In [286]:
predicted_target_class=predict_continious([3.89,7,1.1,4,7.41,4,8.51,5,2,44.95,6,2,0,2,0,2,3.2])
print(predicted_target_class)

1


In [287]:
# data=parse_data_point(audit_risk_test_dataset.examples[0])
results=[]
for example in audit_risk_test_dataset.examples:
    expected_class=example[-1]    
    predicted_target_class=predict_continious(example[:-1])
    results.append([expected_class,predicted_target_class])
    # print('Expected:',expected_class)
print(results)

right_prediction_list = [result for result in results if result[0]==result[1]]
print('\n\n')
# print(right_prediction_list)

accuracy=len(right_prediction_list)/len(results)

print('Accuracy of the Model:',accuracy)
# Accuracy of the Model: 1.0

[[1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [0, 0], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [0, 0], [0, 0], [0, 0], [1, 1], [0, 0], [1, 1], [1, 1], [0, 0], [0, 0], [0, 0], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [1, 1], [1, 1], [1, 1], [0, 0], [1, 1], [1, 1], [1, 1], [0, 0], [1, 1], [1, 1], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]



Accuracy of the Model: 1.0


# This learner produces a 100% accuracy