In [1]:
import pandas as pd
import numpy as np
import QDA_code

In [2]:
bank_full = pd.read_csv("bank-full.csv")
bank_full = bank_full[['age', 'balance', 'duration', 'y']]
bank_full.to_csv("bank-partial.csv", index = False)
bank = pd.read_csv("bank-partial.csv")
bank.head(5)

Unnamed: 0,age,balance,duration,y
0,58,2143,261,no
1,44,29,151,no
2,33,2,76,no
3,47,1506,92,no
4,33,1,198,no


### (a) Gaussian class conditional likelihoods with unequal varaince-covariance matrix: QDA

- Equal class priors and equal costs for misclassification 

In [3]:
model_qda = QDA_code.QDA('bank-partial.csv')

observation = [58, 261, 261]
model_qda.compute_likelihoods(observation)

given_priors = {
    "no": 1 / 2,
    "yes": 1 / 2,
}

model_qda.compute_probabilities(observation, given_priors)
model_qda.decision()

Likelihoods:  [2.10302060e-09 1.09725319e-09]

QDA Predicted Class: no

QDA Class Likelihoods:
no: 2.1030206042712077e-09
yes: 1.0972531944110306e-09

QDA probabilities: 
{'no': 1.0515103021356039e-09, 'yes': 5.486265972055153e-10}

Decision: no



- The prior for not selecting the new bank service is 0.9 and misclassification costs are equal

In [4]:
model_qda2 = QDA_code.QDA('bank-partial.csv')

observation = [58, 261, 261]
model_qda2.compute_likelihoods(observation)

given_priors = {
    "no": 0.9,
    "yes": 0.1,
}

model_qda2.compute_probabilities(observation, given_priors)
model_qda2.decision()

Likelihoods:  [2.10302060e-09 1.09725319e-09]

QDA Predicted Class: no

QDA Class Likelihoods:
no: 2.1030206042712077e-09
yes: 1.0972531944110306e-09

QDA probabilities: 
{'no': 1.892718543844087e-09, 'yes': 1.0972531944110307e-10}

Decision: no



- The prior for not selecting the new bank service is 0.9 and the cost of classifying a customer as not a new service candidate when they are is 15 times the cost of classifying a customer as a new service customer

In [5]:
model_qda3 = QDA_code.QDA('bank-partial.csv')

observation = [58, 261, 261]
model_qda3.compute_likelihoods(observation)

given_priors = {
    "no": 0.9,
    "yes": 0.1,
}

cost = {'FP': 1, 'FN': 15}

model_qda3.compute_probabilities(observation, given_priors)
model_qda3.cost_decision(cost)

Likelihoods:  [2.10302060e-09 1.09725319e-09]

QDA Predicted Class: no

QDA Class Likelihoods:
no: 2.1030206042712077e-09
yes: 1.0972531944110306e-09

QDA probabilities: 
{'no': 1.892718543844087e-09, 'yes': 1.0972531944110307e-10}

QDA Costs: 
No:  1.892718543844087e-09
Yes:  1.645879791616546e-09

Decision: no



### (a) Gaussian class conditional likelihoods with equal varaince-covariance matrix: LDA

- Equal class priors and equal costs for misclassification 

In [6]:
import LDA_code

model_lda = LDA_code.LDA('bank-partial.csv')

observation = [58, 261, 261]
model_lda.compute_likelihoods(observation)

given_priors = {
    "no": 1 / 2,
    "yes": 1 / 2,
}

model_lda.compute_probabilities(observation, given_priors)
model_lda.decision()

LDA Predicted Class: no
LDA Class Likelihoods:
no: 1.956466109661075e-09
yes: 1.0728059317829899e-09
LDA probabilities: 
{'no': 9.782330548305375e-10, 'yes': 5.364029658914949e-10}

Decision: no



- The prior for not selecting the new bank service is 0.9 and misclassification costs are equal

In [7]:
model_lda = LDA_code.LDA('bank-partial.csv')

observation = [58, 261, 261]
model_lda.compute_likelihoods(observation)

given_priors = {
    "no": 0.9,
    "yes": 0.1,
}

model_lda.compute_probabilities(observation, given_priors)
model_lda.decision()

LDA Predicted Class: no
LDA Class Likelihoods:
no: 1.956466109661075e-09
yes: 1.0728059317829899e-09
LDA probabilities: 
{'no': 1.7608194986949675e-09, 'yes': 1.0728059317829899e-10}

Decision: no



- The prior for not selecting the new bank service is 0.9 and the cost of classifying a customer as not a new service candidate when they are is 15 times the cost of classifying a customer as a new service customer

In [8]:
model_lda = LDA_code.LDA('bank-partial.csv')

observation = [58, 261, 261]
model_lda.compute_likelihoods(observation)

given_priors = {
    "no": 0.9,
    "yes": 0.1,
}

cost = {'FP': 1, 'FN': 15}

model_lda.compute_probabilities(observation, given_priors)
model_lda.cost_decision(cost)

LDA Predicted Class: no
LDA Class Likelihoods:
no: 1.956466109661075e-09
yes: 1.0728059317829899e-09
LDA probabilities: 
{'no': 1.7608194986949675e-09, 'yes': 1.0728059317829899e-10}

LDA Costs: 
No:  1.7608194986949675e-09
Yes:  1.6092088976744848e-09

Decision: no



## Q2

In [9]:
mushroom_columns = list(pd.read_csv("MushroomVariables.txt", sep=",").columns)
mushroom = pd.read_csv("MushroomData.csv").dropna()
mushroom.columns = mushroom_columns
mushroom.head(3)

Unnamed: 0,edible_class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,WHITE,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS
1,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,PINK,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS
2,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,PINK,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS


In [10]:
X = mushroom.drop(["edible_class"], axis = 1)
y = mushroom[['edible_class']]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 1/3, random_state=42)

In [11]:
train = X_train.copy()
train['edible_class'] = y_train
test = X_test.copy()
test['edible_class'] = y_test
test.head(3)

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,edible_class
1971,FLAT,FIBROUS,WHITE,NO,NONE,FREE,CROWDED,BROAD,BROWN,TAPERING,...,WHITE,WHITE,PARTIAL,WHITE,ONE,EVANESCENT,BLACK,SCATTERED,GRASSES,EDIBLE
4900,FLAT,FIBROUS,GRAY,NO,FOUL,FREE,CLOSE,BROAD,CHOCOLATE,ENLARGING,...,BROWN,BROWN,PARTIAL,WHITE,ONE,LARGE,CHOCOLATE,SOLITARY,PATHS,POISONOUS
2273,CONVEX,FIBROUS,GRAY,BRUISES,NONE,FREE,CLOSE,BROAD,PURPLE,TAPERING,...,GRAY,WHITE,PARTIAL,WHITE,ONE,PENDANT,BROWN,SOLITARY,WOODS,EDIBLE


In [12]:
word_edible = pd.DataFrame(columns = ['word', 'count|edible'])

for train_col in X_train.columns:
    a = train[train['edible_class'] == 'EDIBLE'][[train_col]].value_counts().rename_axis(train_col).reset_index(name='counts')
    a['column'] = train_col
    a.columns = ['word', 'count|edible', 'column']
    word_edible = pd.concat([word_edible, a], axis = 0)

word_edible.set_index(['column'])

Unnamed: 0_level_0,word,count|edible
column,Unnamed: 1_level_1,Unnamed: 2_level_1
cap-shape,CONVEX,1338
cap-shape,FLAT,1192
cap-shape,BELL,262
cap-shape,KNOBBED,163
cap-shape,SUNKEN,22
...,...,...
habitat,LEAVES,170
habitat,MEADOWS,155
habitat,WASTE,128
habitat,PATHS,91


In [13]:
word_poisonous = pd.DataFrame(columns = ['word', 'count|poisonous'])

for train_col in X_train.columns:
    a = train[train['edible_class'] == 'POISONOUS'][[train_col]].value_counts().rename_axis(train_col).reset_index(name='counts')
    a['column'] = train_col
    a.columns = ['word', 'count|poisonous', 'column']
    word_poisonous = pd.concat([word_poisonous, a], axis = 0)

word_poisonous.set_index(['column'])

Unnamed: 0_level_0,word,count|poisonous
column,Unnamed: 1_level_1,Unnamed: 2_level_1
cap-shape,CONVEX,1154
cap-shape,FLAT,1047
cap-shape,KNOBBED,400
cap-shape,BELL,28
cap-shape,CONICAL,4
...,...,...
habitat,PATHS,671
habitat,GRASSES,504
habitat,LEAVES,398
habitat,URBAN,173


In [14]:
word_df = pd.merge(word_edible, word_poisonous, how = 'outer').fillna(0)
word_df = word_df[['word', 'column', 'count|edible', 'count|poisonous']]
word_df

Unnamed: 0,word,column,count|edible,count|poisonous
0,CONVEX,cap-shape,1338,1154
1,FLAT,cap-shape,1192,1047
2,BELL,cap-shape,262,28
3,KNOBBED,cap-shape,163,400
4,SUNKEN,cap-shape,22,0
...,...,...,...,...
112,YELLOW,veil-color,0,6
113,NONE,ring-number,0,36
114,LARGE,ring-type,0,874
115,NONE,ring-type,0,36


In [15]:
P_edible = y_train.value_counts()[0] / len(X_train) # P(edible)
P_poisonous = y_train.value_counts()[1] / len(X_train) # P(poisonous)
print(P_edible, P_poisonous)

0.5306595365418895 0.46934046345811054


In [17]:
alpha = 1

In [18]:
edible_likelihoods = []
edible_len = len(train[train['edible_class'] == 'EDIBLE'])
for i in range(len(word_df)):
    numerator = word_df.loc[i,"count|edible"] + alpha
    denominator = edible_len
    result = numerator / denominator
    edible_likelihoods.append(result)
edible_likelihoods[:3]

[0.4497816593886463, 0.4007389989922741, 0.0883439704400403]

In [19]:
poisonous_likelihoods = []
poisonous_len = len(train[train['edible_class'] == 'POISONOUS'])
for i in range(len(word_df)):
    numerator = word_df.loc[i,"count|poisonous"] + alpha
    denominator = poisonous_len
    result = numerator / denominator
    poisonous_likelihoods.append(result)
poisonous_likelihoods[:3]

[0.43866312191416634, 0.3980250664641094, 0.011014052411697683]

In [20]:
edible_posterior = np.array(edible_likelihoods) * P_edible
poisonous_posterior = np.array(poisonous_likelihoods) * P_poisonous

In [21]:
likelihood_df = pd.DataFrame({'words': word_df['word'].values, 
                              'edible_likelihood': edible_likelihoods, 'poisonous_likelihood': poisonous_likelihoods,
                             'column': word_df['column'].values }).set_index(['words'])
likelihood_df

Unnamed: 0_level_0,edible_likelihood,poisonous_likelihood,column
words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CONVEX,0.449782,0.438663,cap-shape
FLAT,0.400739,0.398025,cap-shape
BELL,0.088344,0.011014,cap-shape
KNOBBED,0.055089,0.152298,cap-shape
SUNKEN,0.007726,0.000380,cap-shape
...,...,...,...
YELLOW,0.000336,0.002659,veil-color
NONE,0.000336,0.014052,ring-number
LARGE,0.000336,0.332321,ring-type
NONE,0.000336,0.014052,ring-type


In [23]:
edible_results = []
all_columns = X_test.columns
for i in range(0,len(X_test)):
    edible_posterior = 1
    for j in range(len(X_test.iloc[i])):
        naive_word = X_test.iloc[i].values[j]
        target_column = all_columns[j]
        word_likelihood = likelihood_df[likelihood_df['column'] == target_column].loc[naive_word, 'edible_likelihood']
        edible_posterior = edible_posterior * word_likelihood
        
    edible_posterior = edible_posterior * P_edible
    edible_results.append(edible_posterior)
edible_results[:5]

[1.6046741561497368e-08,
 2.9148712493640794e-23,
 3.4014089957127444e-07,
 7.897248022415693e-10,
 1.5807924720326101e-18]

In [24]:
poisonous_results = []
all_columns = X_test.columns
for i in range(0,len(X_test)):
    poisonous_posterior = 1
    for j in range(len(X_test.iloc[i])):
        naive_word = X_test.iloc[i].values[j]
        target_column = all_columns[j]
        word_likelihood = likelihood_df[likelihood_df['column'] == target_column].loc[naive_word, 'poisonous_likelihood']
        poisonous_posterior *= word_likelihood
    poisonous_posterior *= P_poisonous
    poisonous_results.append(poisonous_posterior)
poisonous_results[:5]

[1.0293260890554539e-15,
 9.576710303313879e-10,
 6.637472118072848e-16,
 1.146357541800574e-17,
 4.561890257031186e-08]

In [25]:
prediction = []
for i in range(len(edible_results)):
    if edible_results[i] >= poisonous_results[i]:
        prediction.append("EDIBLE")
    else:
        prediction.append("POISONOUS")
prediction[:10]

['EDIBLE',
 'POISONOUS',
 'EDIBLE',
 'EDIBLE',
 'POISONOUS',
 'POISONOUS',
 'POISONOUS',
 'POISONOUS',
 'EDIBLE',
 'EDIBLE']

In [26]:
y_test['prediction'] = prediction
y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test['prediction'] = prediction


Unnamed: 0,edible_class,prediction
1971,EDIBLE,EDIBLE
4900,POISONOUS,POISONOUS
2273,EDIBLE,EDIBLE
763,EDIBLE,EDIBLE
7663,POISONOUS,POISONOUS
...,...,...
5228,POISONOUS,POISONOUS
8243,EDIBLE,EDIBLE
1611,EDIBLE,EDIBLE
4595,POISONOUS,POISONOUS


### 95% Accuracy achieved

In [27]:
errors = (y_test['edible_class'] != y_test['prediction']).sum() 
(len(y_test) - errors) / len(y_test)

0.9536541889483066

In [28]:
# alpha = 100: 89%
# alpha = 10: 93%
# alpha = 1: 95%