In [1]:
import pandas as pd
import numpy as np
import QDA_code

In [2]:
bank_full = pd.read_csv("bank-full.csv")
bank_full = bank_full[['age', 'balance', 'duration', 'y']]
bank_full.to_csv("bank-partial.csv", index = False)
bank = pd.read_csv("bank-partial.csv")
bank.head(5)

Unnamed: 0,age,balance,duration,y
0,58,2143,261,no
1,44,29,151,no
2,33,2,76,no
3,47,1506,92,no
4,33,1,198,no


### (a) Gaussian class conditional likelihoods with unequal varaince-covariance matrix: QDA

- Equal class priors and equal costs for misclassification 

In [3]:
model_qda = QDA_code.QDA('bank-partial.csv')

observation = [58, 261, 261]
model_qda.compute_likelihoods(observation)

given_priors = {
    "no": 1 / 2,
    "yes": 1 / 2,
}

model_qda.compute_probabilities(observation, given_priors)
model_qda.decision()

Likelihoods:  [2.10302060e-09 1.09725319e-09]

QDA Predicted Class: no

QDA Class Likelihoods:
no: 2.1030206042712077e-09
yes: 1.0972531944110306e-09

QDA probabilities: 
{'no': 1.0515103021356039e-09, 'yes': 5.486265972055153e-10}

Decision: no



- The prior for not selecting the new bank service is 0.9 and misclassification costs are equal

In [4]:
model_qda2 = QDA_code.QDA('bank-partial.csv')

observation = [58, 261, 261]
model_qda2.compute_likelihoods(observation)

given_priors = {
    "no": 0.9,
    "yes": 0.1,
}

model_qda2.compute_probabilities(observation, given_priors)
model_qda2.decision()

Likelihoods:  [2.10302060e-09 1.09725319e-09]

QDA Predicted Class: no

QDA Class Likelihoods:
no: 2.1030206042712077e-09
yes: 1.0972531944110306e-09

QDA probabilities: 
{'no': 1.892718543844087e-09, 'yes': 1.0972531944110307e-10}

Decision: no



- The prior for not selecting the new bank service is 0.9 and the cost of classifying a customer as not a new service candidate when they are is 15 times the cost of classifying a customer as a new service customer

In [5]:
model_qda3 = QDA_code.QDA('bank-partial.csv')

observation = [58, 261, 261]
model_qda3.compute_likelihoods(observation)

given_priors = {
    "no": 0.9,
    "yes": 0.1,
}

cost = {'TP': 0, 'FP': 1, 'FN': 15, 'TN': 0}

model_qda3.compute_probabilities(observation, given_priors)
model_qda3.cost_decision(cost)

Likelihoods:  [2.10302060e-09 1.09725319e-09]

QDA Predicted Class: no

QDA Class Likelihoods:
no: 2.1030206042712077e-09
yes: 1.0972531944110306e-09

QDA probabilities: 
{'no': 1.892718543844087e-09, 'yes': 1.0972531944110307e-10}

QDA Costs: 
No:  1.892718543844087e-09
Yes:  1.645879791616546e-09

Decision: yes



### (a) Gaussian class conditional likelihoods with equal varaince-covariance matrix: LDA

- Equal class priors and equal costs for misclassification 

In [6]:
import LDA_code

model_lda = LDA_code.LDA('bank-partial.csv')

observation = [58, 261, 261]
model_lda.compute_likelihoods(observation)

given_priors = {
    "no": 1 / 2,
    "yes": 1 / 2,
}

model_lda.compute_probabilities(observation, given_priors)
model_lda.decision()

LDA Predicted Class: no
LDA Class Likelihoods:
no: 1.956466109661075e-09
yes: 1.0728059317829899e-09
LDA probabilities: 
{'no': 9.782330548305375e-10, 'yes': 5.364029658914949e-10}

Decision: no



- The prior for not selecting the new bank service is 0.9 and misclassification costs are equal

In [7]:
model_lda = LDA_code.LDA('bank-partial.csv')

observation = [58, 261, 261]
model_lda.compute_likelihoods(observation)

given_priors = {
    "no": 0.9,
    "yes": 0.1,
}

model_lda.compute_probabilities(observation, given_priors)
model_lda.decision()

LDA Predicted Class: no
LDA Class Likelihoods:
no: 1.956466109661075e-09
yes: 1.0728059317829899e-09
LDA probabilities: 
{'no': 1.7608194986949675e-09, 'yes': 1.0728059317829899e-10}

Decision: no



- The prior for not selecting the new bank service is 0.9 and the cost of classifying a customer as not a new service candidate when they are is 15 times the cost of classifying a customer as a new service customer

In [8]:
model_lda = LDA_code.LDA('bank-partial.csv')

observation = [58, 261, 261]
model_lda.compute_likelihoods(observation)

given_priors = {
    "no": 0.9,
    "yes": 0.1,
}

cost = {'TP': 0, 'FP': 1, 'FN': 15, 'TN': 0}

model_lda.compute_probabilities(observation, given_priors)
model_lda.cost_decision()

LDA Predicted Class: no
LDA Class Likelihoods:
no: 1.956466109661075e-09
yes: 1.0728059317829899e-09
LDA probabilities: 
{'no': 1.7608194986949675e-09, 'yes': 1.0728059317829899e-10}

LDA Costs: 
No:  1.7608194986949675e-09
Yes:  1.0728059317829899e-10

Decision: yes



## Q2

In [9]:
mushroom_columns = list(pd.read_csv("MushroomVariables.txt", sep=",").columns)
mushroom = pd.read_csv("MushroomData.csv").dropna()
mushroom.columns = mushroom_columns
mushroom.head(3)

Unnamed: 0,edible_class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,WHITE,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS
1,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,PINK,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,PURPLE,SEVERAL,WOODS
2,EDIBLE,CONVEX,SMOOTH,WHITE,BRUISES,ALMOND,FREE,CROWDED,NARROW,PINK,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,BROWN,SEVERAL,WOODS


In [10]:
X = mushroom.drop(["edible_class"], axis = 1)
y = mushroom[['edible_class']]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 1/3, random_state=42)

In [11]:
train = X_train.copy()
train['edible_class'] = y_train
test = X_test.copy()
test['edible_class'] = y_test
test.head(3)

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,edible_class
1971,FLAT,FIBROUS,WHITE,NO,NONE,FREE,CROWDED,BROAD,BROWN,TAPERING,...,WHITE,WHITE,PARTIAL,WHITE,ONE,EVANESCENT,BLACK,SCATTERED,GRASSES,EDIBLE
4900,FLAT,FIBROUS,GRAY,NO,FOUL,FREE,CLOSE,BROAD,CHOCOLATE,ENLARGING,...,BROWN,BROWN,PARTIAL,WHITE,ONE,LARGE,CHOCOLATE,SOLITARY,PATHS,POISONOUS
2273,CONVEX,FIBROUS,GRAY,BRUISES,NONE,FREE,CLOSE,BROAD,PURPLE,TAPERING,...,GRAY,WHITE,PARTIAL,WHITE,ONE,PENDANT,BROWN,SOLITARY,WOODS,EDIBLE


In [12]:
word_edible = pd.DataFrame(columns = ['word', 'count|edible'])

for train_col in X_train.columns:
    a = train[train['edible_class'] == 'EDIBLE'][[train_col]].value_counts().rename_axis(train_col).reset_index(name='counts')
    a.columns = ['word', 'count|edible']
    word_edible = pd.concat([word_edible, a], axis = 0)

word_edible = word_edible.groupby(['word'], as_index=False)['count|edible'].sum()
word_edible.index = np.arange(0, len(word_edible))
word_edible.head(5)

Unnamed: 0,word,count|edible
0,?,470
1,ABUNDANT,342
2,ALMOND,244
3,ANISE,264
4,ATTACHED,123


In [13]:
word_poisonous = pd.DataFrame(columns = ['word', 'count|poisonous'])
for train_col in X_train.columns:
    a = train[train['edible_class'] == 'POISONOUS'][[train_col]].value_counts().rename_axis(train_col).reset_index(name='counts')
    a.columns = ['word', 'count|poisonous']
    word_poisonous = pd.concat([word_poisonous, a], axis = 0)
word_poisonous = word_poisonous.groupby(['word'], as_index=False)['count|poisonous'].sum()    
word_poisonous.index = np.arange(0, len(word_poisonous))
word_poisonous.head(5)

Unnamed: 0,word,count|poisonous
0,?,1185
1,ATTACHED,19
2,BELL,28
3,BLACK,189
4,BROAD,1137


In [14]:
word_df = pd.merge(word_edible, word_poisonous, how = 'outer').fillna(0)
word_df

Unnamed: 0,word,count|edible,count|poisonous
0,?,470.0,1185.0
1,ABUNDANT,342.0,0.0
2,ALMOND,244.0,0.0
3,ANISE,264.0,0.0
4,ATTACHED,123.0,19.0
...,...,...,...
63,GROOVES,0.0,3.0
64,LARGE,0.0,874.0
65,MUSTY,0.0,36.0
66,PUNGENT,0.0,176.0


In [15]:
P_edible = y_train.value_counts()[0] / len(X_train) # P(edible)
P_poisonous = y_train.value_counts()[1] / len(X_train) # P(poisonous)
print(P_edible, P_poisonous)

0.5306595365418895 0.46934046345811054


In [16]:
edible_words = word_df[['count|edible']].sum()[0]
poisonous_words = word_df[['count|edible']].sum()[0]
total_words = edible_words + poisonous_words
total_words

130988.0

In [45]:
edible_likelihoods = []
for i in range(len(word_df)):
    numerator = word_df.loc[i,"count|edible"] + 1.5
    denominator = total_words + 1
    result = numerator / denominator
    edible_likelihoods.append(result)
edible_likelihoods[:3]

[0.0035995388925787663, 0.0026223576025467786, 0.0018742031773660384]

In [46]:
poisonous_likelihoods = []
for i in range(len(word_df)):
    numerator = word_df.loc[i,"count|poisonous"] + 1.5
    denominator = total_words + 1
    result = numerator / denominator
    poisonous_likelihoods.append(result)
poisonous_likelihoods[:3]

[0.009058012504866821, 1.1451343242562353e-05, 1.1451343242562353e-05]

In [40]:
edible_posterior = np.array(edible_likelihoods) * P_edible
poisonous_posterior = np.array(poisonous_likelihoods) * P_poisonous

In [41]:
likelihood_df = pd.DataFrame({'words': word_df['word'].values, 
                              'edible_likelihood': edible_likelihoods, 'poisonous_likelihood': poisonous_likelihoods}).set_index(['words'])
likelihood_df

Unnamed: 0_level_0,edible_likelihood,poisonous_likelihood
words,Unnamed: 1_level_1,Unnamed: 2_level_1
?,0.003596,0.009054
ABUNDANT,0.002619,0.000008
ALMOND,0.001870,0.000008
ANISE,0.002023,0.000008
ATTACHED,0.000947,0.000153
...,...,...
GROOVES,0.000008,0.000031
LARGE,0.000008,0.006680
MUSTY,0.000008,0.000282
PUNGENT,0.000008,0.001351


In [44]:
edible_posterior = 1
edible_results = []
for i in range(0,len(X_test)):
    for naive_word in X_test.iloc[i].values:
        word_likelihood = likelihood_df.loc[naive_word, 'edible_likelihood']
        edible_posterior = edible_posterior * word_likelihood
    
    edible_posterior = edible_posterior * P_edible
    edible_results.append(edible_posterior)
edible_results

[1.0035279876744077e-39,
 7.188319368338135e-92,
 4.928214884384717e-131,
 5.707554923153211e-172,
 3.3466429889450113e-220,
 7.106703054541854e-270,
 4.4501341534e-313,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0

In [37]:
poisonous_posterior = 1
poisonous_results = []
for i in range(0,len(X_test)):
    for naive_word in X_test.iloc[i].values:
        word_likelihood = likelihood_df.loc[naive_word, 'poisonous_likelihood']
        poisonous_posterior *= word_likelihood
    poisonous_posterior *= P_poisonous
    poisonous_results.append(poisonous_posterior)
poisonous_results

  poisonous_posterior *= word_likelihood


[162699603992873.25,
 1.139799419936939e+28,
 2.374409574777128e+42,
 6.998784884272445e+56,
 1.4843513231017438e+70,
 3.8974549289112324e+83,
 2.9623787701787434e+97,
 1.6945795576555007e+111,
 3.491091943082844e+126,
 2.1947521400333924e+140,
 1.2860940521300326e+155,
 2.824541864513109e+168,
 2.342387144522317e+182,
 4.0904403780372775e+196,
 6.58226210765503e+209,
 5.410289774999722e+223,
 6.056163494627076e+236,
 3.341136314037183e+250,
 5.152483976161479e+263,
 3.697440482208385e+277,
 3.0397028925800684e+290,
 1.2813672198063967e+304,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 i

In [28]:
prediction = []
for i in range(len(edible_results)):
    if edible_results[i] >= poisonous_results[i]:
        prediction.append("EDIBLE")
    else:
        prediction.append("POISONOUS")

In [29]:
y_test

Unnamed: 0,edible_class
1971,EDIBLE
4900,POISONOUS
2273,EDIBLE
763,EDIBLE
7663,POISONOUS
...,...
5228,POISONOUS
8243,EDIBLE
1611,EDIBLE
4595,POISONOUS


In [30]:
y_test['prediction'] = prediction
y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test['prediction'] = prediction


Unnamed: 0,edible_class,prediction
1971,EDIBLE,EDIBLE
4900,POISONOUS,POISONOUS
2273,EDIBLE,EDIBLE
763,EDIBLE,EDIBLE
7663,POISONOUS,EDIBLE
...,...,...
5228,POISONOUS,EDIBLE
8243,EDIBLE,EDIBLE
1611,EDIBLE,EDIBLE
4595,POISONOUS,EDIBLE
