# Assignment # 10  
Park Juyeon, Department of Statistics and Data Science, 2022311137

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from factor_analyzer import FactorAnalyzer
from numpy.linalg import inv

# Display pandas floats with five decimal places.
pd.options.display.float_format = '{:.5f}'.format
# Print NumPy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)
# NOTE(review): this silences every warning for the whole notebook, including
# deprecation warnings from pandas/numpy -- consider narrowing the filter.
warnings.filterwarnings(action='ignore')
# Use a white figure background for matplotlib plots.
plt.rcParams['figure.facecolor'] = 'white'

----

# Q1. 
### Write a python code to calculate the linear discriminant function for binary class. Your code should be able to predict the Y class based on the input value $X_0$

In [2]:
def LDF(x_zero, Xdata, ydata, p1, p2):
    """Classify observations with the two-class linear discriminant rule.

    The rule is fitted on (Xdata, ydata) using the pooled sample covariance
    (equal-covariance assumption, equal misclassification costs) and then
    applied to every row of ``x_zero``.

    Parameters
    ----------
    x_zero : pandas.DataFrame (or a Series holding one observation)
        Observations to classify. Columns must be in the same order as the
        columns of ``Xdata``. It may be the training data itself or new data.
    Xdata : pandas.DataFrame
        Training features, one row per observation.
    ydata : pandas.Series
        Training labels with exactly two distinct values.
    p1, p2 : float
        Prior probabilities of the first and second class, where "first" and
        "second" follow the order in which the labels first appear in
        ``ydata`` (``pd.unique(ydata)``).

    Returns
    -------
    pandas.DataFrame
        Indexed like ``x_zero`` with the columns 'From Type', 'Predicted Type',
        'LDF value', 'Posterior_<class 1>' and 'Posterior_<class 2>'.
        'From Type' holds the known label when ``x_zero`` shares the index of
        ``ydata`` (i.e. the training data are being re-classified) and NaN
        otherwise, because new observations have no known label.

    Raises
    ------
    ValueError
        If ``ydata`` does not contain exactly two distinct labels.
    """
    # A single observation passed as a Series is treated as a one-row frame.
    if isinstance(x_zero, pd.Series):
        x_zero = x_zero.to_frame().T

    # Dividing data into two groups (labels in order of first appearance)
    y1, y2 = pd.unique(ydata)
    data1 = Xdata[ydata == y1]
    data2 = Xdata[ydata == y2]

    # Group means and pooled covariance
    x1_bar = data1.mean().to_numpy()
    x2_bar = data2.mean().to_numpy()
    n1 = len(data1)
    n2 = len(data2)
    S_pooled = ((n1 - 1) * data1.cov() + (n2 - 1) * data2.cov()) / (n1 + n2 - 2)
    S_inv = inv(S_pooled.to_numpy())

    # Linear Discriminant Function:
    #   (x1_bar - x2_bar)' S^-1 x0 - 1/2 (x1_bar - x2_bar)' S^-1 (x1_bar + x2_bar)
    # shifted by log(p2 / p1) so that the decision threshold is 0.
    coef = (x1_bar - x2_bar) @ S_inv
    midpoint = 0.5 * coef @ (x1_bar + x2_bar)
    x0_values = x_zero.to_numpy(dtype=float)
    criteria = pd.Series(x0_values @ coef - midpoint - np.log(p2 / p1),
                         index=x_zero.index)

    # Prediction
    predicted_y = np.where(criteria >= 0, y1, y2)

    # Posterior probability.
    # With a common covariance matrix, log(p1*f1 / (p2*f2)) equals `criteria`,
    # so the posterior of class 1 is the logistic function of `criteria`.
    # logaddexp keeps this finite for points far from both group means, where
    # evaluating exp(-d^2 / 2) for each class directly underflows to 0 / 0.
    posterior_1 = np.exp(-np.logaddexp(0.0, -criteria.to_numpy()))
    posterior_2 = np.exp(-np.logaddexp(0.0, criteria.to_numpy()))

    # Summary
    result = pd.DataFrame(index=x_zero.index)
    y_index = getattr(ydata, 'index', None)
    same_rows = len(ydata) == len(x_zero) and (
        y_index is None or x_zero.index.equals(y_index))
    # Known labels are only meaningful when the training rows are re-classified.
    result['From Type'] = np.asarray(ydata) if same_rows else np.nan
    result['Predicted Type'] = predicted_y
    result['LDF value'] = criteria
    result[f'Posterior_{y1}'] = np.round(posterior_1, 5)
    result[f'Posterior_{y2}'] = np.round(posterior_2, 5)
    return result

# Q2.
### Write a python code to perform the ‘leave-one-out’ method to calculate the accuracy of the LDA model you wrote in #1.

In [3]:
def LOO(x_zero, Xdata, ydata, p1, p2):
    """Leave-one-out accuracy of the two-class linear discriminant rule.

    For every observation i the rule is re-fitted on the remaining n-1 rows of
    (Xdata, ydata) and used to classify row i of ``x_zero``.

    Parameters
    ----------
    x_zero : pandas.DataFrame
        Observations to classify; row i must correspond to row i of ``Xdata``
        (normally the same frame), with columns in the same order.
    Xdata : pandas.DataFrame
        Training features, one row per observation.
    ydata : pandas.Series
        Training labels with exactly two distinct values.
    p1, p2 : float
        Prior probabilities of the first and second class, in the order in
        which the labels first appear in the full ``ydata``.

    Returns
    -------
    pandas.DataFrame
        3x3 table of counts (actual class in rows, classified class in
        columns, with 'Total' margins), labelled with the two class names.
        The accuracy rate is printed as a side effect.
    """
    # Fix the class labels -- and therefore which prior belongs to which
    # class -- once from the full data. Re-deriving them from each reduced
    # sample could swap the order when the held-out row is the first
    # occurrence of its class.
    y1, y2 = pd.unique(ydata)
    actual = pd.Series(np.asarray(ydata))
    log_prior_ratio = np.log(p2 / p1)
    n_obs = len(Xdata)
    positions = np.arange(n_obs)

    # Leave-one-out loop (positional, so any index labels are supported)
    predicted_y = []
    for i in range(n_obs):
        keep = positions != i
        X_temp = Xdata.iloc[keep]
        y_temp = actual.to_numpy()[keep]

        data1 = X_temp[y_temp == y1]
        data2 = X_temp[y_temp == y2]

        x1_bar = data1.mean().to_numpy()
        x2_bar = data2.mean().to_numpy()
        n1 = len(data1)
        n2 = len(data2)
        S_pooled = ((n1 - 1) * data1.cov() + (n2 - 1) * data2.cov()) / (n1 + n2 - 2)
        S_inv = inv(S_pooled.to_numpy())

        # Linear Discriminant Function for the held-out ith row
        coef = (x1_bar - x2_bar) @ S_inv
        held_out = x_zero.iloc[i].to_numpy(dtype=float)
        criteria = coef @ held_out - 0.5 * coef @ (x1_bar + x2_bar) - log_prior_ratio

        # Prediction
        predicted_y.append(y1 if criteria >= 0 else y2)

    # Confusion counts: n1 = y1 -> y1, n2 = y1 -> y2, n3 = y2 -> y1, n4 = y2 -> y2
    predicted = pd.Series(predicted_y)
    is_y1 = actual == y1
    n1 = int((is_y1 & (predicted == y1)).sum())
    n2 = int((is_y1 & (predicted != y1)).sum())
    n3 = int((~is_y1 & (predicted != y2)).sum())
    n4 = int((~is_y1 & (predicted == y2)).sum())

    # Table, labelled with the actual class names found in the data
    labels = [str(y1), str(y2), 'Total']
    table_accuracy = pd.DataFrame([[n1, n2, n1+n2], [n3, n4, n3+n4], [n1+n3, n2+n4, n1+n2+n3+n4]],
                                  columns = labels,
                                  index = labels, dtype = int)
    table_accuracy.index.names = ['From Type']
    table_accuracy.columns.names = ['Classified Type']
    Acc_rate = np.round((n1+n4)/(n1+n2+n3+n4), 3)

    print(f'Accuracy rate of this data is {Acc_rate}.')
    return table_accuracy

# Q3.
### Consider the Turkey (male) data example in class. Suppose that the prior probability for domestic turkey is 0.60 and that of wild turkey is 0.4. Assume that the covariance matrices of wild and domestic turkeys are the same. Also assume the equal misclassification cost. You must use your own python code in #1 & #2 above for this question.

In [4]:
# Read data (whitespace-delimited text file).
# sep=r'\s+' is the documented replacement for the deprecated
# delim_whitespace=True option.
turkey = pd.read_csv('turkey.dat', sep=r'\s+')
# Use only male data
turkey2 = turkey.loc[turkey['SEX'] == 'MALE',
                     ['ID', 'TYPE', 'HUM', 'RAD', 'ULN', 'FEMUR', 'TIN', 'CAR', 'D3P', 'COR', 'SCA']]
# Entries equal to '.' are treated as missing; drop incomplete rows and
# renumber the remaining rows 0..n-1.
turkey2 = turkey2.replace('.', np.nan).dropna().reset_index(drop = True)
print(turkey2.shape)
turkey2.head()

(33, 11)


Unnamed: 0,ID,TYPE,HUM,RAD,ULN,FEMUR,TIN,CAR,D3P,COR,SCA
0,B710,WILD,153,140,147,142,151,817,305,102,128
1,B790,WILD,156,137,151,146,155,814,305,111,137
2,B819,WILD,158,135,151,146,152,790,289,111,125
3,B085,WILD,148,129,146,139,147,767,287,106,123
4,B089,WILD,157,140,154,140,159,818,301,116,136


In [5]:
# Feature matrix: every measurement column (everything except ID and TYPE) as float.
X = turkey2.drop(columns = ['ID', 'TYPE']).astype(float)
# Class labels.
y = turkey2.loc[:, 'TYPE']

## a.	Which turkeys in this data set were misclassified by the discriminant rule when the rule was applied to the training data?

In [6]:
# Fit the rule and re-classify the training data; p1 belongs to the class
# that appears first in y and p2 to the other one.
Turkey_LDF = LDF(x_zero = X, Xdata = X, ydata = y, p1 = 0.4, p2 = 0.6)
# Rows whose predicted class differs from the recorded class.
turkey2.loc[Turkey_LDF['From Type'].ne(Turkey_LDF['Predicted Type'])]

Unnamed: 0,ID,TYPE,HUM,RAD,ULN,FEMUR,TIN,CAR,D3P,COR,SCA
0,B710,WILD,153,140,147,142,151,817,305,102,128
24,L750,DOMESTIC,149,130,147,140,147,770,300,104,126


The turkeys with IDs B710 and L750 are misclassified when the discriminant rule is applied to the training data.

## b.	What are the posterior probabilities for both domestic and wild classifications for those turkeys that were misclassified in (a)?

In [7]:
# Select the misclassified turkeys from the result itself instead of
# hard-coding their row labels, so the cell stays correct if the data change.
Turkey_LDF[Turkey_LDF['From Type'] != Turkey_LDF['Predicted Type']]

Unnamed: 0,From Type,Predicted Type,LDF value,Posterior_WILD,Posterior_DOMESTIC
0,WILD,DOMESTIC,-0.67201,0.33805,0.66195
24,DOMESTIC,WILD,1.02263,0.73548,0.26452


## c.	Determine the value of each of the linear discriminant function for turkeys whose IDs are B710 and L674. How do you classify these two turkeys?

In [8]:
# Turkey_LDF and turkey2 share the same row index, so a boolean mask built
# from the ID column selects the matching result rows directly.
Turkey_LDF.loc[turkey2['ID'].isin(['B710', 'L674'])]

Unnamed: 0,From Type,Predicted Type,LDF value,Posterior_WILD,Posterior_DOMESTIC
0,WILD,DOMESTIC,-0.67201,0.33805,0.66195
14,DOMESTIC,DOMESTIC,-9.26488,9e-05,0.99991


The turkey with ID B710 is classified as DOMESTIC because its LDF value is less than 0, but it is actually a WILD turkey, so it is misclassified.  
The turkey with ID L674 is also classified as DOMESTIC because its LDF value is less than 0; it really is a DOMESTIC turkey, so it is classified correctly.

## d.	Calculate the ‘leave-one-out’ accuracy of the LDA model. 

In [9]:
# Leave-one-out evaluation with the same priors as in (a).
LOO(x_zero = X, Xdata = X, ydata = y, p1 = 0.4, p2 = 0.6)

Accuracy rate of this data is 0.848.


Classified Type,WILD,DOMESTIC,Total
From Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
WILD,11,3,14
DOMESTIC,2,17,19
Total,13,20,33
