In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame
import math

## Function Definitions

In [2]:
def readExcelSheet1(excelfile):
    """Read ``excelfile`` with pandas' default ``read_excel`` settings.

    Parameters
    ----------
    excelfile : path or file-like accepted by ``pandas.read_excel``.

    Returns
    -------
    numpy.ndarray
        The ``.values`` of the DataFrame that ``read_excel`` produced.
    """
    from pandas import read_excel

    frame = read_excel(excelfile)
    return frame.values

In [3]:
def readExcelRange(excelfile,sheetname="Sheet1",startrow=1,endrow=1,startcol=1,endcol=1):
    """Return a rectangular block of cells from one worksheet.

    The sheet is read without a header row, so every row of the sheet is
    addressable. Row and column numbers are 1-based and inclusive on both
    ends.

    Parameters
    ----------
    excelfile : path or file-like accepted by ``pandas.read_excel``.
    sheetname : sheet to read (passed as ``read_excel``'s second argument).
    startrow, endrow : first and last row of the block (1-based, inclusive).
    startcol, endcol : first and last column of the block (1-based, inclusive).

    Returns
    -------
    numpy.ndarray
        The selected cells as a 2-D array.
    """
    from pandas import read_excel

    sheet = read_excel(excelfile, sheetname, header=None)
    # Convert the 1-based inclusive bounds into 0-based half-open slices.
    row_span = slice(startrow - 1, endrow)
    col_span = slice(startcol - 1, endcol)
    return sheet.values[row_span, col_span]

In [4]:
def readExcel(excelfile,**args):
    """Read cells from ``excelfile`` and unwrap trivially small results.

    With keyword arguments the call is forwarded to ``readExcelRange``;
    without any, ``readExcelSheet1`` is used instead.

    Returns
    -------
    object
        * the single cell value when the result has shape ``(1, 1)``;
        * a 1-D array (the only row) when the result has exactly one row;
        * the 2-D array unchanged otherwise.
    """
    data = readExcelRange(excelfile, **args) if args else readExcelSheet1(excelfile)

    # Guard clauses: peel off the degenerate shapes first.
    if data.shape == (1, 1):
        return data[0, 0]
    if data.shape[0] == 1:
        return data[0]
    return data

In [5]:
def writeExcelData(x,excelfile,sheetname,startrow,startcol):
    """Write ``x`` into an existing workbook without disturbing other cells.

    ``x`` is wrapped in a ``DataFrame`` and written with neither header nor
    index, so only the raw values land in the sheet. The top-left value goes
    to the cell at (``startrow``, ``startcol``), both 1-based.

    The workbook is opened in append mode with ``if_sheet_exists='overlay'``
    so existing sheets and cells outside the written block are kept. This
    replaces the former pattern of assigning ``writer.book`` /
    ``writer.sheets`` and calling ``writer.save()``, which newer pandas
    releases no longer support. The ``with`` block saves and closes the
    writer exactly once, including when ``to_excel`` raises.

    NOTE: ``if_sheet_exists='overlay'`` requires pandas >= 1.4.

    Parameters
    ----------
    x : anything accepted by the ``DataFrame`` constructor.
    excelfile : path of an existing ``.xlsx`` workbook.
    sheetname : target worksheet name.
    startrow, startcol : 1-based position of the top-left written cell.
    """
    from pandas import DataFrame, ExcelWriter

    df = DataFrame(x)
    with ExcelWriter(excelfile, engine='openpyxl', mode='a',
                     if_sheet_exists='overlay') as writer:
        # to_excel uses 0-based offsets, hence the -1 on both coordinates.
        df.to_excel(writer, sheet_name=sheetname, startrow=startrow-1,
                    startcol=startcol-1, header=False, index=False)

In [6]:
def getSheetNames(excelfile):
    """Return the list of worksheet names contained in ``excelfile``.

    The workbook is opened through a context manager so the underlying file
    handle is released as soon as the names have been read.
    """
    from pandas import ExcelFile

    with ExcelFile(excelfile) as workbook:
        return workbook.sheet_names

## Prepare Data

In [10]:
# Workbook holding the training data, the query points and the result templates.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
excelfile = '/Users/eve7947/Downloads/Resubmit_Hsing-Yi-Wang_Assignment_2_Data_and_Template_0221.xlsx'

In [11]:
# Collect the worksheet names of the workbook and display them.
sheets = getSheetNames(excelfile)
sheets

['Data',
 'Female Histogram',
 'Male Histogram',
 'Bayesian',
 'Queries',
 'Reconstructed Female Histogram',
 'Reconstructed Male Histogram',
 'Observations',
 'ID']

In [57]:
# Load the default sheet, then split it into features and labels:
# column 0 is converted to string labels, columns 1-2 to float features.
data = readExcel(excelfile)
X = data[:, 1:3].astype(float)
T = np.array(list(map(str, data[:, 0])))

## Histogram Classifier

In [47]:
def Build2DHistogramClassifier(X,T,B,xmin,xmax):
    """Count training samples per 2-D bin, separately for the two classes.

    Each feature is mapped to a bin index with
    ``round((B - 1) * (x - xmin) / (xmax - xmin))``.

    Parameters
    ----------
    X : (n, 2) array of feature rows.
    T : sequence of labels, one per row of ``X``.
    B : number of bins per feature.
    xmin, xmax : per-feature lower / upper bounds used for the scaling.

    Returns
    -------
    list
        ``[HF, HM]`` - two ``(B, B)`` int32 count arrays. ``HF`` counts the
        rows labelled ``'Female'``; ``HM`` counts every other row.
    """
    scaled = (B - 1) * (X - xmin) / (xmax - xmin)
    cells = np.round(scaled).astype('int32')

    # One boolean per sample row: True for 'Female', False for anything else.
    is_female = np.array([label == 'Female' for label in T[:len(cells)]],
                         dtype=bool)

    histograms = []
    for members in (is_female, ~is_female):
        counts = np.zeros((B, B), dtype='int32')
        # np.add.at accumulates correctly when a bin is hit more than once.
        np.add.at(counts, (cells[members, 0], cells[members, 1]), 1)
        histograms.append(counts)
    return histograms

In [48]:
def Apply2DHistogramClassifier(queries,HF,HM,xmin,xmax):
    """Classify query points with the two class histograms.

    Each query is mapped to a bin with the same formula used when the
    histograms were built, ``round((B - 1) * (q - xmin) / (xmax - xmin))``,
    and the class with the larger count in that bin wins.

    Parameters
    ----------
    queries : (n, 2) array of query rows; a single 1-D row is also accepted.
    HF, HM : square count arrays of identical size for the two classes.
    xmin, xmax : per-feature bounds that were used to build the histograms.

    Returns
    -------
    (resultlabel, resultprob) : two object arrays of length n
        ``resultlabel`` holds ``"F"``, ``"M"`` or ``"Indeterminate"`` (tie,
        empty bin, or query outside the histogram range).
        ``resultprob`` holds the winning count divided by the total count of
        the bin, or NaN for indeterminate queries.
    """
    HF = np.asarray(HF)
    HM = np.asarray(HM)
    # len() replaces np.alen, which no longer exists in current NumPy.
    B = len(HF)
    # A one-row Excel range arrives as a 1-D vector; promote it to one query row.
    queries = np.atleast_2d(np.asarray(queries, dtype=float))
    n = len(queries)

    scaled = np.round((B - 1) * (queries - xmin) / (xmax - xmin))
    # Queries outside [xmin, xmax] (or NaN) have no bin. Without this guard a
    # negative index would silently wrap around to an unrelated bin.
    inside = np.all((scaled >= 0) & (scaled <= B - 1), axis=1)
    binindices = np.where(inside[:, None], scaled, 0).astype('int32')

    countF = np.where(inside, HF[binindices[:, 0], binindices[:, 1]], 0)
    countM = np.where(inside, HM[binindices[:, 0], binindices[:, 1]], 0)

    resultlabel = np.full(n, "Indeterminate", dtype=object)
    resultprob = np.full(n, np.nan, dtype=object)
    indicesF = countF > countM
    indicesM = countM > countF
    resultlabel[indicesF] = "F"
    resultlabel[indicesM] = "M"

    # Divide only where a winner exists: there the total is strictly positive,
    # so empty bins never trigger a 0/0 warning.
    total = countF + countM
    resultprob[indicesF] = countF[indicesF] / total[indicesF]
    resultprob[indicesM] = countM[indicesM] / total[indicesM]
    return resultlabel, resultprob

In [49]:
# Read the query points from rows 3-6, columns 1-2 of the 'Queries' sheet
# and display them.
queries = readExcel(
    excelfile,
    sheetname='Queries',
    startrow=3, endrow=6,
    startcol=1, endcol=2,
).astype(float)
queries

array([[69. , 17.5],
       [66. , 22. ],
       [70. , 21.5],
       [69. , 23.5]])

In [54]:
# Histogram resolution (bins per feature) and the per-feature data range.
B = 7
xmin = X.min(axis=0)
xmax = X.max(axis=0)
HF, HM = Build2DHistogramClassifier(X, T, B, xmin, xmax)

In [55]:
# Classify the query points with the histogram classifier.
resultlabel, resultprob = Apply2DHistogramClassifier(queries, HF, HM, xmin, xmax)

In [56]:
# One row per query: predicted label (column 0) and its probability (column 1).
print(DataFrame([resultlabel, resultprob]).transpose())

               0         1
0  Indeterminate       NaN
1              M  0.846154
2              M  0.833333
3              M         1


## Bayesian Classifier

In [25]:
def Build2DBayesianClassifer(X,T):
    """Estimate per-class Gaussian parameters from labelled samples.

    Parameters
    ----------
    X : (n, d) array of feature rows.
    T : array of labels aligned with ``X``; rows labelled ``'Female'`` and
        ``'Male'`` are used.

    Returns
    -------
    list
        ``[muF, muM, sigmaF, sigmaM, NF, NM]`` - the class means, the class
        covariance matrices (columns treated as variables) and the class
        sample counts.
    """
    summary = {}
    for label in ('Female', 'Male'):
        selector = T == label
        members = X[selector]
        summary[label] = (
            np.mean(members, axis=0),
            np.cov(members, rowvar=False),
            len(T[selector]),
        )

    muF, sigmaF, NF = summary['Female']
    muM, sigmaM, NM = summary['Male']
    return [muF, muM, sigmaF, sigmaM, NF, NM]

In [31]:
# Fit the per-class Gaussian parameters on the training data.
muF, muM, sigmaF, sigmaM, NF, NM = Build2DBayesianClassifer(X, T)

In [32]:
# Show the fitted means, covariance matrices and class sizes.
print(muF, muM, sigmaF, sigmaM, NF, NM)

[65.25280899 19.6011236 ] [71.28846154 22.30128205] [[7.75780452 1.65170135]
 [1.65170135 1.75670327]] [[7.08778721 1.80157343]
 [1.80157343 2.06064769]] 89 78


In [33]:
def pdf(x,mu,sigma):
    """Evaluate a multivariate normal density at each row of ``x``.

    Parameters
    ----------
    x : (n, d) array of points; a single 1-D point of length d is also
        accepted and treated as one row.
    mu : length-d mean vector.
    sigma : (d, d) covariance matrix.

    Returns
    -------
    numpy.ndarray
        Length-n array with the density value of every row of ``x``.
    """
    mu = np.asarray(mu, dtype=float)
    sigma = np.asarray(sigma, dtype=float)
    # len() replaces np.alen, which no longer exists in current NumPy.
    d = len(mu)
    # Promote a lone point to a 1-row matrix so the einsum below always
    # receives 2-D input.
    xc = np.atleast_2d(np.asarray(x, dtype=float)) - mu
    # Normalisation constant 1 / sqrt((2*pi)^d * det(sigma)).
    fact = 1 / np.sqrt((2 * np.pi) ** d * np.linalg.det(sigma))
    isigma = np.linalg.inv(sigma)
    # Row-wise quadratic form xc_i^T * isigma * xc_i.
    quad = np.einsum('ij,jk,ik->i', xc, isigma, xc)
    return fact * np.exp(-0.5 * quad)

In [39]:
def Apply2DBayesianClassifer(queries,muF,muM,sigmaF,sigmaM,NF,NM):
    """Classify query points with the two class-conditional Gaussians.

    For every query the expected "count" of each class is computed as
    ``N * A * pdf(query, mu, sigma)`` and the class with the larger value
    wins.

    Parameters
    ----------
    queries : (n, d) array of query rows; a single 1-D row is also accepted.
    muF, muM : class mean vectors.
    sigmaF, sigmaM : class covariance matrices.
    NF, NM : class sample counts, used as weights.

    Returns
    -------
    (resultlabel, resultprob) : two object arrays of length n
        ``resultlabel`` holds ``"F"``, ``"M"`` or ``"Indeterminate"`` (when
        neither value is larger than the other).
        ``resultprob`` holds the winning value divided by the sum of both,
        or NaN for indeterminate queries.
    """
    # A one-row Excel range arrives as a 1-D vector; promote it to one query row.
    queries = np.atleast_2d(np.asarray(queries, dtype=float))
    # len() replaces np.alen, which no longer exists in current NumPy.
    n = len(queries)

    # Common scale factor applied to both classes; it cancels in the ratio.
    A = 1
    countF = NF * A * pdf(queries, muF, sigmaF)
    countM = NM * A * pdf(queries, muM, sigmaM)

    resultlabel = np.full(n, "Indeterminate", dtype=object)
    resultprob = np.full(n, np.nan, dtype=object)
    indicesF = countF > countM
    indicesM = countM > countF
    resultlabel[indicesF] = "F"
    resultlabel[indicesM] = "M"

    # Divide only where a winner exists: there the total is strictly positive,
    # so two underflowed densities never produce a 0/0 warning.
    total = countF + countM
    resultprob[indicesF] = countF[indicesF] / total[indicesF]
    resultprob[indicesM] = countM[indicesM] / total[indicesM]
    return resultlabel, resultprob

In [44]:
# Read the query points from rows 3-6, columns 1-2 of the 'Queries' sheet
# and display them.
queries = readExcel(
    excelfile,
    sheetname='Queries',
    startrow=3, endrow=6,
    startcol=1, endcol=2,
).astype(float)
queries

array([[69. , 17.5],
       [66. , 22. ],
       [70. , 21.5],
       [69. , 23.5]])

In [45]:
# Classify the query points with the Bayesian classifier.
resultlabel, resultprob = Apply2DBayesianClassifer(queries, muF, muM, sigmaF, sigmaM, NF, NM)

In [46]:
# One row per query: predicted label (column 0) and its probability (column 1).
print(DataFrame([resultlabel, resultprob]).transpose())

   0         1
0  F  0.898536
1  F  0.655579
2  M  0.801143
3  M  0.943548
