In [169]:
import numpy as np #import numpy library
ionosphere=np.genfromtxt("ionosphere.txt",delimiter=",") #load data from textfile

#load iris dataset and related methods
from sklearn.datasets import load_iris
iris=load_iris()
from sklearn.model_selection import train_test_split

In [170]:
from math import sqrt

In [171]:
def eucliDistance(training,test):
    '''It calculates the euclidean distance of two vectors'''
    if (training.size==test.size):
        arrlen=training.size
        sum=0
        for i in range(0,arrlen):
            sum+=(training[i]-test[i])**2
        return sqrt(sum)

In [172]:
import math
def NNClassifier(trainset,trainlabels,testTarget):
    '''It finds the training sample with the min distance from the test sample.'''
    distances=list()
    for train_row in trainset:
        distances.append(eucliDistance(train_row,testTarget))
    minDist=math.inf
    curPos=0
    for i in range(0,len(distances)):
        if (distances[i]<minDist):
            minDist=distances[i]
            curPos=i
    return (minDist,trainlabels[curPos]) #minDist,(predicted) label of the closest vecctor

In [173]:
def cfScore(sampleset,labelset,targetSample,targetClass,targetIdx):
    if (sampleset.shape[0]!=labelset.shape[0]):
        return "Invalid size of sample set or label set!"
    diffClassSamples=list() #list of samples from different class
    diffClassLabels=list()
    sameClassSamples=list() #list of samples from same class
    sameClassLabels=list()
    for i in range(sampleset.shape[0]):
        if (i!=targetIdx):
            if (labelset[i]==targetClass):
                sameClassSamples.append(sampleset[i])
                sameClassLabels.append(labelset[i])
            else:
                diffClassSamples.append(sampleset[i])
                diffClassLabels.append(labelset[i])
    conformityScore=NNClassifier(diffClassSamples,diffClassLabels,targetSample)[0]
    try:
        conformityScore/=NNClassifier(sameClassSamples,sameClassLabels,targetSample)[0]
    except ZeroDivisionError:
        return math.inf
    return conformityScore

In [174]:
def findAllCfScores(trainsamples,trainlabels,testsamples,postlabels):
    #concatenate trainsamples and testsamples
    for i in range(trainsamples.shape[0]):
        extrasamples=np.concatenate((trainsamples,testsamples),axis=0)
    #also concatenate trainlabels and postlabels
    for i in range(trainlabels.shape[0]):
        extralabels=np.concatenate((trainlabels,postlabels),axis=0)
    #cur cf --> call cfScore()
    cfscores=np.array([])
    for k in range(0,extrasamples.shape[0]):
        targetSample=extrasamples[k]
        targetClass=extralabels[k]
        curCf=cfScore(extrasamples,extralabels,targetSample,targetClass,k)
        cfscores=np.append(cfscores,curCf) #add cur cf to list
    return cfscores

In [175]:
def rank(target,arr):
    '''give the rank for an item in array: smallest (1) to largest (n).'''
    allItems=set()
    for item in arr:
        allItems.add(item)
    if (target not in allItems):
        return None #error
    rank=1
    for uniqItem in allItems:
        if (uniqItem==target):
            return rank
        else:
            if (uniqItem<target):
                rank+=1

In [176]:
def avgFalsePVal(train_size,test_size,sampleset,labelset,truelabelset,cfScores):
    if (train_size+test_size!=sampleset.shape[0]):
        return "Invalid train size or test size!"
    if (sampleset.shape[0]!=labelset.shape[0]!=cfScores.shape[0]):
        return "Invalid size of either sample set, label set, or conformity scores list!"
    pVals=np.zeros(cfScores.shape[0])
    for i in range(0,pVals.shape[0]):
        #rank each cf score
        curRank=rank(cfScores[i],cfScores)
        #calc p value
        curPVal=curRank/cfScores.shape[0]
        #add to p values list
        pVals[i]=curPVal
    avg=0
    for i in range(train_size,pVals.shape[0]):
        if (labelset[i]!=truelabelset[i-train_size]):
            #a false p value is found
            pval=pVals[i]
            avg+=pval
    avg/=cfScores.shape[0]
    return (avg,pVals)

In [177]:
def displayResults(xtrain,ytrain,xtest,ytest):
    cf_prediction={"p-values":[],"Predicted list":[],"Average False P-value":0,"Error Rate":0,"Number of errors":0}
    
    xtest_rows,xtest_cols=xtest.shape
    y_pred=np.zeros(xtest_rows)
    for k in range(0,xtest_rows):
        y_pred[k]=NNClassifier(X_train,y_train,X_test[k])[1]
    errRate=1-np.mean(y_pred==y_test)
    cf_prediction["Predicted list"]=y_pred
    cf_prediction["Error Rate"]=errRate
    cf_prediction["Number of errors"]=int(errRate*xtest_rows)
    
    for i in range(0,X_train.shape[0]):
        extrasamples=np.concatenate((X_train,X_test),axis=0)
    for j in range(0,y_train.shape[0]):
        extralabels=np.concatenate((y_train,y_pred),axis=0)
    cfScores=findAllCfScores(X_train,y_train,X_test,y_pred)
    cf_prediction["Average False P-value"]=avgFalsePVal(X_train.shape[0],X_test.shape[0],extrasamples,extralabels,y_test,cfScores)[0]
    cf_prediction["p-values"]=avgFalsePVal(X_train.shape[0],X_test.shape[0],extrasamples,extralabels,y_test,cfScores)[1]
    return cf_prediction​

SyntaxError: invalid character in identifier (<ipython-input-177-dec3fb040402>, line 20)

In [178]:
#prediction of iris dataset
X_train,X_test,y_train,y_test=train_test_split(iris['data'],iris['target'],random_state=3009)
displayResults(X_train,y_train,X_test,y_test)

{'p-values': array([0.10666667, 0.06666667, 0.03333333, 0.05333333, 0.1       ,
        0.06      , 0.02      , 0.02      , 0.02      , 0.12666667,
        0.09333333, 0.04666667, 0.04      , 0.06666667, 0.07333333,
        0.01333333, 0.04666667, 0.11333333, 0.00666667, 0.06      ,
        0.18      , 0.43333333, 0.06      , 0.43333333, 0.56666667,
        0.1       , 0.11333333, 0.11333333, 0.07333333, 0.01333333,
        0.10666667, 0.31333333, 0.15333333, 0.30666667, 0.12      ,
        0.12666667, 0.19333333, 0.04666667, 0.21333333, 0.34      ,
        0.19333333, 0.17333333, 0.1       , 0.16      , 0.33333333,
        0.10666667, 0.02666667, 0.22666667, 0.18      , 0.10666667,
        0.18666667, 0.36      , 0.05333333, 0.24      , 0.20666667,
        0.06      , 0.03333333, 0.23333333, 0.04      , 0.10666667,
        0.66      , 0.12      , 0.16      , 0.23333333, 0.1       ,
        0.00666667, 0.6       , 0.51333333, 0.33333333, 0.18      ,
        0.15333333, 0.8       , 0.19

In [179]:
#ionosphere dataset
#splitting the dataset
train_size=round(ionosphere.shape[0]*0.75)
X_train=np.asarray([ionosphere[i][:-1] for i in range(0,train_size)])
X_test=np.asarray([ionosphere[i][:-1] for i in range(train_size,ionosphere.shape[0])])
y_train=np.asarray([ionosphere[i][-1] for i in range(0,train_size)])
y_test=np.asarray([ionosphere[i][-1] for i in range(train_size,ionosphere.shape[0])])

In [180]:
displayResults(X_train,y_train,X_test,y_test)

{'p-values': array([0.31908832, 0.002849  , 0.18518519, 0.002849  , 0.11396011,
        0.35042735, 0.00854701, 0.18518519, 0.37321937, 0.06837607,
        0.01709402, 0.02279202, 0.00854701, 0.02849003, 0.44444444,
        0.01424501, 0.01709402, 0.11111111, 0.29344729, 0.17663818,
        0.68091168, 0.08262108, 0.34188034, 0.26495726, 0.17948718,
        0.002849  , 0.25925926, 0.29344729, 0.64672365, 0.00569801,
        0.00854701, 0.02849003, 0.46153846, 0.01994302, 0.10826211,
        0.01994302, 0.08262108, 0.27635328, 0.4985755 , 0.01424501,
        0.02279202, 0.05413105, 0.56125356, 0.1025641 , 0.34188034,
        0.05982906, 0.15384615, 0.23931624, 0.34472934, 0.11396011,
        0.11111111, 0.0997151 , 0.37891738, 0.35042735, 0.4017094 ,
        0.16524217, 0.46438746, 0.15384615, 0.71794872, 0.07977208,
        0.23646724, 0.14245014, 0.74074074, 0.03418803, 0.13675214,
        0.00569801, 0.12820513, 0.05128205, 0.04273504, 0.07692308,
        0.35612536, 0.0968661 , 0.17