In [91]:
import numpy as np
import csv
import math
import matplotlib.pyplot
from matplotlib import pyplot as plt

In [92]:
# Input CSV locations, written as relative paths.
# Feature table read by buildDataDictionary: one row per image (image id plus feature columns).
HumanObservedFeaturesData = "HumanObserved-Features-Data/HumanObserved-Features-Data.csv"
# Pair list read by buildRawConcatenationTables to build the different-writer rows.
observedDiff = "HumanObserved-Features-Data/diffn_pairs.csv"
# Pair list read by buildRawConcatenationTables to build the same-writer rows.
observedSame = "HumanObserved-Features-Data/same_pairs.csv"

# Split sizes, as percentages of each pair table.
# TrainingPercent is read inside buildTrainingData; ValidationPercent is passed to buildValData.
# NOTE(review): TestPercent is not read by any code visible here; buildTestData simply
# takes the rows left over after the training and validation slices.
TrainingPercent = 80
ValidationPercent = 10
TestPercent = 10

In [93]:
#Build dictionary (image_id --> tuple of attribute values)
def buildDataDictionary(featuresPath=None):
    """Load the feature CSV into a dictionary keyed by image id.

    The first line of the file is treated as the header and skipped. For every
    remaining row, column 1 becomes the key and columns 2 through 10 become the
    value, stored as a 9-element tuple of the strings read from the file (no
    numeric conversion is done here).

    Parameters:
        featuresPath: path of the CSV file to read. When omitted (None) the
            module-level HumanObservedFeaturesData path is used.

    Returns:
        dict mapping image id (str) to a tuple of 9 attribute strings. An empty
        file yields an empty dictionary.

    Raises:
        IndexError: if a non-blank row has fewer than 11 columns.
    """
    if featuresPath is None:
        featuresPath = HumanObservedFeaturesData

    # The csv module documentation asks for newline='' so that line breaks
    # inside quoted fields are handled by the reader rather than by open().
    with open(featuresPath, newline='') as csv_File:
        csv_reader = csv.reader(csv_File)

        #step over the column names (the None default keeps an empty file from raising StopIteration)
        next(csv_reader, None)

        dataDict = {}
        for row in csv_reader:
            # csv.reader yields [] for blank lines (e.g. a trailing newline); skip them
            if not row:
                continue
            dataDict[row[1]] = tuple(row[column] for column in range(2, 11))
        return dataDict

    

#reads one pair file and returns its rows with both images' attributes appended
def _readPairRows(pairsPath, dataDict):
    """Build the concatenated rows for a single pair CSV.

    The first line is skipped as the header. Each remaining row contributes
    [row[0], row[1], *dataDict[row[0]], *dataDict[row[1]]]; any further columns
    of the pair file are ignored.
    """
    pairRows = []
    # newline='' is the csv module's documented way to open files for csv.reader
    with open(pairsPath, newline='') as csv_pairs:
        pair_reader = csv.reader(csv_pairs)

        #step over the column names (an empty file just produces no rows)
        next(pair_reader, None)

        for row in pair_reader:
            # csv.reader yields [] for blank lines; skip them instead of failing on row[0]
            if not row:
                continue
            firstId, secondId = row[0], row[1]
            pairRows.append([firstId, secondId] + list(dataDict[firstId]) + list(dataDict[secondId]))
    return pairRows


#returns tables of raw concatenated data
def buildRawConcatenationTables(dataDict, diffPath=None, samePath=None):
    """Build the raw concatenation tables for different-writer and same-writer pairs.

    Parameters:
        dataDict: mapping from image id to an iterable of attribute values
            (as produced by buildDataDictionary).
        diffPath: CSV of different-writer pairs; defaults to the module-level
            observedDiff path when None.
        samePath: CSV of same-writer pairs; defaults to the module-level
            observedSame path when None.

    Returns:
        Tuple (diffMatrix, sameMatrix). Each is a list of rows of the form
        [id_a, id_b, attributes of id_a..., attributes of id_b...].

    Raises:
        KeyError: if a pair references an image id missing from dataDict.
        IndexError: if a non-blank pair row has fewer than two columns.
    """
    if diffPath is None:
        diffPath = observedDiff
    if samePath is None:
        samePath = observedSame

    #Build matrix for different writers
    diffMatrix = _readPairRows(diffPath, dataDict)

    #Build matrix for same writers
    sameMatrix = _readPairRows(samePath, dataDict)

    return (diffMatrix, sameMatrix)
    
#Build Training Data: Parameter: tuple of different and same handwriters (diff, same)
#Returns tuple (training matrix, target array, (diff row count, same row count)); matrix and targets are numpy arrays
def buildTrainingData(dataTuple, trainingPercent=None):
    """Take the leading share of each pair table as the training split.

    Parameters:
        dataTuple: (diff rows, same rows); each element is converted with np.array.
        trainingPercent: percentage of each table to use. When omitted (None)
            the module-level TrainingPercent is used.

    Returns:
        Tuple of
          - training matrix: the same-writer rows followed by the different-writer rows,
          - target array: 1 for every same-writer row, then 0 for every different-writer row,
          - (number of different-writer rows, number of same-writer rows) actually taken.
    """
    if trainingPercent is None:
        trainingPercent = TrainingPercent

    diffMatrix = np.array(dataTuple[0])
    sameMatrix = np.array(dataTuple[1])

    # Exact ceil(len * percent / 100) via floor division. The float form
    # math.ceil(len * 0.01 * percent) can overshoot by one: 35 * 0.01 * 80
    # evaluates to 28.000000000000004, which math.ceil turns into 29.
    T_lenSame = int(-((-len(sameMatrix) * trainingPercent) // 100))
    T_lenDiff = int(-((-len(diffMatrix) * trainingPercent) // 100))

    trainingSame = sameMatrix[0:T_lenSame]
    trainingDiff = diffMatrix[0:T_lenDiff]

    #Create Target Array from the rows actually sliced so it always lines up with the matrix
    trainingTarget = [1] * len(trainingSame) + [0] * len(trainingDiff)

    trainingMatrix = np.concatenate((trainingSame, trainingDiff))

    return (trainingMatrix, np.array(trainingTarget), (len(trainingDiff), len(trainingSame)))


#build validation data: Parameters: different and same handwriters (diff, same), percent of val, # of training rows (diff count, same count)
#Returns tuple (validation matrix, target array, (diff row count, same row count)); matrix and targets are numpy arrays
def buildValData(rawDataTuple, ValPercent, TrainingDiffCount, TrainingSameCount):
    """Take the validation split that follows the training rows in each pair table.

    Parameters:
        rawDataTuple: (diff rows, same rows); each element is converted with np.array.
        ValPercent: percentage of each table to use for validation.
        TrainingDiffCount: number of leading different-writer rows already used for training.
        TrainingSameCount: number of leading same-writer rows already used for training.

    Returns:
        Tuple of
          - validation matrix: the same-writer rows followed by the different-writer rows,
          - target array: 1 for every same-writer row, then 0 for every different-writer row,
          - (number of different-writer rows, number of same-writer rows) actually taken.
    """
    diffMatrix = np.array(rawDataTuple[0])
    sameMatrix = np.array(rawDataTuple[1])

    # Exact ceil(len * percent / 100) using floor division (no float rounding involved)
    valSizeSame = int(-((-len(sameMatrix) * ValPercent) // 100))
    valSizeDiff = int(-((-len(diffMatrix) * ValPercent) // 100))

    valSame = sameMatrix[TrainingSameCount:(TrainingSameCount+valSizeSame)]
    valDiff = diffMatrix[TrainingDiffCount:(TrainingDiffCount+valSizeDiff)]

    #Create Target Array
    # Size it from the slices themselves: a slice is shorter than the requested
    # size when it runs past the end of the table, and labels built from the
    # requested size would then outnumber the matrix rows.
    valTarget = [1] * len(valSame) + [0] * len(valDiff)

    valMatrix = np.concatenate((valSame, valDiff))

    return (valMatrix, np.array(valTarget), (len(valDiff), len(valSame)))


#def buildTestData():


def buildTestData(rawDataTuple, ValDiffCount, ValSameCount, TrainingDiffCount, TrainingSameCount):
    """Collect the rows left over after the training and validation slices.

    Parameters:
        rawDataTuple: (diff rows, same rows); each element is converted with np.array.
        ValDiffCount / ValSameCount: rows of each table used for validation.
        TrainingDiffCount / TrainingSameCount: rows of each table used for training.

    Returns:
        Tuple (test matrix, target array): the remaining same-writer rows followed
        by the remaining different-writer rows, with targets of 1 for the former
        and 0 for the latter.
    """
    allDiff = np.array(rawDataTuple[0])
    allSame = np.array(rawDataTuple[1])

    # Everything past the training + validation rows belongs to the test split
    firstSameRow = TrainingSameCount + ValSameCount
    firstDiffRow = TrainingDiffCount + ValDiffCount
    remainingSame = allSame[firstSameRow:]
    remainingDiff = allDiff[firstDiffRow:]

    # One label per remaining row: same-writer rows first (1), then different-writer rows (0)
    labels = [1] * len(remainingSame) + [0] * len(remainingDiff)

    return (np.concatenate((remainingSame, remainingDiff)), np.array(labels))

In [94]:
#Get dictionary mapping each image id to its tuple of attribute values
dataDict = buildDataDictionary()

#Get observed concatenated tables as tuple (different, same)
observedRawConcatTables = buildRawConcatenationTables(dataDict)


## Prepare Training Data

In [95]:
#Build the training split: training matrix, target array and per-table row counts

#Observed Concatenation Data
observedTrainingConcatData = buildTrainingData(observedRawConcatTables)

#Unpack (matrix, targets, (diff row count, same row count)); the counts are reused by the later splits
(observedTrainingConcatMatrix,
 observedTrainingConcatTarget,
 observedTrainingConcatCounts) = observedTrainingConcatData

print("Observed Concatenation")
print("\tTraining Matrix: ", observedTrainingConcatMatrix.shape, sep="")
print("\tTarget Array: ", observedTrainingConcatTarget.shape, sep="")

Observed Concatenation
	Training Matrix: (235059, 20)
	Target Array: (235059,)


## Prepare Validation Data

In [96]:
# Build the validation split, starting right after the rows already used for training

#Observed Concatenation Data
trainingDiffCount, trainingSameCount = observedTrainingConcatCounts[0], observedTrainingConcatCounts[1]
observedValConcatData = buildValData(
    observedRawConcatTables,
    ValidationPercent,
    trainingDiffCount,
    trainingSameCount,
)
del trainingDiffCount, trainingSameCount  # temporaries only; keep the notebook namespace unchanged

#Unpack (matrix, targets, (diff row count, same row count))
(observedValConcatMatrix,
 observedValConcatTarget,
 observedValConcatCounts) = observedValConcatData

print("Observed Concatenation")
print("\tValidation Matrix: ", observedValConcatMatrix.shape, sep="")
print("\tValidation Array: ", observedValConcatTarget.shape, sep="")

Observed Concatenation
	Validation Matrix: (29384, 20)
	Validation Array: (29384,)


## Prepare Testing Data

In [97]:
#Build the test split from the rows left after the training and validation slices

observedTestConcatData = buildTestData(
    observedRawConcatTables,
    observedValConcatCounts[0],       # validation rows taken from the different-writer table
    observedValConcatCounts[1],       # validation rows taken from the same-writer table
    observedTrainingConcatCounts[0],  # training rows taken from the different-writer table
    observedTrainingConcatCounts[1],  # training rows taken from the same-writer table
)

#Unpack (matrix, targets)
observedTestConcatMatrix, observedTestConcatTarget = observedTestConcatData

print("Observed Concatenation")
print("\tTest Matrix: ", observedTestConcatMatrix.shape, sep="")
print("\tTest Array: ", observedTestConcatTarget.shape, sep="")

Observed Concatenation
	Test Matrix: (29380, 20)
	Test Array: (29380,)
