In [None]:
import pandas as pd
import numpy as np
import statistics as stat
import random as random
import math as math

# Ear Phenotypes

In [None]:
# Read the ear-phenotype CSV, keeping column positions 1 and 4-13
# (presumably the QR code plus the ten traits named in traitList - TODO confirm against the header).
earCsvPath = '../data/earphenotypesformatted.csv'
naMarkers = {'a/n', 'a'}  # extra cell contents to parse as missing values
useCols = [1] + list(range(4, 14))
# NOTE(review): df1 and df2 are parsed from the same file with identical options, so the
# concat in the next cell sees every row twice - confirm whether a second source file was intended.
df1 = pd.read_csv(earCsvPath, na_values = naMarkers, usecols = useCols)
df2 = pd.read_csv(earCsvPath, na_values = naMarkers, usecols = useCols)
fileList = [df1, df2]
traitList = [
    'Ear Width',
    'Kernel Fill Length',
    'Kernel Row Number',
    'Kernels per Row',
    'Ear Weight',
    'Kernel Count',
    'Cob Length',
    'Cob Width',
    'Cob Weight',
    '100 Kernel weight',
]

In [None]:
# Stack the ear tables, tidy the plot identifiers, and pivot to one row per plot (QR code)
# with one column per (measured column, ear number).
ears = pd.concat([df1, df2])
# drop=False keeps the pre-concat row labels as an 'index' column; it is carried through the pivot.
ears = ears.reset_index(drop=False)
# One-off repair of a cell recorded as '22.94.' (trailing period) so it can be cast to float.
ears = ears.replace({'22.94.':'22.94'})
ears['Kernel Mass'] = ears['Ear Weight'].astype(float) - ears['Cob Weight'].astype(float)
# Remember which rows actually carry a QR code before casting: astype(str) would otherwise
# turn a missing code into the literal text 'nan' and create a bogus plot.
hasQrCode = ears['QR Code'].notna()
# The cast has to be assigned back (its result used to be discarded) so that .str works
# even when the column was not parsed as strings.
ears['QR Code'] = ears['QR Code'].astype(str).str.upper()
# Number the ears within each plot and keep at most the first four per plot.
ears['earNum'] = ears.groupby('QR Code').cumcount()
ears = ears[hasQrCode & (ears['earNum'] < 4)]
# Exclude plots whose code contains 'INBRED'.
ears = ears[~ears['QR Code'].str.contains('INBRED')]
plots = pd.pivot(ears, index = 'QR Code', columns = 'earNum')
# Guarded so that re-running this cell does not register the derived trait twice.
if 'Kernel Mass' not in traitList:
    traitList.append('Kernel Mass')
plots = plots.astype(float)

In [None]:
# Show the dtype of every pivoted (column, earNum) pair after the astype(float) cast.
plots.dtypes

In [None]:
# For every trait, compute the per-plot mean and standard error of the mean (SEM) using all
# four ears, every 3-ear subset, and every 2-ear subset.
# Result columns are named <trait> + 'mean' or 'se' + <subset label>, e.g. 'Ear Widthmean4'.
index30 = [1, 2, 3]
index31 = [0, 2, 3]
index32 = [0, 1, 2]
# Fourth 3-ear subset: there are C(4,3) = 4 triples, and this one was previously left out,
# so the 3-ear averages were computed over an incomplete set of subsets.
index33 = [0, 1, 3]
index20 = [0, 1]
index21 = [0, 2]
index22 = [0, 3]
index23 = [1, 2]
index24 = [1, 3]
index25 = [2, 3]
index3List = [index30, index31, index32, index33]
index3ListStr = ['30', '31', '32', '33']
index2List = [index20, index21, index22, index23, index24, index25]
index2ListStr = ['20', '21', '22', '23', '24', '25']

# Full set first, then the triples, then the pairs (this fixes the column order of `results`).
subsetLabels = ['4'] + index3ListStr + index2ListStr
subsetMembers = [[0, 1, 2, 3]] + index3List + index2List

# Collect the columns in a dict and build the frame once; inserting a couple of hundred
# columns one at a time fragments the DataFrame and triggers pandas' PerformanceWarning.
resultColumns = {}
for i in traitList:
    earValues = plots[i]
    for label, members in zip(subsetLabels, subsetMembers):
        # meanSeries / seSeries keep their names because a later cell displays meanSeries.
        meanSeries = earValues[members].mean(axis = 1)
        seSeries = earValues[members].sem(axis = 1)
        resultColumns[i + 'mean' + label] = meanSeries
        resultColumns[i + 'se' + label] = seSeries
results = pd.DataFrame(resultColumns)

In [None]:
# Inspect the meanSeries left over from the loop above (last trait, last 2-ear subset).
meanSeries

In [None]:
# Display the per-plot mean / SE table built above.
results

In [5]:
# Per plot and trait, average the SEM columns over all 2-ear subsets ('.SE.2')
# and over all 3-ear subsets ('.SE.3').
avgSE = pd.DataFrame()
for trait in traitList:
    pairSeCols = [trait + 'se' + label for label in index2ListStr]
    tripleSeCols = [trait + 'se' + label for label in index3ListStr]
    avgSE[trait + '.SE.2'] = results[pairSeCols].mean(axis = 1)
    avgSE[trait + '.SE.3'] = results[tripleSeCols].mean(axis = 1)
    

In [None]:
# Display the per-plot SEM averaged over the 2-ear and 3-ear subsets.
avgSE

In [None]:
# Correlate, across plots, each reduced estimate of a trait with its 4-ear plot mean:
#   <trait>mean<label>  - mean of one 2-ear or 3-ear subset vs. the 4-ear mean
#   <trait>.Corr.<k>    - single ear k vs. the 4-ear mean
# corrPerSet is a one-row frame (row label 0) with one column per correlation.
#
# The values are gathered in a dict and the frame is built once. Pre-creating columns with a
# dtype-less pd.Series() is deprecated and leaves object-dtype columns, and it was only done
# for some of the columns.
corrValues = {}
for t in traitList:
    fullMean = results[t + 'mean' + '4']

    # 2-ear subsets first, then 3-ear subsets, then single ears (fixes the column order).
    for label in index2ListStr + index3ListStr:
        colName = t + 'mean' + label
        corrValues[colName] = results[colName].corr(fullMean)

    for k in range(4):
        corrValues[t + '.Corr.' + str(k)] = plots[t][k].corr(fullMean)

corrPerSet = pd.DataFrame(corrValues, index = [0])
    

In [None]:
# List the column labels of corrPerSet (one per subset / single-ear correlation).
corrPerSet.columns

In [11]:
# For each trait, summarise how well an estimate from 1, 2, 3 or 4 ears tracks the 4-ear plot
# mean: '<trait>.MeanCorr' holds the correlation averaged over every subset of that size.
# Only correlations are written to this table; no SEM columns are populated.
summary = pd.DataFrame({'Set Size':[1, 2, 3, 4]})

for t in traitList:
    singleEarCols = [t + '.Corr.' + str(k) for k in range(4)]
    pairCols = [t + 'mean' + label for label in index2ListStr]
    tripleCols = [t + 'mean' + label for label in index3ListStr]

    # corrPerSet has a single row with one column per subset, so the average must be taken
    # ACROSS the selected columns. The earlier form, .mean().values[0], averaged down the
    # single row (axis=0) and then kept only the first column, i.e. it reported the first
    # subset's correlation instead of the mean over subsets.
    meanCorrBySize = [
        corrPerSet[cols].iloc[0].astype(float).mean()
        for cols in (singleEarCols, pairCols, tripleCols)
    ]
    # The 4-ear mean compared with itself correlates perfectly, so the last row is 1.
    summary[t + '.MeanCorr'] = meanCorrBySize + [1.0]

In [12]:
# Write the ear-phenotype summary table to disk, omitting the row index.
earSummaryCsv = '../analysis/RequiredNumberOfMeasurements_EarPhenotypes.csv'
summary.to_csv(earSummaryCsv, index = False)

# NIR

In [13]:
# Load the NIR sheet, rescale the 'As is' constituents by moisture, drop check/inbred samples
# and records with negative values, then pivot to one row per sample code with one column
# per (trait, measurement number).
nir = pd.read_excel('../data/NIRData.xlsx', sheet_name = 'in', dtype = {'Sample ID':str})
# value / (1 - Moisture/100): presumably converts as-is values to a dry-matter basis, with
# Moisture recorded in percent - TODO confirm with the instrument documentation.
dryFraction = 1 - (nir['Moisture'] / 100)
for constituent in ['protein', 'oil', 'fiber', 'ash', 'starch']:
    nir[constituent] = nir[constituent.capitalize() + ' As is'] / dryFraction
# Upper-case the sample code; the str cast turns a missing code into the text 'nan' so the
# .str.contains filters below never see a missing value.
nir['qr'] = nir['Sample ID'].str.upper().astype(str)
nir = nir.rename(columns = {'Moisture':'moisture'})
traits = ['starch', 'protein', 'oil', 'fiber', 'ash', 'moisture']
nir = nir[['qr'] + traits]
nir = nir[~nir['qr'].str.contains('SYNGENTA')]
nir = nir[~nir['qr'].str.contains('INBRED')]
numRecordsTotal = nir.shape[0]
# Keep only records where every trait is >= 0. A missing value also fails this comparison,
# so such rows are removed here and counted in percentNegVals too.
# .copy() makes the filtered frame independent, so adding the 'i' column below is not a
# chained assignment on a slice (SettingWithCopyWarning).
nir = nir[(nir[traits] >= 0).all(axis = 1)].copy()
numRecordsPosVals = nir.shape[0]
# Despite the name this is a fraction between 0 and 1, not a percentage.
percentNegVals = (numRecordsTotal - numRecordsPosVals)/numRecordsTotal
# Number the measurements within each sample code and keep at most the first three.
nir['i'] = nir.groupby('qr').cumcount()
nir = nir[nir['i'] < 3]

nirWide = pd.pivot(nir, index = 'qr', columns = 'i')

In [15]:
# Fraction (0-1 scale, not a percentage) of NIR records removed by the >= 0 filter above.
percentNegVals

0.008487935006669093

In [16]:
# Per sample and NIR trait, compute the mean and SEM over all three measurements and over
# each of the three 2-measurement subsets. Columns: <trait>.Mean.<label> / <trait>.SE.<label>.
# NOTE: index20/index21/index22, index2List and index2ListStr reuse (and overwrite) the names
# defined for the ear traits earlier in the notebook; the later NIR cells read these versions.
index3 = [0, 1, 2]
index20 = [1, 2]
index21 = [0, 2]
index22 = [0, 1]
index2List = [index20, index21, index22]
index2ListStr = ['20', '21', '22']
nirResults = pd.DataFrame()

for trait in traits:
    replicates = nirWide[trait]
    # Full set first, then the pairs, which fixes the column order of nirResults.
    labelledSubsets = [('3', index3)] + [(index2ListStr[n], index2List[n]) for n in range(3)]
    for label, members in labelledSubsets:
        chosen = replicates[members]
        nirResults[trait + '.Mean.' + label] = chosen.mean(axis = 1)
        nirResults[trait + '.SE.' + label] = chosen.sem(axis = 1)


In [17]:
# Per sample and NIR trait, average the SEM over the three 2-measurement subsets ('.SE.2').
nirAvgSE = pd.DataFrame()

for trait in traits:
    pairSeCols = [trait + '.SE.' + index2ListStr[n] for n in range(3)]
    nirAvgSE[trait + '.SE.2'] = nirResults[pairSeCols].mean(axis = 1)

In [20]:
# Correlate, across samples, each reduced estimate of an NIR trait with its 3-measurement mean:
#   <trait>.Mean.<label> - mean of one 2-measurement subset vs. the full mean
#   <trait>.Corr.<j>     - single measurement j vs. the full mean
# nirCorrPerSet is a one-row frame (row label 0) with one column per correlation.
#
# The values are gathered in a dict and the frame is built once; pre-creating each column
# with a dtype-less pd.Series() is deprecated and leaves object-dtype columns.
nirCorrValues = {}
for t in traits:
    fullMean = nirResults[t + '.Mean.3']

    for label in index2ListStr:
        colName = t + '.Mean.' + label
        nirCorrValues[colName] = nirResults[colName].corr(fullMean)

    for j in range(3):
        nirCorrValues[t + '.Corr.' + str(j)] = nirWide[t][j].corr(fullMean)

nirCorrPerSet = pd.DataFrame(nirCorrValues, index = [0])
        

In [21]:
# For each NIR trait, summarise how well an estimate from 1, 2 or 3 measurements tracks the
# 3-measurement mean: '<trait>.MeanCorr' is the correlation averaged over every subset of that size.
# Only correlations are written to this table; no SEM columns are populated.
nirSummary = pd.DataFrame({'Set Size': [1, 2, 3]})

for t in traits:
    singleCols = [t + '.Corr.' + str(j) for j in range(3)]
    pairCols = [t + '.Mean.' + label for label in index2ListStr]

    # nirCorrPerSet has a single row with one column per subset, so the average must be taken
    # ACROSS the selected columns. The earlier form, .mean().values[0], averaged down the
    # single row (axis=0) and kept only the first column, i.e. it reported the first subset's
    # correlation instead of the mean over subsets.
    meanSingle = nirCorrPerSet[singleCols].iloc[0].astype(float).mean()
    meanPair = nirCorrPerSet[pairCols].iloc[0].astype(float).mean()
    # The full mean compared with itself correlates perfectly, so the last row is 1.
    nirSummary[t + '.MeanCorr'] = [meanSingle, meanPair, 1.0]

In [22]:
# Write the NIR summary table to disk, omitting the row index.
nirSummaryCsv = '../analysis/RequiredNumberOfMeasurements_NIR.csv'
nirSummary.to_csv(nirSummaryCsv, index = False)