# This is the jupyter Notebook which will do the final data-analysis. It will do the tasks necessary for Phase 1:

## 1) Give all Trimble readings (ID_track) a score. According to GPS_classification_15.11.txt

## 2) Pick out the best reading (ID_track) to use where multiple readings are available for the same plant ID.

### Input: The data tables for every year from Phase 0/ November/ Version 3

### Output: 
1) The original data tables with the score for each ID_track

2) A data table that containts the best reading (ID_track) for every Sample


@ Author: Harald Ringbauer

In [206]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cPickle as pickle

In [207]:
# A few global Variable names.
# String of the Names of all Input Files:
file_names = ["Antspec2009_IDtrack_DF.csv","Antspec2010_IDtrack_DF.csv","Antspec2011_IDtrack_DF.csv",
              "Antspec2012_IDtrack_DF.csv","Antspec2013_IDtrack_DF.csv","Antspec2014_IDtrack_DF.csv"] 

output_file_name = "Antspec_compl.csv"

input_folder =  "./Input_Data/"    # Name of the Folder used for Input Files
output_folder = "./Output_Data/"

In [208]:
data_tables = []   # List into which all data_tables will go.
all_data = []      # Data Frame for all the Data
headers = []       # List into which all headers will go.


for file_name in file_names:
    path = input_folder + file_name
    print("\nLoading " + path)
    data = np.genfromtxt(path, delimiter = '$', dtype=None)
    print("Shape of Data: ")
    print(data.shape)
    df = pd.read_csv(input_folder + file_name, delimiter="$")     # Create Panda Data Frame
    data_tables.append(df)                                        # Append Panda Data 
    headers.append(list(df))
    
print("\n%i Files successfully loaded!" % len(data_tables))      # Quick Status Update

# Quickly check whether all headers are the same:
for i in range(1,len(headers)):
    equal=np.array_equal(headers[i-1], headers[i])
    if equal == False:
        raise ValueError("Not all headers are equal!")
print("All headers equal!")

print("\nData Format: ")
df = data_tables[0]
print(df.dtypes)

# Concatenate all the data into a single data Frame
print("\nMerging all the data!")
all_data=pd.concat(data_tables, axis=0, ignore_index=1) # Concatenate all Data; Ignore Indexing
print("Nr. of all individuals: %i" % all_data.shape[0])
all_data.reset_index(drop=1)
all_data.reset_index()
all_data.head(3)
#all_data


Loading ./Input_Data/Antspec2009_IDtrack_DF.csv
Shape of Data: 
(1925, 27)

Loading ./Input_Data/Antspec2010_IDtrack_DF.csv
Shape of Data: 
(3119, 27)

Loading ./Input_Data/Antspec2011_IDtrack_DF.csv
Shape of Data: 
(2569, 27)

Loading ./Input_Data/Antspec2012_IDtrack_DF.csv
Shape of Data: 
(3741, 27)

Loading ./Input_Data/Antspec2013_IDtrack_DF.csv
Shape of Data: 
(5758, 27)

Loading ./Input_Data/Antspec2014_IDtrack_DF.csv
Shape of Data: 
(8715, 27)

6 Files successfully loaded!
All headers equal!

Data Format: 
Population                object
Location                  object
VisitID                    int64
PlantID                   object
IDtrack                   object
PhenoCat                 float64
Date                      object
year                       int64
ThisYearLabel              int64
Latitude                 float64
Longitude                float64
Altitude                 float64
CorrectedLatitude        float64
CorrectedLongitude       float64
CorrectedAltitude 

Unnamed: 0,Population,Location,VisitID,PlantID,IDtrack,PhenoCat,Date,year,ThisYearLabel,Latitude,...,DistEastofCentre,DistNorthOfCentre,HorzErr,VertErr,GPS,Comments,Alive,Perennial,FloweringStage,LocationManualUpdater
0,HZ,Unk,2022.0,J0001,1_J0001,,16/06/09,2009,1.0,42.321497,...,-232.275681,-131.158978,5.95776,9.13216,1.0,,Alive,,,
1,HZ,Unk,2023.0,J0002,2_J0002,,16/06/09,2009,1.0,42.321479,...,-233.310219,-133.051083,5.92379,8.82668,1.0,,Alive,,,
2,HZ,Unk,2024.0,J0003,3_J0003,,16/06/09,2009,1.0,42.321529,...,-241.00831,-127.430149,7.18771,9.0873,1.0,,Alive,,,


In [230]:
#### Now do the ID - Track Score for Every single individual:
all_data['Track_Score']=-1 # Set the Default Track Score to 0!
un_corr_df = all_data.loc[:,['Latitude','Longitude','Altitude']]                          # The uncorrected Data
corr_df = all_data.loc[:,['CorrectedLatitude','CorrectedLongitude','CorrectedAltitude']]  # The corrected Data                                  # The corrected Data

# Extract the indices where Nan
nan_corr = corr_df.isnull().any(axis=1)   # Get all indices were at least one entry is not 0.
nan_uncorr = un_corr_df.isnull().any(axis=1)   # Get all indices were at least one entry is not 0.

# Give some Output
print("Nr. of NaNs in corrected: %i" % np.sum(list(nan_corr)))
print("Nr. of NaNs in uncorrected: %i" % np.sum(list(nan_uncorr)))
print("Nr. of NaNs in both: %i" % np.sum(list(both_zero)))

# Some logical magic to find out where one entry is 
both_nan = nan_corr & nan_uncorr
one_nan = ~nan_corr & nan_uncorr   # Corrected Value is not NaN, but uncorrected is!
non_nan = ~nan_corr & ~nan_uncorr  # Both are not NaN

# Extract where the two Position-Frames differ:
diff_pos = (un_corr_df.values != corr_df.values).any(axis=1)  # Numpy Array where GPS positions different
same_pos = np.logical_not(diff_pos)
print("Nr. Entries where GPS Entries are different: %i out of %i" % (np.sum(list(diff_pos)), corr_df.shape[0]))

bd = non_nan & diff_pos
bs = non_nan & same_pos

# Extract where someone did a manual Update:
manual_updater = ~all_data.LocationManualUpdater.isnull()   # Extract all entries where there is a manual Updater
print("Nr. of Entries with manual positon update:  %i out of %i" % (np.sum(list(manual_updater)), corr_df.shape[0]))

# Extract where death or label only:
plant_stat_wrong = (all_data.Alive == "Label only") | (all_data.Alive == "Dead")
print("Nr. of Entries with Death or Label Only:  %i out of %i" % (np.sum(list(plant_stat_wrong)), corr_df.shape[0]))

#print(np.sum(list(one_nan)))
all_data.loc[both_nan,'Track_Score'] = 0  # Set the lowest Score
all_data.loc[one_nan,'Track_Score'] = 1   # Set Score = 1: Corrected is not NaN, but Uncorrected is!
all_data.loc[bs,'Track_Score'] = 2        # Both Entries exist - but are the same
all_data.loc[bd,'Track_Score'] = 3        # High score: Both GPS entries exist and are different

all_data.loc[manual_updater, 'Track_Score'] = 1 # Set all entries where there was an manual Updater to 1.
all_data.loc[plant_stat_wrong, 'Track_Score'] = 0 # Set all entrie where the plant status is wrong to 0.

all_data.to_csv(output_folder + output_file_name, sep = '$')

#Print

Nr. of NaNs in corrected: 1466
Nr. of NaNs in uncorrected: 1792
Nr. of NaNs in both: 1466
Nr. Entries where GPS Entries are different: 25690 out of 25821
Nr. of Entries with manual positon update:  404 out of 25821
Nr. of Entries with Death or Label Only:  3225 out of 25821


## Step 2 of Phase 1.

In [232]:
# First load the Data Frame
df = pd.read_csv(output_folder + output_file_name, sep="$")     # Create Panda Data Frame
print("Successfully loaded")
print("Shape of Data: ")
df.shape

Successfully loaded
Shape of Data: 


(25821, 29)

## Test Area
Created to play around with and try out stuff.

In [79]:
# A test data frame created for playing around:

test_df = pd.DataFrame({ 'A' : 1., 'B' : pd.Timestamp('20130102'), 'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                                      'D' : np.array([3] * 4,dtype='int32'),
                                      'Ea' : pd.Categorical(["test","train","test","train"]),
                                      'Fe' : 'foo' })

In [80]:
test_df

Unnamed: 0,A,B,C,D,Ea,Fe
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [140]:
test_df['G']=0
print(test_df.dtypes)
test_df.loc[:,['A','Ea','G']]
test_df.iloc[2, 3] = np.nan
test_df
test_df.isnull().any(axis=1)

A            float64
B     datetime64[ns]
C            float32
D            float64
Ea          category
Fe            object
G              int64
dtype: object


[False, False, True, False]

In [181]:
test_df = pd.DataFrame({ 'A' : 1., 'B' : pd.Timestamp('20130102'), 'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                                      'D' : np.array([3] * 4,dtype='int32'),
                                      'Ea' : pd.Categorical(["test","train","test","train"]),
                                      'Fe' : 'foo' })
test_df1 = pd.DataFrame({ 'A' : 1, 'B' : pd.Timestamp('20130102'), 'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                                      'D' : np.array([3] * 4,dtype='int32'),
                                      'Ea' : pd.Categorical(["test","train","test","train"]),
                                      'Fe' : 'foo' })
test_df1.iloc[2,3]=2.0
test_df1

a=(test_df.values!=test_df1.values).any(axis=1)
a

array([False, False,  True, False], dtype=bool)

In [214]:
all_data.LocationManualUpdater

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
5        NaN
6        NaN
7        NaN
8        NaN
9        NaN
10       NaN
11       NaN
12       NaN
13       NaN
14       NaN
15       NaN
16       NaN
17       NaN
18       NaN
19       NaN
20       NaN
21       NaN
22       NaN
23       NaN
24       NaN
25       NaN
26       NaN
27       NaN
28       NaN
29       NaN
        ... 
25791    NaN
25792    NaN
25793    NaN
25794    NaN
25795    NaN
25796    NaN
25797    NaN
25798    NaN
25799    NaN
25800    NaN
25801    NaN
25802    NaN
25803    NaN
25804    NaN
25805    NaN
25806    NaN
25807    NaN
25808    NaN
25809    NaN
25810    NaN
25811    NaN
25812    NaN
25813    NaN
25814    NaN
25815    NaN
25816    NaN
25817    NaN
25818    NaN
25819    NaN
25820    NaN
Name: LocationManualUpdater, dtype: object