# Manual Changes

## template mapping files are in the git repository
## original data in _CyVerse Discovery Environment_ 
### data file is: "J.Biogeo.2008.AllData.Final.csv"

### _catalogNumber_
- in Specimen.Number column (new catalogNumber)
- separate out institutionCode from Specimen.Number
- create new column titled institutionCode

### _measurementUnit_
- either in "g" or "mm"

### _otherCatalogNumbers_
- concatenated list of:
    - Proxy.Specimen.Number
    - Annual.Specimen.Number
    - YOC.Specimen.Number

### _unused columns_
- datum (units for latitude)

## To Code
### _elevationInMeters_
- in _elevation.ft_
- convert ot meters

In [1]:
import pandas as pd

In [2]:
#Import Biogeo Data
biogeo = pd.read_csv("biogeo.csv")

In [3]:
#Make necessary conversions

#Convert elevation.ft values from feet to meters
#1 foot is exactly 0.3048 meters
biogeo['elevation.ft']=biogeo['elevation.ft'].multiply(0.3048)

#Add measurementUnit column 
biogeo=biogeo.assign(measurementUnit = "")

biogeo

Unnamed: 0,Specimen.Number,Proxy.Specimen.Number,Annual.Specimen.Number,YOC.Specimen.Number,dec.lat,dec.long,max.error,datum,elevation.ft,c.diastema.1.mm,...,spr.max.c,spr.min.c,spr.precip.in,sum.max.c,sum.min.c,sum.precip.in,win.max.c,win.min.c,win.precip.in,measurementUnit
0,MVZ 100739,MVZ 100739,,,36.458730,-121.234230,0.16089,NAD27,457.20,8.17,...,,,,,,,,,,
1,MVZ 100740,MVZ 100740,,,35.328304,-119.845250,4.02300,NAD27,822.96,7.52,...,,,,,,,,,,
2,MVZ 101240,,MVZ 101240,MVZ 101240,37.850522,-122.536923,1.55600,NAD27,,8.20,...,64.13000,46.70667,2.106667,71.27667,52.82667,0.103333,58.02667,44.04667,4.473333,
3,MVZ 101332,,MVZ 101332,MVZ 101332,38.107071,-122.841182,0.65000,NAD27,,9.78,...,67.57667,40.95000,1.956667,79.35000,47.49000,0.093333,60.95667,38.16333,4.840000,
4,MVZ 101333,MVZ 101333,MVZ 101333,MVZ 101333,38.119278,-122.821322,2.17580,NAD27,45.72,9.52,...,67.57667,40.95000,1.956667,79.35000,47.49000,0.093333,60.95667,38.16333,4.840000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,MVZ 99936,MVZ 99936,,,33.925100,-116.681400,1.00000,NAD27,365.76,9.20,...,,,,,,,,,,
284,MVZ 99937,MVZ 99937,,,33.925100,-116.681400,1.00000,NAD27,365.76,8.52,...,,,,,,,,,,
285,MVZ 99938,MVZ 99938,,,33.925100,-116.681400,1.00000,NAD27,365.76,7.86,...,,,,,,,,,,
286,MVZ 99940,MVZ 99940,,,33.877686,-116.621661,1.60900,NAD27,335.28,8.10,...,,,,,,,,,,


In [73]:
#Rearrange columns so that template columns are first, followed by measurement values

#Create column list
cols = biogeo.columns.tolist()

#Specify desired columns
cols = ['Specimen.Number',
        'dec.lat',
        'dec.long',  
        'max.error',
        'datum',
        'elevation.ft',
        'ear.length.mm',
        'hind.foot.length.mm',
        'tail.length.mm',
        'total.length.mm',
        'body.mass.g',
        'measurementUnit']

#Subset dataframe
biogeo = biogeo[cols]

In [74]:
#Matching template and column terms

#Renaming columns 
biogeo = biogeo.rename(columns = {'Specimen.Number':'individualID', 
                                  'dec.lat':'decimalLatitude', 
                                  'dec.long':'decimalLongitude',  
                                  'max.error':'coordinateUncertaintyInMeters', 
                                  'elevation.ft':'pointElevationInMeters'})

In [75]:
#Matching trait and ontology terms

#Renaming columns
biogeo = biogeo.rename(columns={'ear.length.mm':'ear length',
                                'hind.foot.length.mm':'hind foot length',
                                'tail.length.mm': 'tail length',
                                'total.length.mm':'full body length',
                                'body.mass.g':'body mass'})

In [76]:
#create long version so that each trait has its own row

#creating long version, first specifiying keep variables, then naming variable and value
longVers=pd.melt(biogeo, 
                id_vars=['individualID',
                'decimalLatitude',
                'decimalLongitude',  
                'coordinateUncertaintyInMeters',
                'datum',
                'pointElevationInMeters',
                'measurementUnit'], 
                var_name = 'trait', 
                value_name = 'measurement')

#Populating measurementUnit column with appropriate measurement units in long version
for ind in longVers.index:
    if longVers['trait'][ind] == "body mass":
        longVers['measurementUnit'][ind]="g"
    else:
        longVers['measurementUnit'][ind]="mm"
    
#Writing long data csv file
longVers.to_csv('Biogeo_Data_Long.csv');

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,individualID,decimalLatitude,decimalLongitude,coordinateUncertaintyInMeters,datum,pointElevationInMeters,measurementUnit,trait,measurement
0,MVZ 100739,36.458730,-121.234230,0.16089,NAD27,457.20,mm,ear length,-
1,MVZ 100740,35.328304,-119.845250,4.02300,NAD27,822.96,mm,ear length,
2,MVZ 101240,37.850522,-122.536923,1.55600,NAD27,,mm,ear length,
3,MVZ 101332,38.107071,-122.841182,0.65000,NAD27,,mm,ear length,
4,MVZ 101333,38.119278,-122.821322,2.17580,NAD27,45.72,mm,ear length,
...,...,...,...,...,...,...,...,...,...
1435,MVZ 99936,33.925100,-116.681400,1.00000,NAD27,365.76,g,body mass,
1436,MVZ 99937,33.925100,-116.681400,1.00000,NAD27,365.76,g,body mass,
1437,MVZ 99938,33.925100,-116.681400,1.00000,NAD27,365.76,g,body mass,
1438,MVZ 99940,33.877686,-116.621661,1.60900,NAD27,335.28,g,body mass,
