# Example 2 (November 2019)
## Outline of Topics
 - Quick review -- NumPy
 - Pandas
 - Some real data ...

## The NumPy Package

In [1]:
# NumPy (Numerical Python): provides an efficient interface to store and compute on 
# dense data buffers. NumPy arrays are much more efficient than Python's built-in list 
# data type. All elements of a NumPy array must be of the same type. If types do not
# match, NumPy will upcast if possible. Can explicitly set the type in a NumPy array ....
# dtype = 'float32' ... etc
#
# NumPy arrays can be multi-dimensional in contrast to lists.
#
# Some useful features of NumPy are: np.zeros, np.ones, np.full, np.arange, np.linspace, 
# np.random.random, np.random.normal, np.random.randint, np.eye, etc.
#
# Also Matplotlib for plotting.

import numpy as np
import sys
import matplotlib as mpl
import matplotlib.pyplot as plt


## The Pandas Package

In [2]:
# Pandas is built on top of NumPy to create an efficient implementation of
# a DataFrame ... which is a multidimensional array with attached row and
# column labels. Mimics the way data is handled in R and popular spreadsheets.
# Useful for making pivot tables and other summary groupings.
#
# There is also a Series object worth using.

In [3]:
import pandas as pd

In [4]:
# Read a csv file from the local disk into a pandas DataFrame. The path given in the
# command is relative to the current working directory (i.e., where this version of
# Example-Day2.ipynb is running).
#
# './data/case-290-gps-fs10.csv'
#  ^   ^          ^
#  |   |          |
#  |   subdir     file name
#  |
#  current working directory

df290 = pd.read_csv('./data/case-290-gps-fs10.csv')

In [5]:
print(type(df290))

<class 'pandas.core.frame.DataFrame'>


In [6]:
# Several ways to peek at the data we've just read ...

df290.head()

Unnamed: 0,gpsTime,lat,lon,altitude,speed,bearing,accuracy
0,1530997640000,40.778404,-102.368275,1152.900024,3.0,77.5,3
1,1530997641000,40.778415,-102.368247,1153.199951,3.0,65.9,3
2,1530997642000,40.778433,-102.368226,1152.699951,3.0,49.9,3
3,1530997643000,40.778456,-102.36821,1152.599976,3.0,36.1,3
4,1530997644000,40.778481,-102.368201,1152.099976,3.0,24.8,3


In [7]:
# Change the default number of rows ...

df290.head(10)

Unnamed: 0,gpsTime,lat,lon,altitude,speed,bearing,accuracy
0,1530997640000,40.778404,-102.368275,1152.900024,3.0,77.5,3
1,1530997641000,40.778415,-102.368247,1153.199951,3.0,65.9,3
2,1530997642000,40.778433,-102.368226,1152.699951,3.0,49.9,3
3,1530997643000,40.778456,-102.36821,1152.599976,3.0,36.1,3
4,1530997644000,40.778481,-102.368201,1152.099976,3.0,24.8,3
5,1530997645000,40.778507,-102.368197,1152.699951,2.75,16.1,3
6,1530997646000,40.778533,-102.368197,1152.300049,2.75,10.2,3
7,1530997647000,40.778558,-102.368199,1152.800049,2.75,6.7,3
8,1530997648000,40.77858,-102.368202,1152.699951,2.75,5.8,3
9,1530997649000,40.778604,-102.368203,1153.5,2.75,6.3,3


In [8]:
# What if I want to look at the end ... ?

df290.tail()

Unnamed: 0,gpsTime,lat,lon,altitude,speed,bearing,accuracy
8227,1531010458000,40.778583,-102.36424,1156.0,2.5,200.4,3
8228,1531010459000,40.778564,-102.364252,1156.199951,2.5,204.8,3
8229,1531010460000,40.778547,-102.364267,1156.099976,2.5,208.9,3
8230,1531012947000,40.778424,-102.368198,1155.699951,8.0,269.8,3
8231,1531012948000,40.778424,-102.368297,1156.0,8.5,269.5,3


In [9]:
# In the version of pandas we are running here, we can see both head and tail
# by simply typing the dataframe name ...

df290

Unnamed: 0,gpsTime,lat,lon,altitude,speed,bearing,accuracy
0,1530997640000,40.778404,-102.368275,1152.900024,3.0,77.5,3
1,1530997641000,40.778415,-102.368247,1153.199951,3.0,65.9,3
2,1530997642000,40.778433,-102.368226,1152.699951,3.0,49.9,3
3,1530997643000,40.778456,-102.368210,1152.599976,3.0,36.1,3
4,1530997644000,40.778481,-102.368201,1152.099976,3.0,24.8,3
...,...,...,...,...,...,...,...
8227,1531010458000,40.778583,-102.364240,1156.000000,2.5,200.4,3
8228,1531010459000,40.778564,-102.364252,1156.199951,2.5,204.8,3
8229,1531010460000,40.778547,-102.364267,1156.099976,2.5,208.9,3
8230,1531012947000,40.778424,-102.368198,1155.699951,8.0,269.8,3


In [10]:
# Refer to any particular column by reference to its column label.
# For example ...

df290.gpsTime

0       1530997640000
1       1530997641000
2       1530997642000
3       1530997643000
4       1530997644000
            ...      
8227    1531010458000
8228    1531010459000
8229    1531010460000
8230    1531012947000
8231    1531012948000
Name: gpsTime, Length: 8232, dtype: int64

In [11]:
# The "values" attribute creates a NumPy array from the dataframe ...

df290.values

array([[ 1.53099764e+12,  4.07784043e+01, -1.02368275e+02, ...,
         3.00000000e+00,  7.75000000e+01,  3.00000000e+00],
       [ 1.53099764e+12,  4.07784146e+01, -1.02368247e+02, ...,
         3.00000000e+00,  6.59000000e+01,  3.00000000e+00],
       [ 1.53099764e+12,  4.07784328e+01, -1.02368226e+02, ...,
         3.00000000e+00,  4.99000000e+01,  3.00000000e+00],
       ...,
       [ 1.53101046e+12,  4.07785467e+01, -1.02364267e+02, ...,
         2.50000000e+00,  2.08900000e+02,  3.00000000e+00],
       [ 1.53101295e+12,  4.07784238e+01, -1.02368198e+02, ...,
         8.00000000e+00,  2.69800000e+02,  3.00000000e+00],
       [ 1.53101295e+12,  4.07784236e+01, -1.02368297e+02, ...,
         8.50000000e+00,  2.69500000e+02,  3.00000000e+00]])

In [12]:
# Check its type ...

type(df290.values)

numpy.ndarray

In [13]:
df290.values.shape

(8232, 7)

In [14]:
df290.values[1,3]

1153.19995117188

In [15]:
# Here is a way to refer to a single row of the dataframe ...

df290.loc[2]

gpsTime     1.530998e+12
lat         4.077843e+01
lon        -1.023682e+02
altitude    1.152700e+03
speed       3.000000e+00
bearing     4.990000e+01
accuracy    3.000000e+00
Name: 2, dtype: float64

In [None]:
type(df290.loc[2])

In [None]:
# Here is another way ...

df290.values[2]

In [None]:
type(df290.values[2])

## Now Look Harder at the Data

* GPS Time

In [None]:
# What sort of data are they ... ?

type(df290.gpsTime[0])

In [None]:
# But they are big integers ...

df290.gpsTime[0]

In [None]:
# Let's look at a few of them again ...

df290.gpsTime

* Note that the GPS time stamps seem to increase by 1000 at each step.
* Since most GPS receivers are reporting once per second --> 1000 corresponds to 1 sec.
* That is, the smallest unit in the gpsTime value is a millisecond.
* Thus, each tick of the gpsTime "clock" would then represent

        1/(1000 * 3600 * 24 * 365) years.

In [None]:
# If so, the clock started df290.gpsTime[0]/(1000*3600*24*365) years before the
# first log entry (1000 ms/s * 3600 s/h * 24 h/day * 365 days/year):

df290.gpsTime[0]/(1000*3600*24*365)

## In the beginning the world was ....
* and the beginning was January 1, 1970 at 12 am UTC (the Unix epoch)
* The wheat harvest data was taken in July of 2018 ... about 48.5 years later!

In [None]:
# The total time that a clock stored as a signed 32 bit integer count of seconds
# could "hold" before wrapping is 2**31 seconds. Expressed in years
# (3600 s/h * 24 h/day * 365 days/year):

wrap_years = (2 ** 31)/(3600*24*365)
wrap_years

# About 68 years after 1970, i.e. 2038 -- which is our next Y2K worry. Perhaps
# there are more important things to ponder today?

In [None]:
# Span between the largest and smallest gpsTime values in the grain cart log,
# converted from milliseconds to hours (1000 ms/s * 3600 s/h) and rounded to
# two decimals:

stamps_290 = df290.gpsTime
span_ms_290 = stamps_290.max() - stamps_290.min()
round(span_ms_290/(1000*3600), 2)

In [None]:
# Let's take a look at how gpsTime evolves in the cart log file as
# a check ...

%matplotlib inline
fig = plt.figure()
plt.style.use('classic')
plt.plot(df290.gpsTime)
plt.grid()

Notes:

* What's up with the units on the axes? Nearly unreadable.
* Something is wrong with the log file since time is jumping.

In [None]:
# How to fix the odd units and make the plot axis make more sense ...

%matplotlib inline
fig = plt.figure()
plt.style.use('classic')
plt.plot(round((df290.gpsTime - min(df290.gpsTime))/(1000*3600), 4))
plt.title("Elapsed Time in Case 290 Log")
plt.xlabel("Observation Number")
plt.ylabel("Time in Hours")
plt.grid()

Notes:

* Certainly looks better and axes are readable.
* Subtracting the minimum timestamp from every entry starts the elapsed time at zero
* Round was used to strip off superfluous digits. 
    - "4" was used since 1/3600 of an hour is the basic time increment
    - Try using 3, 2, 1 to see what happens.
* From the above plot can see that:
    - the logger must have been turned off at various times because of the jumps.
    - Might be interesting to see where the tractor was when these events occurred.


In [None]:
# Look at the speed column

df290.speed

What do we think the units are?

* Miles per hour? Feet per second? Meters per second? Parsecs per day?
* Okay ... it's meters per second: 2.5 m/s is about 5.6 miles/hour

"You've never heard of the Millennium Falcon?
... It's the ship that made the Kessel Run in less than twelve parsecs."

![falcon](figures/Falcon.jpeg)

(By the way, a parsec is the distance at which the radius of the earth's orbit -- one astronomical unit -- subtends an angle of 1 second of arc.)

In [None]:
# What do the speeds look like over time?

%matplotlib inline
fig = plt.figure()
plt.style.use('classic')
plt.plot(df290.speed)
plt.title("Speed in Case 290 Log")
plt.xlabel("Observation Number")
plt.ylabel("Speed in Meters per second")
plt.grid()

In [None]:
# What are the values in our data set?

%matplotlib inline
fig = plt.figure()
plt.style.use('classic')
plt.hist(df290.speed, bins=30)
plt.title("Histogram of Speeds in Case 290 Log")
plt.xlabel("Speed in m/s")
plt.ylabel("Number of observations")
plt.grid()


## How about some combine tracks?

In [None]:
# Read the Case 7130 GPS log from the ./data subdirectory of the current
# working directory into its own DataFrame.
df7130 = pd.read_csv('./data/case-7130-gps-fs10.csv')

In [None]:
df7130

* Hmmm. There are many more rows in this log.

In [None]:
%matplotlib inline
fig = plt.figure()
plt.style.use('classic')
plt.plot(round((df7130.gpsTime - min(df7130.gpsTime))/(1000*3600), 4))
plt.title("Elapsed Time in Case 7130 Log")
plt.xlabel("Observation Number")
plt.ylabel("Time in Hours")
plt.grid()

Notes:

* Elapsed times of the two logs are about the same (machines are working the same field)
* Smooth linear increase is what one would expect for time flow.

In [None]:
# Speed time series ...

%matplotlib inline
fig = plt.figure()
plt.style.use('classic')
plt.plot(df7130.speed)
plt.title("Speed in Case 7130 Log")
plt.xlabel("Observation Number")
plt.ylabel("Speed in Meters per second")
plt.grid()

In [None]:
# Speed histogram

%matplotlib inline
fig = plt.figure()
plt.style.use('classic')
plt.hist(df7130.speed, bins=30)
plt.title("Histogram of Speeds in Case 7130 Log")
plt.xlabel("Speed in m/s")
plt.ylabel("Number of observations")
plt.grid()

In [None]:
# How to show two plots side by side so that we can easily compare them ... ?

%matplotlib inline
fig = plt.figure()
plt.style.use('classic')
fig.subplots_adjust(wspace=1.0)

plt.subplot(1, 2, 1)
plt.hist(df7130.speed, bins=30)
plt.title("7130 Speeds")
plt.xlabel("Speed in m/s")
plt.ylabel("Number of observations")
plt.grid()

plt.subplot(1, 2, 2)
plt.hist(df290.speed, bins=30)
plt.title("290 Speeds")
plt.xlabel("Speed in m/s")
plt.ylabel("Number of observations")
plt.grid()



Notes:

* Grain carts operate over a larger speed range than do combines (check!)
* Grain carts look to spend more time sitting (check!)
* Average speed of grain carts is higher than combines working in the field (yeah!)

In [None]:
# Try some scatterplots

%matplotlib inline
fig = plt.figure()
plt.style.use('classic')
plt.plot(df7130.lon, df7130.lat, '.', color='black')
plt.title("7130 Points")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.axis("equal")
plt.grid()

In [None]:
%matplotlib inline
fig = plt.figure()
plt.style.use('classic')
plt.plot(df290.lon, df290.lat, '.', color='black')
plt.title("290 Points")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.axis("equal")
plt.grid()

In [None]:
# Clearly having a little trouble with our logger. This was also evident
# in the time file ...

In [None]:
# A "poor man's" map projection: A degree of lattitude is about 69 miles 
# everywhere on the earth. At 40 degrees north lattitude, a degree of longitude
# is about 53 miles. Therefore, at 40 degrees north lattitude, the ratio is
# about

# Lat/Lon = 69/53 .... scale

%matplotlib inline
fig = plt.figure()
plt.style.use('classic')
plt.plot(df290.lon, (53/69)*df290.lat, '.', color='black')
plt.title("290 Points")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.axis("equal")
plt.grid()

In [None]:
%matplotlib inline
fig = plt.figure()
plt.style.use('classic')
plt.plot(df7130.lon, (53/69)*df7130.lat, '.', color='black')
plt.title("7130 Points")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.axis("equal")
plt.grid()