# Trip data exploration

## Class retrieving the data from the C++ server program

In [None]:
import numpy
import pandas

class DataReader:
    """Client for the text protocol spoken with the C++ server program.

    Every request writes a single command string to the Python-to-C++ pipe,
    closes it, and then reads the reply from the C++-to-Python pipe. Both
    pipes are opened by path for each request and are always closed again,
    even when the reply cannot be parsed.
    """

    def __init__( self, cppToPythonPipeName, pythonToCppPipeName ):
        """Remember the paths of the reply pipe and of the command pipe."""
        self.__cppToPythonPipeName = cppToPythonPipeName
        self.__pythonToCppPipeName = pythonToCppPipeName

    # ------------------------------------------------------------------
    # Private protocol helpers
    # ------------------------------------------------------------------

    def _send( self, command ):
        """Write one command string to the Python-to-C++ pipe."""
        with open( self.__pythonToCppPipeName, 'w' ) as fo:
            fo.write( command )

    def _requestList( self, command, convert ):
        """Send command; the reply is a count line followed by that many
        single-value lines. Return the values converted with `convert`."""
        self._send( command )
        with open( self.__cppToPythonPipeName, 'r' ) as fi:
            n = int( fi.readline() )
            return [ convert( fi.readline() ) for _ in range(n) ]

    def _requestColumn( self, command ):
        """Like _requestList for floats, returned as an (n, 1) array."""
        values = self._requestList( command, float )
        return numpy.array( values, dtype = float ).reshape( [len(values), 1] )

    def _requestVector( self, command ):
        """Like _requestList for floats, returned as a 1-D array."""
        return numpy.array( self._requestList( command, float ), dtype = float )

    def _requestRows( self, command, width ):
        """Send command; the reply is a count line followed by that many
        whitespace-separated rows. Return the first `width` fields of each
        row as an (n, width) float array."""
        self._send( command )
        rows = []
        with open( self.__cppToPythonPipeName, 'r' ) as fi:
            n = int( fi.readline() )
            for _ in range(n):
                fields = fi.readline().strip().split()
                rows.append( [ float(fields[k]) for k in range(width) ] )
        # The explicit reshape keeps the (0, width) shape for an empty reply.
        return numpy.array( rows, dtype = float ).reshape( [len(rows), width] )

    def _requestFloat( self, command ):
        """Send command and return the single float on the first reply line."""
        self._send( command )
        with open( self.__cppToPythonPipeName, 'r' ) as fi:
            return float( fi.readline() )

    def _requestTable( self, command ):
        """Send command; the reply is a line of column names, a row count and
        then the rows. Return (float array of shape (n, nColumns), names)."""
        self._send( command )
        with open( self.__cppToPythonPipeName, 'r' ) as fi:
            columnNames = fi.readline().strip().split()
            nVariables = len(columnNames)
            n = int( fi.readline() )
            result = numpy.zeros((n,nVariables))
            for i in range(n):
                fields = fi.readline().strip().split()
                for j in range(nVariables):
                    result[i,j] = float(fields[j])
        return result, columnNames

    # ------------------------------------------------------------------
    # Public requests
    # ------------------------------------------------------------------

    def exit( self ):
        """Send the "exit" command; no reply is read."""
        self._send( "exit" )
        return

    def driverList( self ):
        """Return the driver ids reported by the server as a list of ints."""
        return self._requestList( "drivers", int )

    def tripList( self, driverId ):
        """Return the trip ids of the given driver as a list of ints."""
        return self._requestList( "trips " + str(driverId), int )

    def rawData( self, driverId, tripId ):
        """Return the raw two-column data of a trip as an (n, 2) float array."""
        return self._requestRows( "rawdata " + str(driverId) + " " + str(tripId), 2 )

    def segmentData( self, driverId, tripId ):
        """Return the trip's segments as a list of (n, 2) float arrays.

        The reply starts with the number of segments; each segment is a
        point count followed by that many two-column lines.
        """
        self._send( "segments " + str(driverId) + " " + str(tripId) )
        with open( self.__cppToPythonPipeName, 'r' ) as fi:
            lines = fi.readlines()
        segments = []
        nsegs = int(lines[0].strip())
        cursor = 1
        for _ in range(nsegs):
            n = int(lines[cursor].strip())
            cursor += 1
            points = []
            for line in lines[cursor:cursor + n]:
                fields = line.strip().split()
                points.append( [float(fields[0]), float(fields[1])] )
            if len(points) != n:
                # Same failure as indexing past the end of the reply.
                raise IndexError( "list index out of range" )
            cursor += n
            segments.append( numpy.array( points, dtype = float ).reshape( [n, 2] ) )
        return segments

    def travelDuration( self, driverId, tripId ):
        """Return the server's "travelDuration" value for the trip as a float."""
        return self._requestFloat( "travelDuration " + str(driverId) + " " + str(tripId) )

    def travelLength( self, driverId, tripId ):
        """Return the server's "travelLength" value for the trip as a float."""
        return self._requestFloat( "travelLength " + str(driverId) + " " + str(tripId) )

    def distanceOfEndPoint( self, driverId, tripId ):
        """Return the server's "distanceOfEndPoint" value for the trip as a float."""
        return self._requestFloat( "distanceOfEndPoint " + str(driverId) + " " + str(tripId) )

    def speedValues( self, driverId, tripId ):
        """Return the reply to the "speed" command as an (n, 1) float array."""
        return self._requestColumn( "speed " + str(driverId) + " " + str(tripId) )

    def fftValues( self, driverId, tripId ):
        """Return the reply to the "fft" command as a 1-D float array."""
        return self._requestVector( "fft " + str(driverId) + " " + str(tripId) )

    def fftDirectionValues( self, driverId, tripId ):
        """Return the reply to the "fft_direction" command as a 1-D float array."""
        return self._requestVector( "fft_direction " + str(driverId) + " " + str(tripId) )

    def accelerationValues( self, driverId, tripId ):
        """Return the reply to the "acceleration" command as an (n, 1) float array."""
        return self._requestColumn( "acceleration " + str(driverId) + " " + str(tripId) )

    def directionValues( self, driverId, tripId ):
        """Return the reply to the "direction" command as an (n, 1) float array."""
        return self._requestColumn( "direction " + str(driverId) + " " + str(tripId) )

    def speedAccelerationDirectionValues( self, driverId, tripId ):
        """Return the reply to "speedAccelerationDirection" as an (n, 3) float array."""
        return self._requestRows( "speedAccelerationDirection " + str(driverId) + " " + str(tripId), 3 )

    def speedQuantiles( self, driverId, tripId ):
        """Return the reply to the "speedQuantiles" command as an (n, 1) float array."""
        return self._requestColumn( "speedQuantiles " + str(driverId) + " " + str(tripId) )

    def accelerationQuantiles( self, driverId, tripId ):
        """Return the reply to "accelerationQuantiles" as an (n, 1) float array."""
        return self._requestColumn( "accelerationQuantiles " + str(driverId) + " " + str(tripId) )

    def directionQuantiles( self, driverId, tripId ):
        """Return the reply to "directionQuantiles" as an (n, 1) float array."""
        return self._requestColumn( "directionQuantiles " + str(driverId) + " " + str(tripId) )

    def allTripMetrics( self ):
        """Return (metrics array, column names) for the "allTripMetrics" command."""
        return self._requestTable( "allTripMetrics" )

    def driverTripMetrics( self, driverId ):
        """Return (metrics array, column names) for one driver's trips."""
        return self._requestTable( "driverTripMetrics " + str(driverId) )



## A function to plot the raw data

In [None]:
# Signed angle, in radians, of the rotation that carries x onto y
def angleOfVectors( x, y ):
    """Return the signed angle between the 2-D vectors x and y.

    The sign follows the cross product x[0]*y[1] - x[1]*y[0], so a
    counter-clockwise turn from x to y is positive. The result lies between
    -pi and pi; exactly opposite vectors give -pi. If either vector has zero
    length the function returns 0.
    """
    lengthX = numpy.sqrt(x[0]**2 + x[1]**2)
    if lengthX == 0:
        return 0
    lengthY = numpy.sqrt(y[0]**2 + y[1]**2)
    if lengthY == 0:
        return 0

    scale = lengthX * lengthY
    sine = ( x[0]*y[1] - x[1]*y[0] ) / scale
    cosine = ( x[0]*y[0] + x[1]*y[1] ) / scale

    # Rounding can push the sine marginally outside [-1, 1].
    if sine > 1:
        sine = 1
    elif sine < -1:
        sine = -1

    # Right half-plane: arcsin already covers [-pi/2, pi/2].
    if cosine >= 0:
        return numpy.arcsin( sine )

    # Upper-left quadrant: mirror the arcsin result about pi/2.
    if sine > 0:
        return numpy.pi - numpy.arcsin( sine )

    # Lower-left quadrant (cosine is negative here, so only the lower
    # bound can be exceeded by rounding).
    if cosine < -1:
        cosine = -1
    return -( numpy.arccos( cosine ) )


# Function for plotting the raw data of the segments
def plotSegmentData( segments ):
    """Draw a 3x2 overview figure for a list of trip segments.

    Each segment is a two-column array of consecutive points. The panels
    show: the path (column 1 against column 0), column 1 against the sample
    index, the sample index against column 0, the turning angle between
    consecutive steps in degrees, the step length scaled by 3.6, and the
    change of that scaled step length divided by 3.6 again.
    The 3.6 factor presumably converts m/s to km/h for one sample per
    second -- TODO confirm against the data source.
    """
    pointParts = [ numpy.array([]).reshape([0,2]) ]
    angleParts = [ numpy.array([]) ]
    speedParts = [ numpy.array([]) ]
    accelerationParts = [ numpy.array([]) ]

    for segment in segments:
        steps = numpy.diff( segment, axis = 0 )
        segmentSpeeds = 3.6 * numpy.apply_along_axis( lambda d: numpy.sqrt( d[0]**2+d[1]**2), 1, steps )

        # Angles and accelerations need at least two consecutive steps.
        if len(segmentSpeeds) > 1:
            segmentAccelerations = numpy.diff( segmentSpeeds ) / 3.6
            turns = numpy.zeros( len(steps) - 1 )
            for k, (before, after) in enumerate( zip( steps[:-1], steps[1:] ) ):
                turns[k] = angleOfVectors(before,after) * 180 / numpy.pi
            angleParts.append( turns )
            accelerationParts.append( segmentAccelerations )

        speedParts.append( segmentSpeeds )
        pointParts.append( segment )

    tripData = numpy.vstack( pointParts )
    tAngles = numpy.hstack( angleParts )
    tSpeed = numpy.hstack( speedParts )
    tAcceleration = numpy.hstack( accelerationParts )

    pointIndex = numpy.arange(0,len(tripData))
    angleIndex = numpy.arange(0,len(tAngles))
    speedIndex = numpy.arange(0,len(tSpeed))
    accelerationIndex = numpy.arange(0,len(tAcceleration))

    # One entry per panel: subplot position and the layers drawn on it,
    # each layer being (x, y, format, extra plot options).
    panels = [
        ( 321, [ ( tripData[0:,0], tripData[0:,1], 'r.', {'alpha': 0.1} ) ] ),
        ( 322, [ ( pointIndex, tripData[0:,1], 'b.', {'alpha': 0.1} ) ] ),
        ( 323, [ ( tripData[0:,0], pointIndex, 'b.', {'alpha': 0.1} ) ] ),
        ( 324, [ ( angleIndex, tAngles, 'r-', {} ),
                 ( angleIndex, tAngles, 'r.', {} ) ] ),
        ( 325, [ ( speedIndex, tSpeed, 'g-', {} ),
                 ( speedIndex, tSpeed, 'g.', {'alpha': 0.15} ) ] ),
        ( 326, [ ( accelerationIndex, tAcceleration, 'b-', {} ),
                 ( accelerationIndex, tAcceleration, 'b.', {'alpha': 0.15} ) ] ),
    ]

    # Draw the raw data
    plt.figure( figsize = (10,10), facecolor='lightblue')
    for position, layers in panels:
        ax = plt.subplot(position)
        for xs, ys, fmt, options in layers:
            ax.plot( xs, ys, fmt, **options )
        ax.grid(True)
    return
    

## Function selecting the middle x% of the data for a given column

In [None]:
def selectVariable(data, column, percentage = 99 ):
    """Return the central `percentage` percent of one column of `data`.

    data       -- 2-D array; only data[:, column] is used.
    column     -- index of the column to select from.
    percentage -- size of the central part to keep; the remainder is split
                  evenly between the low and the high tail.

    Returns a 1-D array with the column values lying strictly between the
    lower and upper percentile cut-offs (values equal to a cut-off are
    dropped).
    """
    # Divide by 2.0 so the tail size is never truncated by integer
    # division (with the default of 99 that would give 0 and trim nothing).
    tail = (100 - percentage) / 2.0
    values = data[0:,column]
    cut_low = numpy.percentile(values, tail )
    cut_high = numpy.percentile(values, 100-tail)
    return values[(values > cut_low) & (values < cut_high)]

## Function for rebinning a histogram

In [None]:
def reBin( originalBins, numberOfBins ):
    """Return numberOfBins + 1 equally spaced bin edges.

    The edges span the range from the first to the last entry of
    originalBins. The final edge is the original last entry itself, so it
    is not affected by rounding in the accumulated bin width.
    """
    first, last = originalBins[0], originalBins[-1]
    width = ( last - first ) / numberOfBins
    edges = [ first + k * width for k in range(numberOfBins) ]
    edges.append( last )
    return edges

## Function for normalising an array after removing the outliers

In [None]:
def normaliseData( x ):
    """Standardise a 1-D array after discarding NaNs and extreme values.

    The non-NaN values are sorted and the lowest and highest 0.25% are cut
    off. Mean and standard deviation are computed from what remains, and
    every input value inside the remaining [min, max] range is returned as
    (value - mean) / std. NaN inputs and values outside that range come
    back as NaN.

    If nothing survives the trimming (empty input, all NaN, or too few
    samples) the result is all NaN.

    x -- 1-D array-like of numbers.
    Returns a 1-D float array of the same length as x.
    """
    x = numpy.asarray( x, dtype = float )
    orderedX = numpy.sort(x[ ~ numpy.isnan( x ) ])

    # Slice bounds must be integers; truncate the fractional positions.
    lowIndex = int( 0.0025*len(orderedX) )
    highIndex = int( 0.9975*len(orderedX) )
    orderedX = orderedX[lowIndex:highIndex]
    if len(orderedX) == 0:
        return numpy.full( len(x), numpy.nan )

    m = numpy.mean( orderedX )
    s = numpy.std( orderedX )
    minValue = orderedX[0]
    maxValue = orderedX[-1]

    # Comparisons with NaN are False, so NaN inputs fall out of `inRange`.
    with numpy.errstate( invalid = 'ignore' ):
        inRange = ( x >= minValue ) & ( x <= maxValue )
    return numpy.where( inRange, ( x - m ) / s, numpy.nan )

## Function for performing a Principal Component Analysis (on normalised data)

In [None]:
def findPCA( x ):
    """Principal component analysis of the rows of x.

    x -- 2-D array of shape (nSamples, dimensions), one sample per row.

    Returns a tuple (v, w, explained_variance_ratio):
      v -- 1-D array with the absolute eigenvalues of the sample covariance
           matrix, largest first;
      w -- list of the matching eigenvectors (1-D arrays of length
           `dimensions`), in the same order as v;
      explained_variance_ratio -- v divided by its sum.
    """
    (nSamples, dimensions) = x.shape

    # Sample covariance matrix from the scatter matrix of the centred data
    meanVector = x.mean( axis = 0 )
    centred = x - meanVector
    covariance = centred.T.dot( centred ) / ( nSamples - 1 )

    # The covariance matrix is symmetric, so eigh applies; it returns the
    # eigenvectors as the COLUMNS of eig_vec.
    eig_val, eig_vec = numpy.linalg.eigh( covariance )

    # Order by decreasing magnitude. Sorting indices (rather than
    # (value, vector) pairs) never has to compare two vectors when
    # eigenvalues tie.
    magnitudes = numpy.abs( eig_val )
    order = numpy.argsort( magnitudes )[::-1]
    v = magnitudes[order]
    w = [ eig_vec[:, k].copy() for k in order ]
    explained_variance_ratio = v / v.sum()
    return (v, w, explained_variance_ratio)

## Function to plot the speed, acceleration and direction values

In [None]:
def plotSpeedAccelerationDirection( dataReader, driverId, tripId ):
    """Draw a 3x3 overview of one trip's speed, acceleration and direction.

    dataReader -- object providing speedAccelerationDirectionValues(driverId,
                  tripId), whose result is indexed as an (n, 3) array with
                  the columns speed, acceleration, direction.
    driverId, tripId -- identify the trip to plot.

    Top row: each quantity against the sample index.
    Middle row: pairwise scatter plots with a least-squares line and its R^2.
    Bottom row: density-normalised histograms (30 bins) of each quantity.
    """
    values = dataReader.speedAccelerationDirectionValues(driverId, tripId)

    labels = ( 'Speed $m/s$', 'Acceleration $m/s^2$', 'Direction $rad$' )
    colours = ( 'g', 'b', 'r' )
    faceColours = ( 'green', 'blue', 'red' )

    plt.figure(figsize=(10,10), facecolor='lightblue')

    # Top row: the three series against the sample index
    for column in range(3):
        ax = plt.subplot( 3, 3, 1 + column )
        ax.plot( numpy.arange(0,len(values) ), values[0:,column], colours[column] + '-' )
        ax.set_ylabel( labels[column] )
        ax.grid(True)

    # Middle row: pairwise scatter plots with a linear regression
    columnPairs = ( (0, 1), (0, 2), (1, 2) )
    for panel, (xColumn, yColumn) in enumerate( columnPairs ):
        colour = colours[panel]
        ax = plt.subplot( 3, 3, 4 + panel )
        ax.plot( values[0:,xColumn], values[0:,yColumn], colour + '.', alpha=0.3 )
        ax.set_xlabel( labels[xColumn] )
        ax.set_ylabel( labels[yColumn] )
        ax.grid(True)
        slope, intercept, r_value, p_value, std_err = scipy.stats.linregress( values[0:,xColumn], values[0:,yColumn] )
        r2 = r_value**2
        # Place the R^2 label relative to the tick range chosen for the
        # scatter plot, and draw the fitted line across the x ticks.
        xticks = ax.xaxis.get_majorticklocs()
        yticks = ax.yaxis.get_majorticklocs()
        xorig = xticks[0] + 0.2 * (xticks[-1] - xticks[0])
        yorig = yticks[0] + 0.8 * (yticks[-1] - yticks[0])
        ax.text(xorig, yorig,'$R^2$ : ' + str(numpy.around(r2,3)), ha='left')
        ax.plot( xticks, xticks * slope + intercept, colour + '-')

    # Bottom row: histograms. `density=True` replaces the `normed=True`
    # keyword, which current matplotlib no longer accepts.
    for column in range(3):
        ax = plt.subplot( 3, 3, 7 + column )
        ax.hist( values[0:,column], 30, density=True, facecolor=faceColours[column] )
        ax.set_xlabel( labels[column] )
        ax.grid(True)

    return

## Main script

Set up matplotlib and other packages

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import scipy.stats
import sklearn.decomposition

Create the connection to the server

In [None]:
# Create the reader (first argument: pipe the replies are read from, second:
# pipe the commands are written to) and fetch the list of driver ids.
dataReader = DataReader('cpptopythonpipe','pythontocpppipe')
driverIds = dataReader.driverList()

In [None]:
# Metrics table of all trips: one row per record, one column per name in
# variableNames.
(metricData, variableNames) = dataReader.allTripMetrics()

In [None]:
driverId = 1

# Fail early on an unknown driver. ValueError (rather than BaseException)
# so that ordinary `except Exception` handling still sees it.
if driverId not in driverIds:
    raise ValueError("Invalid driver id")
# Metrics table restricted to the selected driver
(driverMetrics, variableNames) = dataReader.driverTripMetrics(driverId)

In [None]:
# Plot the trip metrics: for every metric, the distribution over all trips
# (background) overlaid with the distribution for the selected driver.

numberOfDriverBins = 20
numberOfBackgroundBins = 150
numberOfMetrics = len(variableNames)
numberOfImageColumns = 4
# Force a floating-point division so that ceil() really rounds up.
numberOfImageRows = int( numpy.ceil(numberOfMetrics / float(numberOfImageColumns) ))
plt.figure( figsize = (numberOfImageColumns * 4, numberOfImageRows * 4) )
for i in range(numberOfMetrics):
    ax = plt.subplot( numberOfImageRows, numberOfImageColumns, i + 1 )
    # Background: sorted non-NaN values with the lowest and highest 0.25%
    # removed (slice bounds must be integers).
    generalData = numpy.sort(metricData[ ~ numpy.isnan( metricData[0:,i] ) ][0:,i])
    generalData = generalData[int(0.0025*len(generalData)) : int(0.9975*len(generalData))]
    (v,b,o) = plt.hist(generalData, bins=numberOfBackgroundBins, density=True, alpha=0.3)
    
    # Driver: coarser bins spanning the same range as the background edges
    driverMetricData = driverMetrics[ ~ numpy.isnan( driverMetrics[0:,i] ) ][0:,i]
    (v,b,o) = plt.hist(driverMetricData,
                       bins=b[0] + numpy.arange(numberOfDriverBins+1)*(b[-1]-b[0])/numberOfDriverBins,
                       density=True, alpha=0.3)
    t = ax.set_title( variableNames[i] )

In [None]:
# Perform PCA analysis after cleaning and normalising
# Each column is normalised independently; rows that contain a NaN in any
# column are dropped, and then the first two columns are left out.
normalisedData = numpy.apply_along_axis(normaliseData,0,metricData)
normalisedData=normalisedData[~numpy.any( numpy.isnan(normalisedData), axis=1 )][0:,2:]
numberOfPrincipalComponents = 10
pca = sklearn.decomposition.PCA(n_components = numberOfPrincipalComponents )
# `res` is not used by any of the cells visible here.
res = pca.fit(normalisedData)

# Transform the data
transformedData = pca.transform(normalisedData)
# NOTE(review): the driver's metrics are normalised with their own
# mean/std (normaliseData derives them from the array it is given), not
# with the statistics of the data the PCA was fitted on -- confirm that
# this is intended.
normalisedDataD = numpy.apply_along_axis(normaliseData,0,driverMetrics)
normalisedDataD=normalisedDataD[~numpy.any( numpy.isnan(normalisedDataD), axis=1 )][0:,2:]
transformedDataD = pca.transform(normalisedDataD)

In [None]:
# Histogram of every principal component: all trips first, then the selected
# driver drawn on top. `density=True` replaces the `normed=True` keyword,
# which current matplotlib no longer accepts.
plt.figure( figsize = (numberOfImageColumns * 4, numberOfImageRows * 4), facecolor='lightblue')
for i in range(numberOfPrincipalComponents):
    ax = plt.subplot( numberOfImageRows, numberOfImageColumns, i + 1 )
    h = plt.hist(transformedData[0:,i], numberOfBackgroundBins, density=True)
    h = plt.hist(transformedDataD[0:,i], numberOfDriverBins, density=True)


In [None]:
# Number of rows in the transformed all-trip data
len(transformedData)

In [None]:
# Plot one trip twice: once from its raw data treated as a single segment,
# once from the segments returned by the server.
driverId=2811
tripId=115
rawData=dataReader.rawData(driverId,tripId)
segmentData = dataReader.segmentData(driverId,tripId)
plotSegmentData( [rawData,])
plotSegmentData(segmentData)

In [None]:
# Speed, acceleration and direction: time series, pairwise scatter plots
# and histograms for a single trip
driverId = 1
tripId = 2
plotSpeedAccelerationDirection( dataReader, driverId, tripId )

In [None]:
# Send the "exit" command to the server
dataReader.exit()