# Data pre-processing for dublin-bus datasets

In [1]:
import os
import sys
import pandas as pd
import progressbar

## Read data into a dataframe

In [2]:
header = ['Timestamp','Line_ID','Direction','Journey_Pattern_ID','Time_Frame','Vehicle_Journey_ID', \
         'Operator','Congestion','Lon','Lat','Delay','Block_ID','Vehicle_ID','Stop_ID','At_Stop']
data_dir = '../dublin_data/'
data_file = data_dir + 'siri.20130101.csv'
df = pd.read_csv(data_file, header=None, names=header)

In [3]:
items = os.listdir(data_dir)
items = [item for item in items if (item[:4]=='siri')]
# print (items)

In [4]:
# print df (first 10 rows)
df[0:100]

Unnamed: 0,Timestamp,Line_ID,Direction,Journey_Pattern_ID,Time_Frame,Vehicle_Journey_ID,Operator,Congestion,Lon,Lat,Delay,Block_ID,Vehicle_ID,Stop_ID,At_Stop
0,1356998403000000,747.0,0,07470001,2012-12-31,3493,SL,0,-6.236852,53.425327,-709,747006,40040,7411.0,0
1,1356998405000000,27.0,0,,2012-12-31,3883,RD,0,-6.233417,53.342232,0,27017,33521,395.0,0
2,1356998407000000,40.0,0,,2012-12-31,2226,HN,0,-6.278250,53.416683,0,40206,33142,6071.0,0
3,1356998407000000,7.0,0,00071003,2012-12-31,6106,D1,0,-6.231633,53.317768,0,7019,43004,3222.0,1
4,1356998411000000,747.0,0,07471001,2012-12-31,3531,SL,0,-6.254617,53.355484,-454,747007,40039,1445.0,0
5,1356998411000000,56.0,0,056A1001,2012-12-31,1830,RD,0,-6.233183,53.342201,0,56001,33488,2379.0,0
6,1356998417000000,25.0,0,025A0001,2012-12-31,2866,CD,0,-6.296867,53.347500,0,25007,33604,4604.0,0
7,1356998423000000,747.0,0,07470001,2012-12-31,3493,SL,0,-6.238668,53.425789,-687,747006,40040,7411.0,0
8,1356998425000000,27.0,0,,2012-12-31,3883,RD,0,-6.233400,53.342232,0,27017,33521,395.0,0
9,1356998427000000,4.0,0,,2012-12-31,4243,HN,0,-6.279000,53.416683,0,4001,43043,7226.0,0


## Main functions

In [5]:
# output a dataframe to specific file
def output(df,outfile):
    df.to_csv(outfile, index=False)

# keep columns by column names
def keep_cols(df,cols):
    drops = header[:]
    [drops.remove(i) for i in cols]
    ndf = df.drop(drops, axis=1)
    return ndf

# keep rows by condition
def keep_rows(df,col,satisfy):
    ndf = df[satisfy(df,col)]
    return ndf

In [6]:
# Return true if col value equal or larger then threshold
def larger(df,col,threshold=1):
    return df[col] >= threshold

## Tests

In [7]:
# test keep_column
cols = ['Timestamp','Line_ID','Vehicle_ID','Lon','Lat']
ndf = keep_cols(df,cols)
nndf = ndf[~ndf.Line_ID.isnull()]
vehicle_ids = set(nndf['Vehicle_ID'].values)

In [8]:
print (ndf.shape)
print (nndf.shape)

(776641, 5)
(776631, 5)


In [9]:
ldf = []
for line_id in vehicle_ids:
    sndf = ndf[ndf.Vehicle_ID == line_id]
    lsndf = sndf.copy()
    sndf.drop(sndf.index[-1],inplace=True)
    lsndf.drop(lsndf.index[0],inplace=True)
    sndf['toLon'] = lsndf['Lon'].values
    sndf['toLat'] = lsndf['Lat'].values
    ldf.append(sndf)
odf = pd.concat(ldf)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [11]:
odf

Unnamed: 0,Timestamp,Line_ID,Lon,Lat,Vehicle_ID,toLon,toLat
147667,1357041547000000,7.0,-6.232283,53.319000,43008,-6.232283,53.319000
148285,1357041594000000,7.0,-6.232283,53.319000,43008,-6.226750,53.315498
148576,1357041608000000,7.0,-6.226750,53.315498,43008,-6.223000,53.313900
149135,1357041649000000,7.0,-6.223000,53.313900,43008,-6.223000,53.313900
149233,1357041655000000,7.0,-6.223000,53.313900,43008,-6.223000,53.313900
149430,1357041667000000,7.0,-6.223000,53.313900,43008,-6.219767,53.310600
149723,1357041688000000,7.0,-6.219767,53.310600,43008,-6.219767,53.310600
149985,1357041708000000,7.0,-6.219767,53.310600,43008,-6.219767,53.310600
150095,1357041712000000,7.0,-6.219767,53.310600,43008,-6.213300,53.306999
150351,1357041728000000,7.0,-6.213300,53.306999,43008,-6.213300,53.306999
