# Data pre-processing for Dublin Bus datasets

In [55]:
import pandas as pd

## Read data into a dataframe

In [56]:
# Column labels applied to the CSV, in the order the fields appear in the file.
header = [
    'Timestamp',
    'Line_ID',
    'Direction',
    'Journey_Pattern_ID',
    'Time_Frame',
    'Vehicle_Journey_ID',
    'Operator',
    'Congestion',
    'Lon',
    'Lat',
    'Delay',
    'Block_ID',
    'Vehicle_ID',
    'Stop_ID',
    'At_Stop',
]
# header=None makes pandas treat the first line as data; `names` supplies the labels.
df = pd.read_csv('siri.20130101.csv', names=header, header=None)

In [57]:
# Preview the first 10 rows of df
df.head(10)

Unnamed: 0,Timestamp,Line_ID,Direction,Journey_Pattern_ID,Time_Frame,Vehicle_Journey_ID,Operator,Congestion,Lon,Lat,Delay,Block_ID,Vehicle_ID,Stop_ID,At_Stop
0,1356998403000000,747.0,0,07470001,2012-12-31,3493,SL,0,-6.236852,53.425327,-709,747006,40040,7411.0,0
1,1356998405000000,27.0,0,,2012-12-31,3883,RD,0,-6.233417,53.342232,0,27017,33521,395.0,0
2,1356998407000000,40.0,0,,2012-12-31,2226,HN,0,-6.27825,53.416683,0,40206,33142,6071.0,0
3,1356998407000000,7.0,0,00071003,2012-12-31,6106,D1,0,-6.231633,53.317768,0,7019,43004,3222.0,1
4,1356998411000000,747.0,0,07471001,2012-12-31,3531,SL,0,-6.254617,53.355484,-454,747007,40039,1445.0,0
5,1356998411000000,56.0,0,056A1001,2012-12-31,1830,RD,0,-6.233183,53.342201,0,56001,33488,2379.0,0
6,1356998417000000,25.0,0,025A0001,2012-12-31,2866,CD,0,-6.296867,53.3475,0,25007,33604,4604.0,0
7,1356998423000000,747.0,0,07470001,2012-12-31,3493,SL,0,-6.238668,53.425789,-687,747006,40040,7411.0,0
8,1356998425000000,27.0,0,,2012-12-31,3883,RD,0,-6.2334,53.342232,0,27017,33521,395.0,0
9,1356998427000000,4.0,0,,2012-12-31,4243,HN,0,-6.279,53.416683,0,4001,43043,7226.0,0


## Main functions

In [58]:
# write a dataframe to a specific file as CSV
def output(df,outfile):
    """Write `df` to `outfile` in CSV format, leaving out the index column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to serialise.
    outfile : str or file-like
        Destination handed straight to ``DataFrame.to_csv``.

    Returns
    -------
    None
    """
    df.to_csv(path_or_buf=outfile, index=False)

# keep columns by column names
def keep_cols(df,cols):
    """Return a new DataFrame containing only the columns of `df` named in `cols`.

    The selection is computed from `df`'s own columns, so the function works
    on any frame (including one that has already been reduced) rather than
    only on frames that carry every column of the raw file.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    cols : str or iterable of str
        Column name(s) to keep. A single string is treated as one name.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` restricted to the requested columns, in the order they
        appear in `df` (not the order given in `cols`).

    Raises
    ------
    KeyError
        If any requested name is not a column of `df`.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(cols, str):
        cols = [cols]
    wanted = set(cols)

    missing = wanted.difference(df.columns)
    if missing:
        raise KeyError('columns not found in dataframe: %s' % sorted(missing, key=str))

    # A boolean mask over df.columns preserves the frame's column order and
    # behaves sensibly even when the frame has duplicate column labels.
    return df.loc[:, df.columns.isin(wanted)]

# keep rows by condition
def keep_rows(df,col,satisfy,*args,**kwargs):
    """Return the rows of `df` selected by the predicate `satisfy`.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    col : str
        Column name handed to `satisfy`.
    satisfy : callable
        Called as ``satisfy(df, col, *args, **kwargs)``; its result is used
        to index `df` (typically a boolean Series with one entry per row).
    *args, **kwargs
        Extra arguments forwarded to `satisfy`, e.g. ``threshold=2`` for a
        threshold-based predicate. Omitting them gives the two-argument call.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` for which the predicate result is True; the original
        index labels are retained.
    """
    mask = satisfy(df, col, *args, **kwargs)
    return df[mask]

In [59]:
# Return True where the col value is equal to or larger than threshold
def larger(df,col,threshold=1):
    """Element-wise test ``df[col] >= threshold``.

    Despite the name, the comparison is inclusive: values equal to
    `threshold` also yield True.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to test.
    col : str
        Name of the column to compare.
    threshold : scalar, default 1
        Lower bound (inclusive).

    Returns
    -------
    The result of the ``>=`` comparison on ``df[col]``.
    """
    values = df[col]
    return values >= threshold

## Tests

In [60]:
# test keep_cols: reduce df to the Congestion, Lon and Lat columns
cols = ['Congestion', 'Lon', 'Lat']
ndf = keep_cols(df, cols=cols)

In [61]:
# Preview the first 10 rows of ndf
ndf.head(10)

Unnamed: 0,Congestion,Lon,Lat
0,0,-6.236852,53.425327
1,0,-6.233417,53.342232
2,0,-6.27825,53.416683
3,0,-6.231633,53.317768
4,0,-6.254617,53.355484
5,0,-6.233183,53.342201
6,0,-6.296867,53.3475
7,0,-6.238668,53.425789
8,0,-6.2334,53.342232
9,0,-6.279,53.416683


In [62]:
# test keep_rows: keep only rows whose Congestion value passes `larger` (default threshold)
nndf = keep_rows(ndf, col='Congestion', satisfy=larger)

In [63]:
# Preview the first 10 rows of nndf
nndf.head(10)

Unnamed: 0,Congestion,Lon,Lat
7026,1,-6.229633,53.354637
7081,1,-6.229633,53.354637
7197,1,-6.229633,53.354637
7262,1,-6.229633,53.354637
7305,1,-6.229633,53.354637
7402,1,-6.229633,53.354637
7459,1,-6.229633,53.354637
7596,1,-6.229633,53.354637
7704,1,-6.229633,53.354637
8584,1,-6.257792,53.349144
