In [326]:
# import libraries
import pandas as pd
import html5lib
import matplotlib.pyplot as plt
import numpy as np
import logging

# Weights applied to the heavy, medium and small truck counts when the
# "Economic Traffic" value of a link is computed:
#   Heavy Truck * Heavy_value + Medium Truck * Medium_value + Small Truck * Small_value
Heavy_value = 30
Medium_value = 25
Small_value = 16

In [327]:
def htm_import(roadname):
    """Load the traffic table of one road from its RMMS .htm export.

    Reads "RMMS/<roadname>.traffic.htm", takes the third table whose text
    matches ``roadname``, keeps its first 25 columns, gives them fixed names
    and adds a "Road name" column holding ``roadname``.

    Parameters
    ----------
    roadname : str
        Road identifier; used to build the file name and as the ``match``
        pattern for ``pandas.read_html``.

    Returns
    -------
    pandas.DataFrame
        One row per link with 25 data columns plus "Road name".

    Raises
    ------
    OSError
        If the .htm file cannot be opened.
    IndexError
        If fewer than three matching tables are found.
    ValueError
        If no table matches, or the table has fewer than 25 columns.
    """
    filename = "RMMS/{}.traffic.htm".format(roadname)

    # The context manager closes the file even when parsing fails.
    with open(filename, "rb") as f:
        # skiprows is a set, so this skips rows 0 and 5 of every parsed table
        # (the original literal {0,0,5} collapses to the same set).
        list_road = pd.read_html(f, skiprows={0, 5}, header=1, match=roadname)

    # The third matching table in the file is the one with the traffic data.
    df_current_road = list_road[2]

    # Keep only the first 25 columns. Per the original author's note the 26th
    # repeats the total AADT and the remainder are empty parsing artefacts.
    # Selecting by position is safe even if header labels are duplicated.
    df_current_road = df_current_road.iloc[:, :25].copy()

    #rename the columns
    df_current_road.columns = ["Link no","Link Name","Start LRP","Start Offset","Start Chainage","End LRP","End Offset","End Chainage","Link Length", "Heavy Truck","Medium Truck","Small Truck","Large Bus","Medium Bus","Micro Bus","Utility","Car","Auto Rickshaw","Motor Cycle","Bi-Cycle","Cycle Rickshaw","Cart","Motorized Total","Non Motorized Total","Total Traffic"]

    #include the roadname in the dataframe
    df_current_road['Road name'] = roadname
    return df_current_road



In [329]:
# Vehicle-class count columns that are averaged when a left/right link pair is merged.
_TRAFFIC_COLUMNS = [
    'Heavy Truck', 'Medium Truck', 'Small Truck',
    'Large Bus', 'Medium Bus', 'Micro Bus',
    'Utility',
    'Car', 'Auto Rickshaw', 'Motor Cycle', 'Bi-Cycle', 'Cycle Rickshaw', 'Cart',
    'Motorized Total', 'Non Motorized Total', 'Total Traffic',
]


def _split_link_side(link_no):
    """Split a link number such as 'N1-1L' into ('N1-1', 'L'); side is None without an L/R suffix."""
    if isinstance(link_no, str):
        text = link_no.strip()
        if text[-1:] in ('L', 'R'):
            return text[:-1], text[-1]
    return link_no, None


def process_traffic(df):
    """Clean the per-link traffic table and compute its "Economic Traffic".

    Steps, in order:

    * rows whose 'Heavy Truck' cell is the string 'NS' (no survey data) are
      logged and removed;
    * an "L" link immediately followed by the "R" link of the same base link
      number (e.g. N1-1L / N1-1R) is merged: every traffic count becomes the
      mean of the two rows, the "L" row is renamed to the base number
      (N1-1) and the "R" row is logged and removed;
    * "Economic Traffic" is computed for every remaining row as
      Heavy Truck*Heavy_value + Medium Truck*Medium_value + Small Truck*Small_value.

    Traffic count columns are converted to float; cells that cannot be read
    as numbers become NaN instead of being treated as strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Link no', 'Heavy Truck', 'Medium Truck' and
        'Small Truck'. An 'Economic Traffic' column is added if absent.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without the removed rows; the surviving rows keep their
        original index labels.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    result = df.copy()
    if 'Economic Traffic' not in result.columns:
        result['Economic Traffic'] = float('nan')

    # Per the original author's note, every row without data carries 'NS' in
    # this column. Record that before the counts are converted to numbers.
    missing = (result['Heavy Truck'] == 'NS').to_numpy()

    present = [name for name in _TRAFFIC_COLUMNS if name in result.columns]
    for name in present:
        result[name] = pd.to_numeric(result[name], errors='coerce').astype(float)
    counts = result[present].to_numpy(dtype=float, copy=True)

    link_values = result['Link no'].tolist()
    n_rows = len(result)
    dropped = set()  # positions (not index labels) of the rows to remove

    for i in range(n_rows):
        if missing[i]:
            dropped.add(i)
            logging.warning('%s link deleted due to no data', link_values[i])
            continue

        # A pair needs a following row that has data; this also protects the
        # last row from looking past the end of the table.
        if i + 1 >= n_rows or missing[i + 1]:
            continue

        base, side = _split_link_side(link_values[i])
        next_base, next_side = _split_link_side(link_values[i + 1])
        # Compare suffixes and base numbers, not substrings: a road name such
        # as 'R110' contains 'R' without being the right-hand side of a pair.
        if side == 'L' and next_side == 'R' and base == next_base:
            counts[i] = (counts[i] + counts[i + 1]) / 2
            dropped.add(i + 1)
            logging.warning('%s link deleted due to duplicate', link_values[i + 1])
            # e.g. N1-1R is removed and N1-1L is renamed N1-1
            link_values[i] = base

    result[present] = counts
    result['Link no'] = link_values

    # Computed once, after merging, so merged rows use the averaged counts.
    result['Economic Traffic'] = (
        result['Heavy Truck'] * Heavy_value
        + result['Medium Truck'] * Medium_value
        + result['Small Truck'] * Small_value
    )

    keep = [i for i in range(n_rows) if i not in dropped]
    # Positional selection keeps this correct for any index, not only 0..n-1.
    return result.iloc[keep].copy()
    #question: do we also want to rename any single R or L link numbers? Or is this okay? I think it is more informative this way...?

In [328]:
def average_lanes (a,b,segment_length, df):
    """Length-weighted average number of lanes over one road segment.

    Every lane record in ``df`` contributes its lane count multiplied by the
    length over which it overlaps the segment [a, b]; the sum is divided by
    ``segment_length``.

    Parameters
    ----------
    a, b : float
        Start and end chainage of the road segment (km, per the original
        notes).
    segment_length : float
        Length of the road segment, used as the averaging denominator.
    df : pandas.DataFrame
        Lane/width records with columns 'startChainage', 'endChainage' and
        'nrLanes'.

    Returns
    -------
    float or int
        The weighted average, or -1 when no lane record overlaps the segment
        or ``segment_length`` is not a positive number.
    """
    # A zero, negative or NaN length cannot serve as the denominator; report
    # it with the same -1 indicator as "no lane data found".
    if not segment_length > 0:
        return -1

    weighted_lanes = 0.0
    for start, end, lanes in zip(df['startChainage'], df['endChainage'], df['nrLanes']):
        # Length shared by the lane record and the segment. This single
        # expression covers full containment, partial overlap at either end,
        # and a segment lying entirely inside one lane record.
        overlap = min(b, end) - max(a, start)
        if overlap > 0:
            weighted_lanes += overlap * lanes

    if weighted_lanes == 0.0:
        return -1 #no lane data found that matched, -1 is the indicator of this error

    return weighted_lanes / segment_length

In [409]:
#MAIN BLOCK OF CODE, PART 1: Road Segments
# Builds one table with a row per road link: imports the traffic table of every
# road, cleans it, attaches the average number of lanes of each link, converts
# the traffic counts to counts per lane, and writes the result to
# 'data_traffic_road_segments.csv'.

# Vehicle-class count columns; these are the ones converted to per-lane density.
TRAFFIC_COUNT_COLUMNS = ["Heavy Truck","Medium Truck","Small Truck","Large Bus","Medium Bus","Micro Bus","Utility","Car","Auto Rickshaw","Motor Cycle","Bi-Cycle","Cycle Rickshaw","Cart","Motorized Total","Non Motorized Total","Total Traffic"]

#create the output data frame with corresponding columns for output
df_traffic_allroads = pd.DataFrame(columns =["Road name","Link no","Link Name","Start LRP","Start Offset","Start Chainage","End LRP","End Offset","End Chainage","Link Length", "Economic Traffic","No Lanes"] + TRAFFIC_COUNT_COLUMNS)


#get list of all the roads (the road names are the header row of the csv)
roadnames = pd.read_csv('_roadnames_list.csv') # need to have roadnames list in same folder path
roadnames_list = list(roadnames.columns.unique())
# To test on a single road, override the list here, e.g. roadnames_list = ['N5'].
# NB (original author): works for N1, but NAs from other small roads need
# cleaning before running on the entire road list.

# Import every road, then concatenate once. DataFrame.append was removed in
# pandas 2.0, and growing a frame inside a loop is quadratic.
road_frames = []
for roadname in roadnames_list:
    road_frames.append(htm_import(roadname))
# The empty template goes first so its column order defines the output layout.
df_traffic_allroads = pd.concat([df_traffic_allroads] + road_frames, ignore_index = True, sort = False)


#call function to process the data from htm: ie. remove duplicates, and find the criticality of each segment
df_traffic_allroads = process_traffic(df_traffic_allroads)

# Average number of lanes per road segment. The lane/width file of a road is
# (re)loaded whenever the road name differs from that of the previous row.
lane_averages = []
road_name_previous = None #nothing loaded yet
for i in range(len(df_traffic_allroads)):
    segment = df_traffic_allroads.iloc[i]

    road_name_current = segment['Road name']
    #check if segment of road is on a different road
    if road_name_previous is None or road_name_current != road_name_previous:
        filename = "RMMS/{}.widths.processed.txt".format(road_name_current)
        # close the file as soon as it has been parsed
        with open(filename, "rb") as f:
            df_lanes = pd.read_csv(f, sep='\t', lineterminator='\n')
        df_lanes.columns = ['roadNo', 'roadId', 'startChainage', 'endChainage', 'width','nrLanes']

    a = segment['Start Chainage']
    b = segment['End Chainage']
    length = segment['Link Length']

    lane_averages.append(average_lanes(a,b,length,df_lanes))

    #pass into memory for next comparison
    road_name_previous = road_name_current

# Assigned positionally, so the gaps process_traffic leaves in the index do not matter.
df_traffic_allroads['No Lanes'] = lane_averages

# Convert every traffic count into traffic density (count per lane).
df = df_traffic_allroads
lanes_per_segment = pd.to_numeric(df['No Lanes'], errors='coerce')
# A segment gets NaN densities when no lane data matched (-1 indicator) or
# when its 'Medium Truck' cell is still a string, i.e. not usable data.
no_density = (lanes_per_segment == -1) | df['Medium Truck'].map(lambda value: isinstance(value, str)).astype(bool)
for column in TRAFFIC_COUNT_COLUMNS:
    counts = pd.to_numeric(df[column], errors='coerce')
    df[column] = (counts / lanes_per_segment).mask(no_density)


#save output file
df.to_csv('data_traffic_road_segments.csv',sep= ',', header=True)








In [410]:
#MAIN BLOCK OF CODE, PART 2: Summary per Road

# Aggregates the road segment table to one row per road and writes it to
# 'summary_data_traffic_per_road.csv'.
#
# For every run of consecutive segments sharing a road name, each value column
# is accumulated as  value * Link Length  over the segments of that road.
# NOTE(review): the totals are length-weighted sums; nothing here divides by
# the road's total length, so they are not yet averages. The original author
# also flagged that the treatment of Economic Traffic should be checked.

df1 = df #road segment data frame (input), indexed with i

SUMMARY_VALUE_COLUMNS = ["Economic Traffic","No Lanes","Heavy Truck","Medium Truck","Small Truck","Large Bus","Medium Bus","Micro Bus","Utility","Car","Auto Rickshaw","Motor Cycle","Bi-Cycle","Cycle Rickshaw","Cart","Motorized Total","Non Motorized Total","Total Traffic"]

road_records = []      # one dict per road, in order of first appearance
current_road = None    # record of the road currently being accumulated
road_name_previous = None
segment_length = 0

for i in range(len(df1)):
    segment = df1.iloc[i]
    segment_length = segment['Link Length'] #record segment length each time

    road_name_current = segment['Road name']
    #check if segment of road is on a different road i.e. the next road in the df
    if current_road is None or road_name_current != road_name_previous:
        # Start a fresh record only when a new road begins, so the output has
        # exactly one row per road and no placeholder rows.
        current_road = {'Road name': road_name_current}
        for column in SUMMARY_VALUE_COLUMNS:
            current_road[column] = 0
        road_records.append(current_road)

    # Segments whose Economic Traffic is a string carry no usable data.
    if type(segment['Economic Traffic']) != str:
        for column in SUMMARY_VALUE_COLUMNS:
            current_road[column] += segment[column] * segment_length

    #pass into memory for next comparison
    road_name_previous = road_name_current

df2 = pd.DataFrame(road_records, columns=["Road name"] + SUMMARY_VALUE_COLUMNS)

#save output file
df2.to_csv('summary_data_traffic_per_road.csv',sep= ',', header=True)