In [2]:
import pandas as pd
import numpy as np
import csv
import os
from datetime import datetime
import matplotlib.pyplot as plt
import random
import seaborn as sns

In [3]:
#Activate the first option if whole column width is necessary and the second to make all columns visible
#pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 999)

## I. Import the transformed datasets

In [4]:
#Import the raw RCI score
rci = pd.read_csv('RCI_ZScore_v1.csv', index_col = 0)
base = pd.read_csv('Det_curve_set2.csv', index_col = 0)
factors = pd.read_csv('factors_clean.csv', index_col = 0)
print(rci.columns)
print(base.columns)

  mask |= (ar1 == a)


Index(['survey_obs_end', 'xsp_name', 'observe_type_name', 'survey_obs_value',
       'survey_feat_end', 'feature_id', 'survey_obs_notes', 'FinancialY',
       'ZScore'],
      dtype='object')
Index(['feature_id', 'FinancialY', 'sub_obs_val_max', 'sub_obs_val_std',
       'Carriageway Scheme', 'Drainage', 'Drainage CAT 2',
       'Localised Patching', 'Long Term Structural Maintenance',
       'Major Patching', 'Microasphalt', 'Reconstruction', 'Recycling',
       'Super Cat 2', 'Surface Dressing', 'Surface Inlay', 'Surface Overlay',
       'Surface Preservative', 'Thin Surfacing', 'Years_since', 'CCD1', 'CCD3',
       'CCD4', 'CFL', 'CFS1', 'CKD2', 'CKD3', 'CMD3', 'CMUD', 'CPH2', 'CPH3',
       'CPH4', 'CPH7', 'CSH1', 'DCF1', 'DCF2', 'DCF3', 'DPD1', 'DPD2', 'GUL',
       'NZU', 'PDAM', 'RICE', 'SFL', 'SPL', 'SPLE', 'Prev_RCI', 'SURF', 'BEDR',
       'Month', 'hierarchy', 'distance', 'road_type_hw', 'urban_rural',
       'surf_type', 'width', 'sub_obs_val_avg', 'weighted_av', 'perc85'],

### I. RCI Score Aggregation

In [5]:
#From the Raw set, separate per feature, financial year and lane
rci.loc[rci['survey_obs_value']==0].count()

survey_obs_end       718502
xsp_name             718502
observe_type_name    718502
survey_obs_value     718502
survey_feat_end      718502
feature_id           718502
survey_obs_notes     718502
FinancialY           718502
ZScore               718502
dtype: int64

In [6]:
rci['xsp_name'].value_counts()

Permanent Left Lane 1     1209289
Permanent Right Lane 1     712754
Name: xsp_name, dtype: int64

In [7]:
#Transform any RCI notes into Green, Red or Amber for easier processing
rci.loc[(rci.survey_obs_notes.str.contains('GREEN')),'survey_obs_notes']='GREEN'
rci.loc[(rci.survey_obs_notes.str.contains('AMBER')),'survey_obs_notes']='AMBER'
rci.loc[(rci.survey_obs_notes.str.contains('RED')),'survey_obs_notes']='RED'

In [8]:
#Aggregate RCI Score considering Feature ID, Financial Year and Lane
prueba2 = pd.pivot_table(rci, values=['survey_obs_value'], 
                       index=['feature_id','FinancialY','xsp_name'],
                    columns=['observe_type_name'], aggfunc = [np.mean,max,np.std])
rci_agg = prueba2.reset_index(level = ['feature_id', 'FinancialY','xsp_name'])
rci_agg.columns = ['feature_id', 'FinancialY','lane', 'avg_score_lane', 'sub_obs_val_max', 'sub_obs_val_std']
rci_agg.shape

(26766, 6)

In [9]:
rci_agg.loc[(rci_agg.lane.str.contains('Left')),'lane']='LEFT'
rci_agg.loc[(rci_agg.lane.str.contains('Right')),'lane']='RIGHT'

In [10]:
rci_agg.to_csv(r'C:\Users\J FernandezGomez\Jupyter Notebooks\7_SecondPhase\RCI_Scanner\rci_aggegated.csv')

In [37]:
#CONSIDER REMOVING ELEMENTS THAT ARE ABOVE A CERTAIN THRESHOLD
rci.loc[rci['ZScore']>5].count()

survey_obs_end       2415
xsp_name             2415
observe_type_name    2415
survey_obs_value     2415
survey_feat_end      2415
feature_id           2415
survey_obs_notes     2415
FinancialY           2415
ZScore               2415
dtype: int64

In [38]:
rci.loc[rci['feature_id']=='A41/002']

Unnamed: 0,survey_obs_end,xsp_name,observe_type_name,survey_obs_value,survey_feat_end,feature_id,survey_obs_notes,FinancialY,ZScore
0,4.0,Permanent Left Lane 1,SCANNER Index - Bin Identifier,0.0,13/09/2008 23:59,A41/002,GREEN,2009,-0.149071
1,14.0,Permanent Left Lane 1,SCANNER Index - Bin Identifier,0.0,13/09/2008 23:59,A41/002,GREEN,2009,-0.149071
2,24.0,Permanent Left Lane 1,SCANNER Index - Bin Identifier,0.0,13/09/2008 23:59,A41/002,GREEN,2009,-0.149071
3,34.0,Permanent Left Lane 1,SCANNER Index - Bin Identifier,0.0,13/09/2008 23:59,A41/002,GREEN,2009,-0.149071
4,44.0,Permanent Left Lane 1,SCANNER Index - Bin Identifier,0.0,13/09/2008 23:59,A41/002,GREEN,2009,-0.149071
5,54.0,Permanent Left Lane 1,SCANNER Index - Bin Identifier,0.0,13/09/2008 23:59,A41/002,GREEN,2009,-0.149071
6,64.0,Permanent Left Lane 1,SCANNER Index - Bin Identifier,0.0,13/09/2008 23:59,A41/002,GREEN,2009,-0.149071
7,74.0,Permanent Left Lane 1,SCANNER Index - Bin Identifier,0.0,13/09/2008 23:59,A41/002,GREEN,2009,-0.149071
8,84.0,Permanent Left Lane 1,SCANNER Index - Bin Identifier,0.0,13/09/2008 23:59,A41/002,GREEN,2009,-0.149071
9,94.0,Permanent Left Lane 1,SCANNER Index - Bin Identifier,0.0,13/09/2008 23:59,A41/002,GREEN,2009,-0.149071


In [39]:
rci_agg.head(500)

Unnamed: 0,feature_id,FinancialY,lane,avg_score_lane,sub_obs_val_max,sub_obs_val_std
0,A10/104,2009,LEFT,3.940571,33.86,9.027565
1,A10/104,2009,RIGHT,8.481471,77.92,19.711916
2,A10/104,2010,LEFT,7.363529,80.0,15.936653
3,A10/104,2010,RIGHT,17.661429,96.67,27.412318
4,A10/104,2011,LEFT,2.392424,52.79,9.425704
5,A10/104,2011,RIGHT,9.590909,96.08,22.166926
6,A10/104,2012,LEFT,2.563235,27.62,6.486717
7,A10/104,2012,RIGHT,10.804706,80.0,23.167025
8,A10/104,2013,LEFT,9.592353,79.88,19.7871
9,A10/104,2013,RIGHT,18.178485,120.0,29.630822


#### II. Merge the sets

In [40]:
#Merge the RCI score per lane with the general factors.
#THE LIMITATION IS THAT EVENTS AND ALL OTHERS WILL NOT BE DIVIDED.

In [43]:
#Drop Max and standard deviation values for merging:
rci_agg.drop(columns = ['sub_obs_val_max','sub_obs_val_std'], inplace = True)

In [44]:
#Split the RCI dataset into left and right
left = rci_agg.loc[rci_agg['lane']=='LEFT']
right = rci_agg.loc[rci_agg['lane']=='RIGHT']

In [45]:
left.head()

Unnamed: 0,feature_id,FinancialY,lane,avg_score_lane
0,A10/104,2009,LEFT,3.940571
2,A10/104,2010,LEFT,7.363529
4,A10/104,2011,LEFT,2.392424
6,A10/104,2012,LEFT,2.563235
8,A10/104,2013,LEFT,9.592353


In [46]:
#Add the RCI score per lane to the general set. Obtaining one set per side:
lscore = base.merge(left,'left',on=['feature_id', 'FinancialY'],left_index = True)
rscore = base.merge(right,'left',on=['feature_id', 'FinancialY'],left_index = True)
lscore.reset_index(drop= True, inplace = True)
rscore.reset_index(drop= True, inplace = True)

In [51]:
lscore['avg_score_lane'].isnull().sum()
rscore['avg_score_lane'].isnull().sum()

12974

In [54]:
lscore.head(500)

Unnamed: 0,feature_id,FinancialY,sub_obs_val_max,sub_obs_val_std,Carriageway Scheme,Drainage,Drainage CAT 2,Localised Patching,Long Term Structural Maintenance,Major Patching,Microasphalt,Reconstruction,Recycling,Super Cat 2,Surface Dressing,Surface Inlay,Surface Overlay,Surface Preservative,Thin Surfacing,Years_since,CCD1,CCD3,CCD4,CFL,CFS1,CKD2,CKD3,CMD3,CMUD,CPH2,CPH3,CPH4,CPH7,CSH1,DCF1,DCF2,DCF3,DPD1,DPD2,GUL,NZU,PDAM,RICE,SFL,SPL,SPLE,Prev_RCI,SURF,BEDR,Month,hierarchy,distance,road_type_hw,urban_rural,surf_type,width,sub_obs_val_avg,weighted_av,perc85,lane,avg_score_lane
0,B486/20,2009,50.0,17.100354,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,GYCK,GYCK,0.0,SD Secondary Distributor,98.0,Single 2-Lane Carriageway,Urban,BITM,8.34,27.968,24.046409,47.0845,,
1,B486/20,2010,128.7,30.767874,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,GYCK,GYCK,0.0,SD Secondary Distributor,98.0,Single 2-Lane Carriageway,Urban,BITM,8.34,82.246,72.053646,108.7225,LEFT,82.246
2,B486/20,2011,50.0,14.08133,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,128.7,GYCK,GYCK,3.0,SD Secondary Distributor,98.0,Single 2-Lane Carriageway,Urban,BITM,8.34,29.785455,20.360878,41.665,,
3,B486/20,2012,104.18,28.18549,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,GYCK,GYCK,0.0,SD Secondary Distributor,98.0,Single 2-Lane Carriageway,Urban,BITM,8.34,43.430909,42.949916,66.27,,
4,B486/20,2013,54.74,14.952653,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,104.18,GYCK,GYCK,0.0,SD Secondary Distributor,98.0,Single 2-Lane Carriageway,Urban,BITM,8.34,14.207273,16.742961,19.1,LEFT,14.207273
5,B486/20,2014,50.0,17.027742,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,54.74,GYCK,GYCK,0.0,SD Secondary Distributor,98.0,Single 2-Lane Carriageway,Urban,BITM,8.34,26.135,22.786151,45.9155,,
6,B486/20,2015,88.51,25.32863,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,GYCK,GYCK,0.0,SD Secondary Distributor,98.0,Single 2-Lane Carriageway,Urban,BITM,8.34,35.518182,40.597503,57.215,LEFT,35.518182
7,B486/20,2016,37.3,11.32034,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,88.51,GYCK,GYCK,0.0,SD Secondary Distributor,98.0,Single 2-Lane Carriageway,Urban,BITM,8.34,17.731,12.056656,23.8345,,
8,B486/20,2017,49.99,15.808226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.3,GYCK,GYCK,0.0,SD Secondary Distributor,98.0,Single 2-Lane Carriageway,Urban,BITM,8.34,4.999,14.997,0.0,LEFT,4.999
9,B486/20,2018,91.76,26.588759,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49.99,GYCK,GYCK,0.0,SD Secondary Distributor,98.0,Single 2-Lane Carriageway,Urban,BITM,8.34,27.252,28.90189,37.557,,


In [None]:
#Fill null values with values from the previous year
rci_stack[2010].fillna(rci_stack[2009], inplace=True)
rci_stack[2011].fillna(rci_stack[2010], inplace=True)
rci_stack[2012].fillna(rci_stack[2011], inplace=True)
rci_stack[2013].fillna(rci_stack[2012], inplace=True)
rci_stack[2014].fillna(rci_stack[2013], inplace=True)
rci_stack[2015].fillna(rci_stack[2014], inplace=True)
rci_stack[2016].fillna(rci_stack[2015], inplace=True)
rci_stack[2017].fillna(rci_stack[2016], inplace=True)
rci_stack[2018].fillna(rci_stack[2017], inplace=True)
rci_stack[2019].fillna(rci_stack[2018], inplace=True)

In [50]:
#initial.loc[initial['feature_id']=='A4251/270']
initial.head(100)

NameError: name 'initial' is not defined

In [75]:
initial['LCRV'].isnull().sum()
#initial.shape

467

In [76]:
initial.loc[initial['LCRV'].isnull()]

Unnamed: 0,feature_id,FinancialY,sub_obs_val_max,sub_obs_val_std,Carriageway Scheme,Drainage,Drainage CAT 2,Localised Patching,Long Term Structural Maintenance,Major Patching,Microasphalt,Reconstruction,Recycling,Super Cat 2,Surface Dressing,Surface Inlay,Surface Overlay,Surface Preservative,Thin Surfacing,Years_since,CCD1,CCD3,CCD4,CFL,CFS1,CKD2,CKD3,CMD3,CMUD,CPH2,CPH3,CPH4,CPH7,CSH1,DCF1,DCF2,DCF3,DPD1,DPD2,GUL,NZU,PDAM,RICE,SFL,SPL,SPLE,Prev_RCI,SURF,BEDR,Month,hierarchy,distance,road_type_hw,urban_rural,surf_type,width,sub_obs_val_avg,weighted_av,perc85,survey_feat_end,LCRV,LCTM,LEDR,LFAL,LGRD,LLRT,LLTD,LLTM,LLTX,LRRT,LRTM,LRTC,LRTV,LV10,LV3
477,C153/10,2011,141.02,26.882215,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,125.51,WHCK,WHCK,0.0,L2 Local Access,561.0,Single 2-Lane Carriageway,Rural,BITM,11.71,57.033889,38.376807,67.8605,,,,,,,,,,,,,,,,
487,C74/150,2011,201.08,37.784937,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,161.38,GSG,LMBE,5.0,L1 Local Distributor,1151.0,Single 2-Lane Carriageway,Rural,BITM,5.69,28.125526,32.852215,66.787,,,,,,,,,,,,,,,,
547,C54/38,2011,185.51,34.194799,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,149.255,THAM,THAM,0.0,L1 Local Distributor,620.0,Single 2-Lane Carriageway,Urban,BITM,8.89,25.284032,21.902637,59.85,,,,,,,,,,,,,,,,
577,C51/50,2011,168.0,34.159969,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,GSG,WHCK,0.0,L2 Local Access,1769.0,Single 2-Lane Carriageway,Rural,BITM,6.02,31.9432,34.009848,69.72,,,,,,,,,,,,,,,,
597,C46/20,2011,146.4,31.856354,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,150.2,WHCK,WHCK,0.0,L1 Local Distributor,1141.0,Single 2-Lane Carriageway,Rural,BITM,4.85,29.210435,27.695181,59.76,,,,,,,,,,,,,,,,
607,C15/130,2011,113.6,34.585017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TILL,WHCK,0.0,L2 Local Access,414.0,Single 2-Lane Carriageway,Rural,BITM,4.55,28.476098,36.512857,73.33,,,,,,,,,,,,,,,,
677,C50/35,2011,141.0,36.060736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,173.0,GSG,WHCK,0.0,L2 Local Access,123.0,Single 2-Lane Carriageway,Rural,BITM,7.17,63.126154,63.829558,95.88,,,,,,,,,,,,,,,,
727,C165/155,2011,37.4,6.686618,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,58.7,WHCK,WHCK,0.0,L2 Local Access,1323.0,Single 2-Lane Carriageway,Urban,BITM,9.21,2.424722,2.991134,1.851,,,,,,,,,,,,,,,,
747,C46/210,2011,208.02,49.563116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,220.055,TILL,WHCK,0.0,L1 Local Distributor,1739.0,Single 2-Lane Carriageway,Rural,BITM,5.6,59.8824,61.904114,116.36,,,,,,,,,,,,,,,,
777,C165/10,2011,80.0,10.318741,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,70.0,WHCK,WHCK,0.0,L1 Local Distributor,1081.0,Single 2-Lane Carriageway,Rural,BITM,8.59,2.051927,0.185691,0.0,,,,,,,,,,,,,,,,
