In [1]:
import pandas as pd
import numpy as np
import csv
import os
from datetime import datetime
import matplotlib.pyplot as plt
import random
import seaborn as sns

In [2]:
#Notebook display limits: show up to 999 columns and 999 rows before pandas truncates the output.
#To also show the full content of every cell, uncomment the next line:
#pd.set_option('display.max_colwidth', None)
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 999)

## I. Import the transformed datasets

In [3]:
#Load the two input CSVs into dataframes and list the columns each one contains.
#The consolidated file uses its first column as the index; the base file is read as-is.
consolidate = pd.read_csv('rci_allfactors_consolidated.csv', index_col=0)
base = pd.read_csv('Det_curve_set2.csv')

for loaded_frame in (consolidate, base):
    print(loaded_frame.columns)

Index(['feature_id', 'survey_feat_end', 'xsp_name', 'mean', 'mean.1', 'mean.2',
       'mean.3', 'mean.4', 'mean.5', 'mean.6', 'mean.7', 'mean.8', 'mean.9',
       'mean.10', 'mean.11', 'mean.12', 'mean.13', 'mean.14', 'max', 'max.1',
       'max.2', 'max.3', 'max.4', 'max.5', 'max.6', 'max.7', 'max.8', 'max.9',
       'max.10', 'max.11', 'max.12', 'max.13', 'max.14', 'std', 'std.1',
       'std.2', 'std.3', 'std.4', 'std.5', 'std.6', 'std.7', 'std.8', 'std.9',
       'std.10', 'std.11', 'std.12', 'std.13', 'std.14'],
      dtype='object')
Index(['Unnamed: 0', 'feature_id', 'FinancialY', 'sub_obs_val_max',
       'sub_obs_val_std', 'Carriageway Scheme', 'Drainage', 'Drainage CAT 2',
       'Localised Patching', 'Long Term Structural Maintenance',
       'Major Patching', 'Microasphalt', 'Reconstruction', 'Recycling',
       'Super Cat 2', 'Surface Dressing', 'Surface Inlay', 'Surface Overlay',
       'Surface Preservative', 'Thin Surfacing', 'Years_since', 'CCD1', 'CCD3',
       'CCD4'

### II. Cleaning the Consolidated set

In [4]:
#Keep only the mean columns: drop every max/std aggregate ('max', 'max.1', ..., 'std.14').
#The frame is reassigned rather than mutated in place and missing labels are ignored,
#so re-running this cell after the columns are already gone is a no-op instead of a KeyError.
suffixes = [''] + ['.' + str(i) for i in range(1, 15)]
non_mean_columns = [stat + suffix for stat in ('max', 'std') for suffix in suffixes]
consolidate = consolidate.drop(columns=non_mean_columns, errors='ignore')

In [7]:
#Display the remaining column labels to confirm that the max/std columns were dropped
consolidate.columns

Index(['feature_id', 'survey_feat_end', 'xsp_name', 'mean', 'mean.1', 'mean.2',
       'mean.3', 'mean.4', 'mean.5', 'mean.6', 'mean.7', 'mean.8', 'mean.9',
       'mean.10', 'mean.11', 'mean.12', 'mean.13', 'mean.14'],
      dtype='object')

In [8]:
#Relabel the columns (positional: the three identifier columns first, then one code per mean column),
#then discard rows that repeat the header and rows without a feature ID.
identifier_columns = ['feature_id', 'survey_feat_end', 'xsp_name']
parameter_codes = ['LCRV', 'LCTM', 'LEDR', 'LFAL', 'LGRD',
                   'LLRT', 'LLTD', 'LLTM', 'LLTX', 'LRRT',
                   'LRTM', 'LRTC', 'LRTV', 'LV10', 'LV3']
consolidate.columns = identifier_columns + parameter_codes

is_header_row = consolidate['feature_id'] == 'feature_id'
has_feature_id = consolidate['feature_id'].notna()
consolidate = consolidate.loc[~is_header_row & has_feature_id]

In [9]:
#Renumber the rows 0..n-1; the previous index is discarded rather than kept as a column
consolidate = consolidate.reset_index(drop=True)

In [10]:
#Remove a leading "UKP_" or "UKP/" from the Feature ID, then use "/" as the only separator.
#str.lstrip('UKP_') treats its argument as a SET of characters, so it would also eat the start
#of the real ID whenever that begins with U, K, P, "_" or "/" (e.g. 'UKP_P12' -> '12').
#An anchored regex removes exactly the prefix and nothing else.
feature_ids = consolidate['feature_id'].str.replace(r'^UKP[_/]', '', regex=True)
consolidate['feature_id'] = feature_ids.str.replace('_', '/', regex=False)

In [11]:
#Derive the financial year from the survey end date: parse the dates, then convert them
#to annual periods whose year ends in March ('A-MAR')
survey_end_dates = pd.to_datetime(consolidate['survey_feat_end'])
consolidate['FinancialY'] = survey_end_dates.dt.to_period('A-MAR')

In [12]:
#Write the cleaned factors to CSV (the index is written as the first column).
#NOTE(review): this is an absolute, user-specific Windows path - the cell only runs on a machine
#where this folder exists; consider a path relative to the notebook, like the input files.
clean_factors_path = r'C:\Users\J FernandezGomez\Jupyter Notebooks\7_SecondPhase\RCI_Scanner\factors_clean.csv'
consolidate.to_csv(clean_factors_path)