In [1]:
# Libraries
import pandas as pd
import numpy as np
import random
import plotly
import plotly.express as px
import warnings
import statsmodels.api as sm

from collections import Counter
from collections import defaultdict
from statsmodels.formula.api import ols
from scipy import stats
from plotly.subplots import make_subplots
import plotly.graph_objects as go

warnings.filterwarnings('ignore')

In [2]:
# Load the three CSV extracts. Going by the file names, they cover the full
# 1992-2018 span, the 1992-2011 span (training) and the 2012-2018 span (ground truth).
NE, NE_train, NE_ground = (
    pd.read_csv(csv_name)
    for csv_name in (
        "nebraska1992-2018.csv",
        "nebraska1992-2011.csv",
        "nebraska2012-2018.csv",
    )
)

In [3]:
# Grouping
def create_timeseries(dataframe, key_column="structureNumber"):
    """Group a long-format table into one record of column lists per key.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table with one row per observation; it must contain ``key_column``.
    key_column : str, optional
        Column whose distinct values identify each group. Defaults to
        ``"structureNumber"``.

    Returns
    -------
    dict
        ``{key: {column_name: [values, ...]}}``. Every column of
        ``dataframe`` (including ``key_column`` itself) appears in each inner
        dict, holding that group's values as a list.

    Notes
    -----
    ``DataFrame.groupby`` is called with its defaults, so rows whose key is
    NaN are dropped and groups are visited in sorted key order.
    """
    # Plain dicts are used on purpose: looking up a missing key must raise
    # KeyError instead of silently creating an entry.
    return {
        key: {col: list(group[col]) for col in group.columns}
        for key, group in dataframe.groupby(key_column)
    }

# Build the per-structure time series for the full, training and ground-truth tables.
NE_ts, NE_train_ts, NE_ground_ts = map(
    create_timeseries, (NE, NE_train, NE_ground)
)

In [4]:
# Columns for which `recreate` keeps only the last value of each time series.
columns = [
    # identification and time
    'structureNumber', 'year', 'yearBuilt',
    # traffic, design and geometry
    'averageDailyTraffic', 'designLoad', 'numberOfSpansInMainUnit',
    'structureLength',
    # location and ownership
    'longitude', 'latitude', 'owner',
    # ('structureType' is currently not selected)
    # per-component scores
    'deckBDSScore', 'substructureBDSScore', 'superstructureBDSScore',
    'deckDeteriorationScore', 'substructureDeteriorationScore',
    'superstructureDeteriorationScore',
]

# Columns for which `recreate` keeps the whole list of values.
listColumns = ['deck', 'superstructure', 'substructure']

# Collapse each time series into a single row of values.
def recreate(timeseries, columns, listcolumns):
    """Build a column-oriented dict with one item per time series.

    Parameters
    ----------
    timeseries : mapping
        ``{key: {column_name: [values, ...]}}``, e.g. the output of
        ``create_timeseries``. The keys themselves are not used.
    columns : iterable of str
        Columns reduced to a scalar: only the last value of each list is kept.
    listcolumns : iterable of str
        Columns kept whole: the full list of values is stored.

    Returns
    -------
    collections.defaultdict
        ``{column_name: [one item per time series]}``, in the iteration order
        of ``timeseries``. It can be passed straight to ``pandas.DataFrame``.

    Raises
    ------
    KeyError
        If a (plain-dict) record has no entry for a requested column.
    IndexError
        If a column named in ``columns`` holds an empty list.
    """
    dfDict = defaultdict(list)
    for record in timeseries.values():
        for col in columns:
            dfDict[col].append(record[col][-1])

        # Iterate over the `listcolumns` argument; reading a notebook-level
        # global here would ignore what the caller passed in.
        for col in listcolumns:
            dfDict[col].append(record[col])
    return dfDict
        
# Maps raw column names to the display names used in the summary frames.
# Columns without an entry here keep their raw name when the map is applied;
# "structureType" is currently not mapped.
renameCols = {
    "structureNumber":                  "Structure Number",
    "year":                             "Year",
    "yearBuilt":                        "Year Built",
    "longitude":                        "Longitude",
    "latitude":                         "Latitude",
    "averageDailyTraffic":              "ADT",
    "structureLength":                  "Structure Length",
    "deckBDSScore":                     "deck BDS",
    "owner":                            "Owner",
    "superstructureBDSScore":           "superstructure BDS",
    "substructureBDSScore":             "substructure BDS",
    "deckDeteriorationScore":           "deck Det",
    "superstructureDeteriorationScore": "superstructure Det",
    "substructureDeteriorationScore":   "substructure Det",
    "deck":                             "Deck",
    "superstructure":                   "Superstructure",
    "substructure":                     "Substructure",
}

def _latest_snapshot(timeseries, scalar_cols, list_cols, rename_map):
    """Flatten time series into a frame with one row per series.

    Parameters
    ----------
    timeseries : mapping
        Per-structure records as accepted by ``recreate``.
    scalar_cols, list_cols : list of str
        Forwarded to ``recreate`` as its ``columns`` / ``listcolumns`` arguments.
    rename_map : dict
        Raw-name -> display-name mapping applied to the frame's columns. It
        must produce the ``'Year'`` and ``'Year Built'`` columns used below.

    Returns
    -------
    tuple
        ``(column_dict, frame)``: the dict returned by ``recreate`` and the
        DataFrame built from it, renamed and with an ``'Age'`` column added.
    """
    column_dict = recreate(timeseries, scalar_cols, list_cols)
    frame = pd.DataFrame(column_dict).rename(columns=rename_map)
    # Age for the retained row of each series: 'Year' minus 'Year Built'.
    frame['Age'] = frame['Year'] - frame['Year Built']
    return column_dict, frame


# All years
NE_ts_dict, NE_ts_df = _latest_snapshot(NE_ts, columns, listColumns, renameCols)
# NOTE: with its defaults, to_csv also writes the frame's index as an unnamed
# first column of the file.
NE_ts_df.to_csv('nebraska-all-years.csv')

# Train - set
NE_train_dict, NE_train_ts_df = _latest_snapshot(
    NE_train_ts, columns, listColumns, renameCols
)

# Ground - set
NE_ground_dict, NE_ground_ts_df = _latest_snapshot(
    NE_ground_ts, columns, listColumns, renameCols
)


## Comparison of age between the training and ground-truth sets

In [57]:
# NOTE(review): this rebinds the notebook-level name `columns`, which an
# earlier cell uses for the full list of selected columns. The assignment is
# kept because other cells may read `columns` after this one runs.
columns = ['yearBuilt', 'year']

# Ground-truth structures whose last age equals this value are left out.
# NOTE(review): presumably this age results from a placeholder yearBuilt
# value -- confirm against the data before relying on it.
EXCLUDED_GROUND_AGE = 2019


def _ages(record):
    """Return ``year - yearBuilt`` for every entry of one time series."""
    year_built, year = (np.array(record[col]) for col in columns)
    return year - year_built


# NOTE: despite the `median` in their names, both lists hold ONE age per
# structure (taken from the end of its series), not a median. The names are
# kept because the plotting cell reads them.
medianAges = []
for record in NE_train_ts.values():
    # NOTE(review): the training set drops the final record and uses the one
    # before it, while the ground-truth loop below uses the final record.
    # Confirm this asymmetry is intended.
    ages = _ages(record)[:-1]
    if ages.size == 0:
        # Series with a single record have nothing left after the slice.
        continue
    medianAges.append(ages[-1])

medianAgesG = []
for record in NE_ground_ts.values():
    lastAge = _ages(record)[-1]
    if lastAge != EXCLUDED_GROUND_AGE:
        medianAgesG.append(lastAge)


In [59]:
# Box plots of the per-structure ages: training set next to ground-truth set.
fig = go.Figure(
    data=[
        go.Box(y=medianAges, name='Training Age', marker_color='pink'),
        go.Box(y=medianAgesG, name='Ground Truth Age', marker_color='darkblue'),
    ]
)
fig.update_layout(
    title_text="Distribution of Ages in Training and GroundTruth test",
    width=800,
    height=600,
)
fig.show()

## Comparison of deterioration slope between the training and ground-truth sets

In [63]:
# The deterioration scores are expected to be negative, because the slope
# indicates the decrease in the condition ratings.
#
# TODO:
#     1. Calculate slope scores for segments
#     2. Validate the algorithm

# Box plots of the 'deck Det' column: training set next to ground-truth set.
fig = go.Figure()
fig.add_trace(
    go.Box(
        y=NE_train_ts_df['deck Det'],
        name='Deck Deterioration - Training set',
        marker_color='pink',
    )
)
fig.add_trace(
    go.Box(
        y=NE_ground_ts_df['deck Det'],
        name='Deck Deterioration - Ground Truth set',
        marker_color='darkblue',
    )
)
fig.update_layout(
    height=600,
    width=800,
    title_text="Distribution of slope values",
)
fig.show()