In [1]:
import csv
import os
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymongo
import seaborn as sns
from pymongo import MongoClient
from sklearn import preprocessing

# Star import kept last, as in the original, so any names it defines still
# take precedence over the ones imported above.
from nbi1 import *

# Silence every warning for the rest of the session: the 'ignore' action is
# registered with no category/module restriction, so it matches all warnings.
# NOTE(review): this also hides deprecation notices from pandas/sklearn —
# confirm that blanket suppression is intended.
warnings.filterwarnings('ignore')

## Importing data

In [2]:
# Importing NBI data
states = ['31']                   # state codes to query
years = list(range(2008, 2018))   # survey years 2008 through 2017, inclusive

# Field specification handed to getSurveyRecords: 0/1 flags plus two
# "$path" expressions that expose nested fields under readable names.
# NOTE(review): presumably used as a MongoDB projection — confirm in nbi1.
fields = {
    "_id": 0,
    "year": 1,                  # year of the survey
    "stateCode": 1,
    "countyCode": 1,
    "structureNumber": 1,
    "yearBuilt": 1,
    "yearReconstructed": 1,
    "averageDailyTraffic": 1,
    "deck": 1,
    "substructure": 1,          # rating of substructure
    "superstructure": 1,        # rating of superstructure
    "owner": 1,
    "Structure Type": "$structureTypeMain.kindOfMaterialDesign",
    "Type of Wearing Surface": "$wearingSurface/ProtectiveSystem.deckProtection",
}

db = "bridges"

# The connection URI carries a username and password, so it must not be
# stored in the notebook. Supply it through the environment instead, e.g.
#   export NBI_MONGO_URI="mongodb://<user>:<password>@ist177a-mongo.ist.unomaha.edu/admin"
# NOTE(review): the credentials that were previously committed here should be
# rotated, since they remain visible in version history.
connection_string = os.environ.get("NBI_MONGO_URI")
if not connection_string:
    raise RuntimeError(
        "NBI_MONGO_URI is not set. Export the MongoDB connection URI "
        "(including credentials) in the environment before running this cell."
    )

survey_records = getSurveyRecords(states, years, fields, db, connection_string)

# Data filtering; also returns the before/after values reported by the filter.
survey_records, before_filter, after_filter = filterSurveyData(survey_records)

# Adding column: Age = survey year minus year built
survey_records['Age'] = survey_records['year'] - survey_records['yearBuilt']

KeyboardInterrupt: 

## Data Cleaning and Preparation 

In [6]:
## Cleaning pipeline: each step consumes the result of the previous one and
## `survey_timeseries` is reassigned along the way, so the statements must run
## in exactly this order. All helpers come from the `nbi1` star import.

## Create timeseries of every bridge from the NBI dataset.
survey_timeseries = createTimeseriesLifeCycle(survey_records)

## Identify rebuild of the bridges in timeseries and split survey records accordingly.
survey_timeseries = splitSurveyRecords(survey_timeseries)

## Identify inconsistency in aging of the bridges.
split_profiles = createSplitProfiles(survey_timeseries)

## Split bridge records again, where the inconsistencies are identified.
survey_timeseries = splitBackward(survey_timeseries, split_profiles) 

## Build a second set of split profiles from the re-split timeseries.
## NOTE(review): the meaning of the argument 3 is not visible in this notebook —
## confirm against createCRSplitProfiles in nbi1 and give it a named constant.
## `split_profiles_CR` is not used by any other cell visible here.
split_profiles_CR = createCRSplitProfiles(survey_timeseries, 3)

## NOTE(review): the three steps below are disabled, yet the DataFrame output
## later in the notebook shows "_1"-suffixed structure numbers and flat
## per-bridge lists, while the raw list echoed after this cell shows unsuffixed
## numbers and nested lists. That suggests these steps were active when the
## DataFrame cell was last executed — confirm whether they should be re-enabled
## so the notebook reproduces its saved outputs on a fresh kernel.

# ## Counting and renaming the splits of the bridges.
# survey_timeseries_split_structureNumbers = splitStructureNumbers(survey_timeseries)

# ## Combining the all splits of the survey records 
# survey_timeseries = combinedStructureNumberWithRecords(survey_timeseries_split_structureNumbers, survey_timeseries)

# ## Introducing each split of the bridge as a separate bridge.
# survey_timeseries = createIndividualRecords(survey_timeseries)

In [7]:
# Preview the first few per-bridge entries instead of echoing the whole list.
# The original cell evaluated the bare name `s`, which no visible cell defines;
# the captured output has the same record layout as `survey_timeseries`.
survey_timeseries[:3]

[['00000000000A048',
  [[41, 42, 43, 44, 45, 46, 47, 48, 49, 50]],
  [['6', '6', '6', '6', '6', '6', '5', '5', '5', '5']],
  [['6', '6', '6', '6', '6', '6', '5', '5', '5', '5']],
  [['6', '6', '6', '6', '6', '6', '5', '5', '5', '5']],
  [['31', '31', '31', '31', '31', '31', '31', '31', '31', '31']],
  [[1967, 1967, 1967, 1967, 1967, 1967, 1967, 1967, 1967, 1967]],
  [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]],
 ['00000000000A100',
  [[30, 31, 32, 33, 34, 35, 36, 37, 38, 39]],
  [['8', '8', '8', '8', '8', '8', '7', '7', '7', '7']],
  [['7', '7', '7', '7', '7', '7', '7', '7', '7', '7']],
  [['8', '8', '8', '8', '8', '8', '7', '7', '7', '7']],
  [['31', '31', '31', '31', '31', '31', '31', '31', '31', '31']],
  [[1978, 1978, 1978, 1978, 1978, 1978, 1978, 1978, 1978, 1978]],
  [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]],
 ['00000000000A106',
  [[28, 29, 30, 31, 32, 33, 34, 35, 36, 37]],
  [['8', '8', '8', '8', '8', '8', '7', '7', '7', '7']],
  [['8', '8', '8', '8', '8', '8', '7', '7', '7', '7']],
  [['6', '6',

### Creating Dataframe

In [5]:
### Build a dataframe from "survey_timeseries": one row per entry, eight named fields per row.
survey_timeseries_df = pd.DataFrame(
    survey_timeseries,
    columns=[
        'Structure Number',
        'Age',
        'Deck',
        'Superstructure',
        'Substructure',
        'State Code',
        'Year Built',
        'Year Reconstructed',
    ],
)

### Flatten the state-code column: keep only the first element of each entry, as a string.
survey_timeseries_df['State Code'] = survey_timeseries_df['State Code'].map(
    lambda codes: str(codes[0])
)

### Report the dimensions of the dataframe.
print("Shape of the dataframe: ", survey_timeseries_df.shape)

### Display a sample of the dataframe.
survey_timeseries_df.head()

Shape of the dataframe:  (13037, 8)


Unnamed: 0,Structure Number,Age,Deck,Superstructure,Substructure,State Code,Year Built,Year Reconstructed
0,00000000000A048_1,"[41, 42, 43, 44, 45, 46, 47, 48, 49, 50]","[6, 6, 6, 6, 6, 6, 5, 5, 5, 5]","[6, 6, 6, 6, 6, 6, 5, 5, 5, 5]","[6, 6, 6, 6, 6, 6, 5, 5, 5, 5]",31,"[1967, 1967, 1967, 1967, 1967, 1967, 1967, 196...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,00000000000A100_1,"[30, 31, 32, 33, 34, 35, 36, 37, 38, 39]","[8, 8, 8, 8, 8, 8, 7, 7, 7, 7]","[7, 7, 7, 7, 7, 7, 7, 7, 7, 7]","[8, 8, 8, 8, 8, 8, 7, 7, 7, 7]",31,"[1978, 1978, 1978, 1978, 1978, 1978, 1978, 197...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,00000000000A106_1,"[28, 29, 30, 31, 32, 33, 34, 35, 36, 37]","[8, 8, 8, 8, 8, 8, 7, 7, 7, 7]","[8, 8, 8, 8, 8, 8, 7, 7, 7, 7]","[6, 6, 6, 6, 6, 6, 7, 7, 7, 7]",31,"[1980, 1980, 1980, 1980, 1980, 1980, 1980, 198...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,00000000000A107_1,"[27, 28, 29, 30, 31, 32, 33, 34, 35, 36]","[8, 8, 8, 8, 8, 8, 7, 7, 7, 7]","[7, 7, 7, 7, 7, 7, 7, 7, 7, 7]","[7, 7, 7, 7, 6, 6, 6, 6, 6, 6]",31,"[1981, 1981, 1981, 1981, 1981, 1981, 1981, 198...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,00000000000A113_1,"[25, 26, 27, 28, 29, 30, 31, 32, 33, 34]","[7, 7, 7, 7, 7, 7, 7, 7, 7, 7]","[8, 8, 8, 8, 8, 8, 7, 7, 7, 7]","[7, 7, 7, 7, 7, 7, 7, 7, 7, 7]",31,"[1983, 1983, 1983, 1983, 1983, 1983, 1983, 198...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


### Without Outliers

In [12]:
# Write the outlier-free dataframe to disk as CSV (the target name has no
# ".csv" extension).
# NOTE(review): `survey_timeseries_no_outliers_df` is not created in any cell
# visible in this notebook — confirm where it is built, otherwise this cell
# raises NameError on a fresh kernel.
survey_timeseries_no_outliers_df.to_csv("02-20-19-thesis-dataset-without-outliers")

### With Outliers

In [11]:
# Write the full timeseries dataframe (`survey_timeseries_df`) to disk as CSV
# (the target name has no ".csv" extension).
survey_timeseries_df.to_csv("02-20-19-thesis-dataset")