In [54]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from scipy.stats import ttest_ind

## Import datasets

In [19]:
# Read the two CSV files, one per model-disagreement group.
randomForest = pd.read_csv("../../../data/nbi/TrueRfFalseFl.csv")
flowChart = pd.read_csv("../../../data/nbi/TrueFlFalseRf.csv")

# Tag every row with the group it came from; a scalar assignment is
# broadcast to all rows of the frame.
randomForest['label'] = 'True - RF & False - FL'
flowChart['label'] = 'True - FL & False - RF'

# Stack both groups (random-forest rows first) into one frame for plotting.
df = pd.concat([randomForest, flowChart])

Unnamed: 0,year,stateCode,structureNumber,countyCode,yearBuilt,averageDailyTraffic,deck,yearReconstructed,avgDailyTruckTraffic,material,structureType,label
0,2018,31,C000622910,11,1935,20,7,1990,0,3,2,True - FL & False - RF
1,2018,31,C001402720,27,1935,30,5,2002,0,1,2,True - FL & False - RF
2,2018,31,C004501305P,89,2003,20,8,0,0,5,4,True - FL & False - RF
3,2018,31,C006311110,125,1974,75,7,0,0,3,2,True - FL & False - RF
4,2018,31,S030 00591,105,1991,495,9,2014,12,5,4,True - FL & False - RF
5,2018,31,S030 00920,105,1991,495,9,2014,12,2,1,True - FL & False - RF
6,2018,31,S092 46106,155,1960,7640,9,1990,8,3,2,True - FL & False - RF


## Number of Bridges

In [53]:
# Bar chart of how many rows (bridges) each group's dataframe holds.
models = [
    'True RandomForest-False FlowChart',
    'True FlowChart-False Random Forest',
]
bridge_counts = [len(randomForest), len(flowChart)]
fig = go.Figure(data=[go.Bar(x=models, y=bridge_counts)])
fig.show()

## Comparison of models with respect to year built

In [46]:
# Histogram of construction year, coloured by group label, with a rug
# marginal drawn alongside; hovering exposes every column of `df`.
fig = px.histogram(
    df,
    x="yearBuilt",
    color="label",
    marginal="rug",  # alternatives: `box`, `violin`
    hover_data=df.columns,
)
fig.show()

## Comparison of models with respect to county code

In [56]:
attribute = 'countyCode'

# One sample per group: the chosen column from each group's dataframe,
# random forest first so it lines up with the labels and colours below.
hist_data = [randomForest[attribute], flowChart[attribute]]
group_labels = ['True RandomForest', 'True FlowChart']
colors = ['#A56CC1', '#A6ACEC']

# No curve_type is passed, so the figure factory's default curve is drawn
# (not an explicitly requested 'normal' curve). The rug plot is disabled.
fig = ff.create_distplot(
    hist_data,
    group_labels,
    bin_size=.2,
    colors=colors,
    show_rug=False,
)

# Title names the attribute being compared.
fig.update_layout(
    title_text='Comparison of Models in predicting bridges with respect to ' + attribute
)
fig.show()

## Comparison of models with respect to deck

In [60]:
attribute = 'deck'

# One sample per group: the chosen column from each group's dataframe,
# random forest first so it lines up with the labels and colours below.
hist_data = [randomForest[attribute], flowChart[attribute]]
group_labels = ['True RandomForest', 'True FlowChart']
colors = ['#A56CC1', '#A6ACEC']

# No curve_type is passed, so the figure factory's default curve is drawn
# (not an explicitly requested 'normal' curve). The rug plot is disabled.
fig = ff.create_distplot(
    hist_data,
    group_labels,
    bin_size=.2,
    colors=colors,
    show_rug=False,
)

# Title names the attribute being compared.
fig.update_layout(
    title_text='Comparison of Models in predicting bridges with respect to ' + attribute
)
fig.show()

In [47]:
# Box plot of the `deck` column, one box per group label.
fig = px.box(data_frame=df, x="label", y="deck")
fig.show()

## Comparison of models with respect to material

In [65]:
# Number of bridges per material code within each group label.
# px.bar draws one mark per input row and does not aggregate, so passing
# only `x` does not yield per-category counts; the counts are computed
# explicitly here and supplied as `y`. Bars are coloured by label.
material_counts = (
    df.groupby(['material', 'label'])
    .size()
    .reset_index(name='count')
)
fig = px.bar(material_counts, x='material', y='count', height=400, color='label')
fig.show()

In [67]:
# Number of bridges per deck value within each group label.
# px.bar draws one mark per input row and does not aggregate, so passing
# only `x` does not yield per-category counts; the counts are computed
# explicitly here and supplied as `y`. Bars are coloured by label.
deck_counts = (
    df.groupby(['deck', 'label'])
    .size()
    .reset_index(name='count')
)
fig = px.bar(deck_counts, x='deck', y='count', height=400, color='label')
fig.show()

## Comparison of models with respect to structureType

In [66]:
# Number of bridges per structureType code within each group label.
# px.bar draws one mark per input row and does not aggregate, so passing
# only `x` does not yield per-category counts; the counts are computed
# explicitly here and supplied as `y`. Bars are coloured by label.
structure_type_counts = (
    df.groupby(['structureType', 'label'])
    .size()
    .reset_index(name='count')
)
fig = px.bar(structure_type_counts, x='structureType', y='count', height=400, color='label')
fig.show()

In [71]:
# Deck values of the two disagreement groups.
# fl: flowchart True, random forest False, label True
# rf: flowchart False, random forest True, label True
fl = flowChart['deck']
rf = randomForest['deck']

# Mean deck value per group, printed flowchart first.
# NOTE(review): a previous inline note here read "flowchart: 7, randomForest: 38",
# which does not describe the printed means; presumably those were the group
# sizes (rows per file) -- confirm.
fl_mean = fl.mean()
rf_mean = rf.mean()
print(fl_mean, rf_mean)

# Independent two-sample t-test on the deck values, called with its default
# options. NOTE(review): the default assumes equal variances; consider
# equal_var=False (Welch) if the groups differ a lot in size or spread.
# Left unchanged here so the result matches the recorded output.
ttest_ind(fl, rf)

7.714285714285714 6.552631578947368


Ttest_indResult(statistic=2.5978505684515705, pvalue=0.012794882306643147)