# Load and Explore Data

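Load the cleaned daily McBroken dataset, flag outlier days based on drops in the total machine count, and visualize the flagged days against the number of broken machines.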

In [92]:
import pandas as pd
import plotly.graph_objects as go

## Fetch data

In [93]:
# Read the cleaned daily McBroken workbook (path is relative to the notebook's
# working directory) into a DataFrame, then display it as the cell output.
df = pd.read_excel(io='Data/Clean_McBroken_Daily.xlsx')
df

Unnamed: 0,Date,Broken Machines,Total Machines,Percent Broken,Revenue Losses,Outlier,Train
0,2020-10-25,354.0,6818.0,5.192138,221250.0,False,True
1,2020-10-26,373.0,6844.0,5.450029,233125.0,False,True
2,2020-10-27,427.0,7488.0,5.702457,266875.0,False,True
3,2020-10-28,391.0,8294.0,4.714251,244375.0,False,True
4,2020-10-29,374.0,7461.0,5.012733,233750.0,False,True
...,...,...,...,...,...,...,...
1578,2025-02-19,1190.0,11835.0,10.054922,743750.0,False,False
1579,2025-02-20,1207.0,11990.0,10.066722,754375.0,False,False
1580,2025-02-21,1110.0,11907.0,9.322247,693750.0,False,False
1581,2025-02-22,1184.0,12119.0,9.769783,740000.0,False,False


## Outliers

In [94]:
# Mark outliers
# A day is an initial outlier when Total Machines falls more than 20% below
# its 7-day moving average (the 0.8 factor below). Rows without a full 7-day
# window have a NaN 7DMA, so the comparison is False and they are not flagged.
df['7DMA'] = df['Total Machines'].rolling(window=7).mean()
df['Outlier'] = df['Total Machines'] < df['7DMA'] * 0.8

# Keep the 7DMA only on the initially flagged rows, then carry the most recent
# flagged value forward so later rows can be compared against it.
# (.ffill() replaces the deprecated fillna(method='ffill').)
df['Outlier 7DMA'] = df['7DMA'].where(df['Outlier']).ffill()

# Extend each outlier into a run: once a row is flagged, keep flagging the
# following rows until Total Machines rises above the carried-forward 7DMA.
# The recovery row itself is still flagged; the run ends on the row after it.
in_outlier_run = False
extended_flags = []
for is_outlier, total_machines, reference_7dma in zip(
    df['Outlier'], df['Total Machines'], df['Outlier 7DMA']
):
    if is_outlier:
        in_outlier_run = True
    extended_flags.append(in_outlier_run)
    # A NaN reference (no outlier seen yet) compares False, leaving the flag as is
    if total_machines > reference_7dma:
        in_outlier_run = False
df['Outlier'] = extended_flags

# Manual overrides: date ranges flagged as outliers regardless of the rule above
# August 12 to September 9, 2024
df.loc[(df['Date'] >= '2024-08-12') & (df['Date'] <= '2024-09-09'), 'Outlier'] = True
# September 9, 2022
df.loc[df['Date'] == '2022-09-09', 'Outlier'] = True
# October 6, 2021
df.loc[df['Date'] == '2021-10-06', 'Outlier'] = True


Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.



In [95]:
# Line chart of daily broken machines with a red band behind every outlier day
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=df['Date'],
        y=df['Broken Machines'],
        mode='lines',
        name='Broken Machines',
    )
)

# Parse the dates once; reused for the shading and for the x-axis limits
all_days = pd.to_datetime(df['Date'])

# One rectangle per outlier day, spanning 00:00:00 through 23:59:59 so the
# whole day is shaded on the time axis
for day in all_days[df['Outlier']]:
    fig.add_shape(
        type="rect",
        x0=day.strftime('%Y-%m-%d 00:00:00'),
        x1=day.strftime('%Y-%m-%d 23:59:59'),
        y0=0,
        y1=5000,  # fixed vertical extent of the band
        line=dict(width=0),  # no border
        fillcolor="rgba(255, 0, 0, 0.2)",  # red at 20% opacity
        layer="below",  # draw behind the line trace
    )

# Titles, plus one day of padding on either side of the date range
one_day = pd.Timedelta(days=1)
fig.update_layout(
    title="Line Chart with Outlier Shading",
    xaxis_title="Date",
    yaxis_title="Value",
    xaxis_range=[min(all_days) - one_day, max(all_days) + one_day],
)

fig.show()


In [96]:
# Outlier rows dated strictly after 2021-07-01 and strictly before 2021-09-01
in_window = (df['Date'] > '2021-07-01') & (df['Date'] < '2021-09-01')
outliers = df[df['Outlier'] & in_window]
outliers

Unnamed: 0,Date,Broken Machines,Total Machines,Percent Broken,Revenue Losses,Outlier,Train,7DMA,Outlier 7DMA
255,2021-07-07,127.0,2276.0,5.579965,79375.0,True,True,13102.142857,13102.142857
256,2021-07-08,153.0,2586.0,5.916473,95625.0,True,True,11203.0,11203.0
257,2021-07-09,144.0,2329.0,6.182911,90000.0,True,True,9434.142857,9434.142857
258,2021-07-10,170.0,2948.0,5.766621,106250.0,True,True,7753.571429,7753.571429
259,2021-07-11,330.0,3950.0,8.35443,206250.0,True,True,6216.285714,6216.285714
260,2021-07-12,407.0,4052.0,10.044423,254375.0,True,True,4693.285714,6216.285714
261,2021-07-13,361.0,4184.0,8.628107,225625.0,True,True,3189.285714,6216.285714
262,2021-07-14,715.0,9843.0,7.264046,446875.0,True,True,4270.285714,6216.285714
