In [1]:
import pandas as pd
import nannyml as nml
from IPython.display import display

In [2]:
# Load the raw house-pricing dataset; the path is relative to the notebook's directory.
df = pd.read_csv('../data/01_raw/house-pricing.csv')

In [3]:
# List the column names to see which features the dataset provides.
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [4]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
# Count rows per sale year; the reference/analysis splits below are made by year.
df["YrSold"].value_counts()

2009    338
2007    329
2006    314
2008    304
2010    175
Name: YrSold, dtype: int64

In [6]:

def create_timestamp_column(df, column_name_year, column_name_month):
    """Return a copy of ``df`` with a ``timestamp`` column built from year and month columns.

    Args:
        df: DataFrame containing the year and month columns.
        column_name_year: Name of the column holding the year (e.g. 2008).
        column_name_month: Name of the column holding the month number (1-12).

    Returns:
        A new DataFrame with every original column plus ``timestamp``, a
        datetime set to the first day of the given month. The input frame is
        not modified.
    """
    # Zero-pad the month so the text always matches the '%Y-%m' format exactly.
    year_text = df[column_name_year].astype(str)
    month_text = df[column_name_month].astype(str).str.zfill(2)
    timestamps = pd.to_datetime(year_text + '-' + month_text, format='%Y-%m')

    # assign() builds a new frame instead of writing into the caller's object,
    # so re-running the cell never changes the raw data behind the caller's back.
    return df.assign(timestamp=timestamps)

In [7]:
# Derive the `timestamp` column from the sale year and month columns.
df = create_timestamp_column(df, column_name_year="YrSold", column_name_month="MoSold")

In [8]:
def filter_rows_by_years(df, years, months=None):
    """Select the rows of ``df`` whose ``timestamp`` falls in the given years (and months).

    Args:
        df: DataFrame with a datetime ``timestamp`` column.
        years: Collection of calendar years to keep.
        months: Optional collection of month numbers (1-12) to keep. ``None``
            or an empty collection means "do not filter by month".

    Returns:
        The matching rows, in their original order and with their original index.
    """
    timestamps = df['timestamp']
    keep = timestamps.dt.year.isin(years)

    # Normalise to a list before the emptiness check: the truth value of a
    # multi-element NumPy array or pandas Series is ambiguous and would raise.
    month_list = [] if months is None else list(months)
    if month_list:
        keep &= timestamps.dt.month.isin(month_list)

    return df[keep]

In [9]:
# Year-level comparison: 2007-2008 sales are the reference, 2009-2010 the analysis period.
reference = filter_rows_by_years(df, years=[2007, 2008])
analysis = filter_rows_by_years(df, years=[2009, 2010])

In [10]:
# Within-year comparison for 2006: January-June versus July-December.
first_half_2006 = filter_rows_by_years(df, years=[2006], months=[1, 2, 3, 4, 5, 6])
second_half_2006 = filter_rows_by_years(df, years=[2006], months=[7, 8, 9, 10, 11, 12])

In [11]:
# Quarter-level comparison for 2008: January-March versus April-June.
first_quarter_2008 = filter_rows_by_years(df, years=[2008], months=[1, 2, 3])
second_quarter_2008 = filter_rows_by_years(df, years=[2008], months=[4, 5, 6])

In [12]:
# Columns passed to the drift calculator: the first five features after `Id`.
feature_column_names = [
    "MSSubClass",
    "MSZoning",
    "LotFrontage",
    "LotArea",
    "Street",
]

In [13]:
def calculate_drift(reference, analysis, feature_column_names, **calculator_kwargs):
    """Fit a data-reconstruction drift calculator on ``reference`` and score ``analysis``.

    Args:
        reference: DataFrame the calculator is fitted on; must contain the
            feature columns and a ``timestamp`` column.
        analysis: DataFrame to evaluate against the fitted calculator; same
            columns as ``reference``.
        feature_column_names: Names of the columns included in the
            reconstruction.
        **calculator_kwargs: Optional extra keyword arguments forwarded
            unchanged to ``nml.DataReconstructionDriftCalculator`` (for
            example chunking options such as ``chunk_size`` or
            ``chunk_period``). When omitted, the calculator is built exactly
            as before with the library defaults.

    Returns:
        The result object returned by the calculator's ``calculate`` method.
    """
    # Chunking stays at the library default unless the caller overrides it;
    # for small periods the default split can leave only a few rows per chunk.
    calc = nml.DataReconstructionDriftCalculator(
        column_names=feature_column_names,
        timestamp_column_name='timestamp',
        **calculator_kwargs
    )
    calc.fit(reference)

    return calc.calculate(analysis)

In [14]:
# Run the same drift calculation for each of the three reference/analysis pairs.
result_years = calculate_drift(
    reference=reference,
    analysis=analysis,
    feature_column_names=feature_column_names,
)
result_2006 = calculate_drift(
    reference=first_half_2006,
    analysis=second_half_2006,
    feature_column_names=feature_column_names,
)
result_2008 = calculate_drift(
    reference=first_quarter_2008,
    analysis=second_quarter_2008,
    feature_column_names=feature_column_names,
)

In [15]:
# Per-chunk reconstruction error for the 2009-2010 analysis period.
display(result_years.filter(period='analysis').to_df())

Unnamed: 0_level_0,chunk,chunk,chunk,chunk,chunk,chunk,chunk,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error
Unnamed: 0_level_1,key,chunk_index,start_index,end_index,start_date,end_date,period,sampling_error,value,upper_confidence_boundary,lower_confidence_boundary,upper_threshold,lower_threshold,alert
0,[0:50],0,0,50,2009-01-01,2009-04-01,analysis,0.112286,0.630272,0.96713,0.293413,1.056872,0.466109,False
1,[51:101],1,51,101,2009-04-01,2009-05-01,analysis,0.112286,0.778354,1.115212,0.441496,1.056872,0.466109,False
2,[102:152],2,102,152,2009-05-01,2009-06-01,analysis,0.112286,0.828765,1.165624,0.491907,1.056872,0.466109,False
3,[153:203],3,153,203,2009-06-01,2009-07-01,analysis,0.112286,0.79145,1.128308,0.454591,1.056872,0.466109,False
4,[204:254],4,204,254,2009-07-01,2009-09-01,analysis,0.112286,0.657601,0.99446,0.320743,1.056872,0.466109,False
5,[255:305],5,255,305,2009-09-01,2009-11-01,analysis,0.112286,0.787139,1.123997,0.45028,1.056872,0.466109,False
6,[306:356],6,306,356,2009-11-01,2010-02-01,analysis,0.112286,0.712281,1.04914,0.375423,1.056872,0.466109,False
7,[357:407],7,357,407,2010-02-01,2010-04-01,analysis,0.112286,0.716112,1.05297,0.379253,1.056872,0.466109,False
8,[408:458],8,408,458,2010-04-01,2010-05-01,analysis,0.112286,0.798214,1.135072,0.461355,1.056872,0.466109,False
9,[510:509],9,459,512,2010-07-01,2010-07-01,analysis,0.109123,1.106609,1.433977,0.779242,1.056872,0.466109,True


In [16]:
# Per-chunk reconstruction error for the 2007-2008 reference period.
display(result_years.filter(period='reference').to_df())

Unnamed: 0_level_0,chunk,chunk,chunk,chunk,chunk,chunk,chunk,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error
Unnamed: 0_level_1,key,chunk_index,start_index,end_index,start_date,end_date,period,sampling_error,value,upper_confidence_boundary,lower_confidence_boundary,upper_threshold,lower_threshold,alert
0,[0:62],0,0,62,2007-01-01,2007-04-01,reference,0.101028,0.607666,0.910749,0.304582,1.056872,0.466109,False
1,[63:125],1,63,125,2007-04-01,2007-06-01,reference,0.101028,0.809938,1.113022,0.506855,1.056872,0.466109,False
2,[126:188],2,126,188,2007-06-01,2007-07-01,reference,0.101028,0.922155,1.225239,0.619072,1.056872,0.466109,False
3,[189:251],3,189,251,2007-07-01,2007-08-01,reference,0.101028,0.735069,1.038153,0.431986,1.056872,0.466109,False
4,[252:314],4,252,314,2007-08-01,2007-12-01,reference,0.101028,0.660695,0.963779,0.357612,1.056872,0.466109,False
5,[315:377],5,315,377,2007-12-01,2008-04-01,reference,0.101028,0.830604,1.133687,0.527521,1.056872,0.466109,False
6,[378:440],6,378,440,2008-04-01,2008-06-01,reference,0.101028,0.702914,1.005997,0.39983,1.056872,0.466109,False
7,[441:503],7,441,503,2008-06-01,2008-07-01,reference,0.101028,0.655071,0.958154,0.351987,1.056872,0.466109,False
8,[504:566],8,504,566,2008-07-01,2008-09-01,reference,0.101028,0.863431,1.166515,0.560348,1.056872,0.466109,False
9,[630:629],9,567,632,2008-12-01,2008-12-01,reference,0.098705,0.827365,1.12348,0.53125,1.056872,0.466109,False


In [17]:
# Plot the year-level drift results (2007-2008 vs 2009-2010) and render the figure.
figure = result_years.plot()

figure.show()

In [18]:
# Per-chunk reconstruction error for the second half of 2006 (analysis period).
display(result_2006.filter(period='analysis').to_df())

Unnamed: 0_level_0,chunk,chunk,chunk,chunk,chunk,chunk,chunk,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error
Unnamed: 0_level_1,key,chunk_index,start_index,end_index,start_date,end_date,period,sampling_error,value,upper_confidence_boundary,lower_confidence_boundary,upper_threshold,lower_threshold,alert
0,[0:14],0,0,14,2006-07-01,2006-07-01,analysis,0.168451,0.519768,1.02512,0.014416,1.401847,0.164572,False
1,[15:29],1,15,29,2006-07-01,2006-07-01,analysis,0.168451,0.644202,1.149554,0.13885,1.401847,0.164572,False
2,[30:44],2,30,44,2006-07-01,2006-07-01,analysis,0.168451,0.658476,1.163828,0.153125,1.401847,0.164572,False
3,[45:59],3,45,59,2006-07-01,2006-07-01,analysis,0.168451,0.761452,1.266804,0.2561,1.401847,0.164572,False
4,[60:74],4,60,74,2006-07-01,2006-08-01,analysis,0.168451,0.747791,1.253143,0.24244,1.401847,0.164572,False
5,[75:89],5,75,89,2006-08-01,2006-08-01,analysis,0.168451,0.842335,1.347687,0.336983,1.401847,0.164572,False
6,[90:104],6,90,104,2006-09-01,2006-09-01,analysis,0.168451,1.136312,1.641664,0.63096,1.401847,0.164572,False
7,[105:119],7,105,119,2006-10-01,2006-10-01,analysis,0.168451,0.93746,1.442812,0.432108,1.401847,0.164572,False
8,[120:134],8,120,134,2006-10-01,2006-11-01,analysis,0.168451,0.747105,1.252457,0.241754,1.401847,0.164572,False
9,[150:149],9,135,156,2006-12-01,2006-12-01,analysis,0.139094,1.269163,1.686444,0.851883,1.401847,0.164572,False


In [19]:
# Per-chunk reconstruction error for the first half of 2006 (reference period).
display(result_2006.filter(period='reference').to_df())

Unnamed: 0_level_0,chunk,chunk,chunk,chunk,chunk,chunk,chunk,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error
Unnamed: 0_level_1,key,chunk_index,start_index,end_index,start_date,end_date,period,sampling_error,value,upper_confidence_boundary,lower_confidence_boundary,upper_threshold,lower_threshold,alert
0,[0:14],0,0,14,2006-01-01,2006-02-01,reference,0.168451,0.607799,1.113151,0.102447,1.401847,0.164572,False
1,[15:29],1,15,29,2006-02-01,2006-03-01,reference,0.168451,0.73656,1.241911,0.231208,1.401847,0.164572,False
2,[30:44],2,30,44,2006-03-01,2006-04-01,reference,0.168451,1.260991,1.766343,0.755639,1.401847,0.164572,False
3,[45:59],3,45,59,2006-04-01,2006-04-01,reference,0.168451,0.63211,1.137462,0.126758,1.401847,0.164572,False
4,[60:74],4,60,74,2006-04-01,2006-05-01,reference,0.168451,0.929396,1.434748,0.424044,1.401847,0.164572,False
5,[75:89],5,75,89,2006-05-01,2006-05-01,reference,0.168451,0.821846,1.327198,0.316495,1.401847,0.164572,False
6,[90:104],6,90,104,2006-05-01,2006-05-01,reference,0.168451,0.579582,1.084934,0.07423,1.401847,0.164572,False
7,[105:119],7,105,119,2006-05-01,2006-06-01,reference,0.168451,0.630819,1.13617,0.125467,1.401847,0.164572,False
8,[120:134],8,120,134,2006-06-01,2006-06-01,reference,0.168451,0.976836,1.482188,0.471484,1.401847,0.164572,False
9,[150:149],9,135,156,2006-06-01,2006-06-01,reference,0.139094,0.656158,1.073438,0.238877,1.401847,0.164572,False


In [20]:
# Plot the 2006 half-year drift results and render the figure.
figure = result_2006.plot()

figure.show()

In [21]:
# Per-chunk reconstruction error for the second quarter of 2008 (analysis period).
display(result_2008.filter(period='analysis').to_df())

Unnamed: 0_level_0,chunk,chunk,chunk,chunk,chunk,chunk,chunk,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error
Unnamed: 0_level_1,key,chunk_index,start_index,end_index,start_date,end_date,period,sampling_error,value,upper_confidence_boundary,lower_confidence_boundary,upper_threshold,lower_threshold,alert
0,[0:10],0,0,10,2008-04-01,2008-04-01,analysis,0.189154,0.634351,1.201812,0.066891,1.363152,0.095338,False
1,[11:21],1,11,21,2008-04-01,2008-04-01,analysis,0.189154,0.574249,1.141709,0.006788,1.363152,0.095338,False
2,[22:32],2,22,32,2008-04-01,2008-05-01,analysis,0.189154,0.557086,1.124547,-0.010374,1.363152,0.095338,False
3,[33:43],3,33,43,2008-05-01,2008-05-01,analysis,0.189154,0.642779,1.21024,0.075318,1.363152,0.095338,False
4,[44:54],4,44,54,2008-05-01,2008-05-01,analysis,0.189154,0.481625,1.049086,-0.085836,1.363152,0.095338,False
5,[55:65],5,55,65,2008-05-01,2008-06-01,analysis,0.189154,0.810685,1.378146,0.243224,1.363152,0.095338,False
6,[66:76],6,66,76,2008-06-01,2008-06-01,analysis,0.189154,0.704347,1.271808,0.136886,1.363152,0.095338,False
7,[77:87],7,77,87,2008-06-01,2008-06-01,analysis,0.189154,0.515579,1.083039,-0.051882,1.363152,0.095338,False
8,[88:98],8,88,98,2008-06-01,2008-06-01,analysis,0.189154,0.649356,1.216817,0.081895,1.363152,0.095338,False
9,[110:109],9,99,114,2008-06-01,2008-06-01,analysis,0.156838,0.551033,1.021547,0.08052,1.363152,0.095338,False


In [22]:
# Per-chunk reconstruction error for the first quarter of 2008 (reference period).
# NOTE(review): the table shows only 4-5 rows per chunk here, so these values are noisy.
display(result_2008.filter(period='reference').to_df())

Unnamed: 0_level_0,chunk,chunk,chunk,chunk,chunk,chunk,chunk,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error
Unnamed: 0_level_1,key,chunk_index,start_index,end_index,start_date,end_date,period,sampling_error,value,upper_confidence_boundary,lower_confidence_boundary,upper_threshold,lower_threshold,alert
0,[0:3],0,0,3,2008-01-01,2008-01-01,reference,0.313676,0.874616,1.815644,-0.066411,1.363152,0.095338,False
1,[4:7],1,4,7,2008-01-01,2008-01-01,reference,0.313676,1.050603,1.991631,0.109576,1.363152,0.095338,False
2,[8:11],2,8,11,2008-01-01,2008-01-01,reference,0.313676,0.48967,1.430697,-0.451358,1.363152,0.095338,False
3,[12:15],3,12,15,2008-01-01,2008-02-01,reference,0.313676,0.572675,1.513702,-0.368353,1.363152,0.095338,False
4,[16:19],4,16,19,2008-02-01,2008-02-01,reference,0.313676,0.899352,1.840379,-0.041675,1.363152,0.095338,False
5,[20:23],5,20,23,2008-02-01,2008-03-01,reference,0.313676,0.513362,1.454389,-0.427666,1.363152,0.095338,False
6,[24:27],6,24,27,2008-03-01,2008-03-01,reference,0.313676,0.827792,1.768819,-0.113236,1.363152,0.095338,False
7,[28:31],7,28,31,2008-03-01,2008-03-01,reference,0.313676,0.831431,1.772458,-0.109597,1.363152,0.095338,False
8,[32:35],8,32,35,2008-03-01,2008-03-01,reference,0.313676,0.860443,1.80147,-0.080585,1.363152,0.095338,False
9,[40:39],9,36,40,2008-03-01,2008-03-01,reference,0.28056,0.372504,1.214185,-0.469176,1.363152,0.095338,False


In [23]:
# Plot the 2008 quarter-level drift results and render the figure.
figure = result_2008.plot()

figure.show()