In [1]:
import pandas as pd
import gzip
import matplotlib.pyplot as plt
import seaborn as sns
from tabulate import tabulate

In [2]:
# Labels of the monthly calendar snapshots, in chronological order; each label
# is also the stem of the matching file under data/calendar/.
names = [
    '04October', '02November', '04December', '04January',
    '05February', '06March', '05April', '03May',
    '05June', '03July', '04August', '05September',
]

# Snapshot label -> DataFrame holding that snapshot's calendar rows.
dataframes = {}

for name in names:
    archive_path = 'data/calendar/' + name + '.csv.gz'
    # Decompress as UTF-8 text and let pandas parse the CSV from the open handle.
    with gzip.open(archive_path, 'rt', encoding='utf-8') as handle:
        dataframes[name] = pd.read_csv(handle)

In [8]:
# Convert every snapshot's 'date' column to datetime (written back into the
# stored frame) and report the first and last date each snapshot covers.
for name in names:
    print(name)
    calendar = dataframes[name]
    calendar['date'] = pd.to_datetime(calendar['date'])
    start_date, end_date = calendar['date'].min(), calendar['date'].max()
    print(f"Date range: {start_date} to {end_date}")
    print("")

04October
Date range: 2022-10-05 00:00:00 to 2023-10-06 00:00:00

02November
Date range: 2022-11-02 00:00:00 to 2023-11-07 00:00:00

04December
Date range: 2022-12-04 00:00:00 to 2023-12-04 00:00:00

04January
Date range: 2023-01-04 00:00:00 to 2024-01-10 00:00:00

05February
Date range: 2023-02-05 00:00:00 to 2024-02-06 00:00:00

06March
Date range: 2023-03-06 00:00:00 to 2024-03-05 00:00:00

05April
Date range: 2023-04-05 00:00:00 to 2024-04-04 00:00:00

03May
Date range: 2023-05-03 00:00:00 to 2024-05-09 00:00:00

05June
Date range: 2023-06-05 00:00:00 to 2024-06-04 00:00:00

03July
Date range: 2023-07-03 00:00:00 to 2024-07-01 00:00:00

04August
Date range: 2023-08-04 00:00:00 to 2024-08-03 00:00:00

05September
Date range: 2023-09-05 00:00:00 to 2024-09-04 00:00:00



In [9]:
# Confirm the 'date' column of the first snapshot now has a datetime dtype.
first_calendar = dataframes[names[0]]
column_type = first_calendar['date'].dtype

print(column_type)

datetime64[ns]


In [12]:
# Last calendar date to keep from each snapshot, aligned by position with
# `names`. Every cutoff except the last is the day before the next snapshot's
# first date (see the date ranges printed above), so consecutive snapshots
# contribute non-overlapping date windows; the last cutoff closes the combined
# window at 2023-09-30.
end_dates = ['2022-11-01', '2022-12-03', '2023-01-03', '2023-02-04', '2023-03-05',
             '2023-04-04', '2023-05-02', '2023-06-04', '2023-07-02', '2023-08-03',
             '2023-09-04', '2023-09-30']

# The two lists are paired positionally; a length mismatch would silently pair
# the wrong cutoff with a snapshot or drop snapshots.
assert len(end_dates) == len(names), "each snapshot needs exactly one cutoff date"

# Keep, from every snapshot, only the rows dated on or before its cutoff.
monthly_slices = [
    dataframes[snapshot].loc[dataframes[snapshot]['date'] <= cutoff]
    for snapshot, cutoff in zip(names, end_dates)
]

# Concatenate once instead of growing a DataFrame inside the loop: repeated
# concat re-copies all accumulated rows on every iteration, and seeding the
# loop with an empty DataFrame triggers a pandas FutureWarning.
# The slices are reversed so rows run from the most recent snapshot back to the
# oldest, which is the row order shown in the table below.
if monthly_slices:
    final_dataset = pd.concat(monthly_slices[::-1], ignore_index=True)
else:
    final_dataset = pd.DataFrame()

In [13]:
# Show the combined calendar; pandas abbreviates the frame to its first and last rows.
final_dataset

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,2595,2023-09-05,t,$240.00,$240.00,30.0,1125.0
1,2595,2023-09-06,t,$240.00,$240.00,30.0,1125.0
2,2595,2023-09-07,t,$240.00,$240.00,30.0,1125.0
3,2595,2023-09-08,t,$240.00,$240.00,30.0,1125.0
4,2595,2023-09-09,t,$240.00,$240.00,30.0,1125.0
...,...,...,...,...,...,...,...
14927308,44578886,2022-10-28,f,$120.00,$120.00,3.0,1125.0
14927309,44578886,2022-10-29,f,$120.00,$120.00,3.0,1125.0
14927310,44578886,2022-10-30,f,$120.00,$120.00,3.0,1125.0
14927311,44578886,2022-10-31,f,$120.00,$120.00,3.0,1125.0


In [14]:
# Count the non-null calendar dates each listing has in the combined dataset,
# then show the distinct counts. The counts vary widely, so not every listing
# is present for every date in the combined window.
# NOTE(review): an earlier remark here attributed the gaps to leap-year
# handling, but the cutoffs above end the window at 2023-09-30, so it contains
# no 29 February -- confirm the real cause (e.g. listings missing from some
# snapshots).
rows_per_listing = final_dataset.groupby('listing_id')['date'].count()
listing_counts = rows_per_listing.reset_index(name='row_count')
listing_counts['row_count'].unique()

array([ 28, 355, 349, 358, 323,  59, 326, 357, 350, 352, 351, 356, 293,
       117, 354, 148, 144,  60, 353,  89, 232, 348, 150,  56, 145, 180,
       331, 359, 206,  31, 330, 264,  57, 229, 324, 266, 332, 237, 347,
       325, 233, 333, 207, 328, 269, 262, 120, 177, 318, 149, 322, 236,
       300, 327,  90, 179, 178, 210, 297, 329, 298, 209, 208, 239, 301,
       360, 238, 211, 175, 274, 213, 240,  32,  58,  29,  63, 143, 121,
       243, 153, 265, 118, 181, 122, 119,  64, 114,  88,  26,  30, 268,
       291, 321, 320,  91, 260,  82,  86, 140, 295, 230, 270, 299, 212,
       176, 288,  61, 294, 201, 204, 267, 171, 205, 296,  62, 241, 319,
       234,  33, 174, 235, 152, 146, 289, 261, 151,  92, 346, 124, 242,
       292, 263, 116, 173, 259, 147, 112, 214, 303, 202, 272, 182, 172,
       199,  84, 203, 287,  85,  54, 271, 183, 273,  87, 290, 244,  80,
       302,  25, 125, 123, 184, 126, 275, 286, 115, 245, 316, 258,  94,
       231, 142, 228, 304, 200, 113, 345,  27, 170, 317,  93, 16