In [3]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.dates import MO, TU, WE, TH, FR, SA, SU

In [2]:
def clean_calendar(fl):
    """
    Read one calendar file in csv format and return a cleaned dataframe.

    fl is the filename (anything ``pd.read_csv`` accepts works).

    Steps:
      * rows containing a NaN in any column of the file are dropped
      * only listing_id, date and price are kept ('available' and any
        other column are discarded by the selection)
      * price is converted from text such as $1,200 to 1200.0 (float);
        plain numeric prices such as 1200 are accepted as well
      * date is converted to datetime

    Returns a dataframe with columns listing_id, date and price and a
    fresh 0..n-1 index.
    """
    # Force price to be read as text: if a file holds bare numbers,
    # read_csv would infer float and the string clean-up below would fail.
    raw = pd.read_csv(fl, dtype={'price': str})

    # dropna() runs on the full file (before the column selection) so the
    # set of surviving rows is the same as it has always been.
    kept = raw.dropna()[['listing_id', 'date', 'price']].reset_index(drop=True)

    # Strip the currency symbol and thousands separators in one vectorised pass.
    price = kept['price'].str.replace(r'[$,]', '', regex=True).astype('float')

    # assign() builds a new frame, which avoids writing into a column
    # subset (pandas' chained-assignment warning) and keeps column order.
    return kept.assign(price=price, date=pd.to_datetime(kept['date']))

In [24]:
def merge_calendars(folders, data_dir='../data/'):
    """
    merge all calendars together; if listings have the same id and date, keep the highest price

    folders is a sequence of sub-directory names; each one is expected to
    contain a calendar.csv readable by clean_calendar.
    data_dir is the directory holding those folders (defaults to '../data/').

    Returns a dataframe with one row per (listing_id, date) and the columns
    listing_id, date and price. With no folders an empty dataframe with
    those three columns is returned.
    """
    # Load every calendar first and concatenate once; concatenating inside
    # the loop re-copies the accumulated frame on every iteration.
    frames = [
        clean_calendar(os.path.join(data_dir, f, 'calendar.csv'))
        for f in folders
    ]
    if not frames:
        # pd.concat refuses an empty list, so answer the empty case directly.
        return pd.DataFrame(columns=['listing_id', 'date', 'price'])

    merged = pd.concat(frames, ignore_index=True)
    # Duplicate (listing_id, date) pairs across calendars collapse to the max price.
    return merged.groupby(by=['listing_id', 'date'])['price'].max().reset_index()
    

In [28]:
if __name__=='__main__':
    # os.listdir returns every entry, including stray files such as
    # .DS_Store or .ipynb_checkpoints; keep only sub-directories that
    # actually hold a calendar.csv so the merge does not fail on them.
    # Sorting makes the processing order deterministic across runs.
    folders = sorted(
        x for x in os.listdir('../data/')
        if os.path.isfile(os.path.join('../data/', x, 'calendar.csv'))
    )
    df = merge_calendars(folders)