# Create dataset

In [1]:
import pandas as pd
import os
from python.listingspreprocessing import ListingsPreprocessing
from python.reviewspreprocessing import ReviewsPreprocessing
from python.datepreprocessing import DatePreprocessing

In [2]:
def merge_listings(folders):
    """Merge the cleaned listings of several data snapshots into one frame.

    For every name in ``folders`` the file ``../data/<name>/listings.csv`` is
    loaded through ``ListingsPreprocessing(...).do_preprocessing('clean')``.
    The results are stacked, de-duplicated on ``id`` and sorted by ``id``.

    Args:
        folders: Snapshot folder names below ``../data/``. Order matters: when
            an ``id`` occurs in several snapshots, the row coming from the
            folder that appears last in ``folders`` is the one that is kept.

    Returns:
        The merged listings with one row per ``id``, ordered by ``id``. The
        index is not renumbered after de-duplication and sorting; call
        ``reset_index(drop=True)`` if a 0..n-1 index is needed.

    Raises:
        IndexError: If ``folders`` is empty.
    """
    frames = [
        ListingsPreprocessing('../data/' + folder + '/listings.csv').do_preprocessing('clean')
        for folder in folders
    ]
    if not frames:
        raise IndexError('merge_listings() needs at least one data folder')

    # One concat over the whole list instead of growing the frame pairwise
    # inside a loop, which re-copies every accumulated row on each iteration.
    merged = pd.concat(frames, ignore_index=True)

    return (
        merged
        .drop_duplicates(subset='id', keep='last')
        .sort_values(by='id')
    )

# Snapshot folders below ../data/. os.listdir() returns its entries in
# arbitrary order and also lists plain files, so keep directories only and sort
# them: merge_listings() keeps the *last* duplicate per id, and that choice
# must not depend on filesystem ordering.
# NOTE(review): assumes folder names sort chronologically (e.g. ISO dates) so
# that the newest snapshot wins -- confirm against the data layout.
folders = sorted(
    name for name in os.listdir('../data/')
    if os.path.isdir(os.path.join('../data/', name))
)
l_merged = merge_listings(folders).reset_index(drop=True)

In [6]:
def merge_reviews(folders):
    """Merge the cleaned reviews of several snapshots into one text per listing.

    For every name in ``folders`` the file ``../data/<name>/reviews.csv`` is
    loaded through ``ReviewsPreprocessing(...).do_preprocessing('clean')``.
    The results are stacked, fully identical rows are dropped, and the
    ``comments`` of each ``listing_id`` are concatenated into a single string.

    Args:
        folders: Snapshot folder names below ``../data/``. Their order decides
            the order in which a listing's comments are concatenated.

    Returns:
        A frame with the columns ``listing_id`` and ``comments``, one row per
        listing, ordered by ``listing_id``.

    Raises:
        IndexError: If ``folders`` is empty.
    """
    frames = [
        ReviewsPreprocessing('../data/' + folder + '/reviews.csv').do_preprocessing('clean')
        for folder in folders
    ]
    if not frames:
        raise IndexError('merge_reviews() needs at least one data folder')

    # Single concat instead of growing the frame inside a loop; assign()
    # returns a new frame, so nothing is written into a de-duplicated slice.
    reviews = (
        pd.concat(frames, ignore_index=True)
        .drop_duplicates(keep='last')
        # Cast to str so that non-string values (e.g. NaN) can be concatenated.
        .assign(comments=lambda df: df['comments'].astype(str))
    )

    # NOTE(review): summing strings glues a listing's comments together with no
    # separator ("...great stay.Loved it..."). Confirm that the downstream
    # feature extraction is fine with that before changing it.
    per_listing = reviews.groupby(by='listing_id')['comments'].sum().reset_index()

    return per_listing.sort_values(by='listing_id')

# Snapshot folders below ../data/. os.listdir() returns its entries in
# arbitrary order and also lists plain files, so keep directories only and sort
# them: merge_reviews() concatenates each listing's comments in folder order,
# and that order must not depend on the filesystem.
folders = sorted(
    name for name in os.listdir('../data/')
    if os.path.isdir(os.path.join('../data/', name))
)
r_merged = merge_reviews(folders).reset_index(drop=True)

In [7]:
def merge_calendars(folders):
    """Merge the calendar files of several snapshots into one price table.

    For every name in ``folders`` the file ``../data/<name>/calendar.csv`` is
    read and cleaned (see ``clean_calendar``). All snapshots are stacked and
    collapsed to one row per ``(listing_id, date)`` that holds the highest
    price found for that pair in any snapshot.

    Args:
        folders: Snapshot folder names below ``../data/``.

    Returns:
        A frame with the columns ``listing_id``, ``date`` (datetime) and
        ``price`` (float), ordered by ``listing_id`` and ``date``.

    Raises:
        IndexError: If ``folders`` is empty.
    """

    def clean_calendar(fl):
        """Load one calendar CSV and return its listing_id/date/price rows."""
        # Rows with a missing value in *any* column of the file are discarded
        # before the frame is narrowed down to the three columns of interest.
        raw = pd.read_csv(fl).dropna()
        # reset_index() hands back a fresh frame, so the column assignments
        # below never write into a slice of ``raw``.
        df = raw[['listing_id', 'date', 'price']].reset_index(drop=True)
        # Strip '$' and ',' so values such as "$1,200.00" parse as floats. The
        # str cast keeps this working when the column was already numeric.
        df['price'] = (
            df['price'].astype(str).str.replace(r'[$,]', '', regex=True).astype('float')
        )
        df['date'] = pd.to_datetime(df['date'])
        return df

    frames = [clean_calendar('../data/' + folder + '/calendar.csv') for folder in folders]
    if not frames:
        raise IndexError('merge_calendars() needs at least one data folder')

    # One concat over the whole list instead of growing the frame pairwise
    # inside a loop, which re-copies every accumulated row on each iteration.
    stacked = pd.concat(frames, ignore_index=True)

    return stacked.groupby(by=['listing_id', 'date'])['price'].max().reset_index()

# Snapshot folders below ../data/. os.listdir() also lists plain files and
# returns its entries in arbitrary order, so keep directories only (a stray
# file would otherwise be treated as a snapshot folder) and sort them to make
# the run reproducible.
folders = sorted(
    name for name in os.listdir('../data/')
    if os.path.isdir(os.path.join('../data/', name))
)
d_merged = merge_calendars(folders)

In [10]:
# Feature extraction on the three merged tables. Each preprocessing class is
# built with an empty path ('') and receives the already-merged frame through
# extract_feature() (presumably nothing is read from disk here -- confirm in
# the classes).

# Review features; merged=True is passed because r_merged holds the
# concatenated comments per listing produced by merge_reviews().
r_extracted = ReviewsPreprocessing('').extract_feature(r_merged, merged=True)

# Listing features from the de-duplicated listings table.
l_extracted = ListingsPreprocessing('').extract_feature(l_merged)

# Date features from the per-listing, per-date price table.
d_extracted = DatePreprocessing('').extract_feature(d_merged)

In [11]:
# Step 1: attach the review features to the listing features. The inner join
# keeps only listings that have a match on both sides; the right-hand key
# column duplicates 'id' and is removed afterwards.
data = pd.merge(
    left=l_extracted,
    right=r_extracted,
    how='inner',
    left_on='id',
    right_on='listing_id',
)
data = data.drop(columns='listing_id')

# Step 2: drop the 'price' column that came with the listing/review features
# before joining the calendar-derived features, again matching on the listing id.
data = data.drop(columns='price')
data = pd.merge(
    left=data,
    right=d_extracted,
    how='inner',
    left_on='id',
    right_on='listing_id',
)
data = data.drop(columns='listing_id')

In [18]:
# Persist the final dataset without the index column. The output folder is
# created first so the export does not fail when ../save/ does not exist yet.
os.makedirs('../save', exist_ok=True)
data.to_csv('../save/all_data.csv', index=False)