**Import Libraries**

In [1]:
import pandas as pd

In [2]:
pd.set_option('display.max_columns', None)

**Clean Calendar Data**

In [3]:
def calendar(x):

    
    #change titles to lower case 
    lower = []
    for i in x.columns:
        if ' ' in i:
            i = i.replace(' ', '_')
            lower.append(i.lower())
        else:    
            lower.append(i.lower())
    x.columns = lower
    
    
    #Available to upper case 
    x['available'] = x['available'].str.upper()
    
    #price - convert to euros
    def conversion(x):
        x = x.str.strip('$')
        x = x.replace(',', '', regex = True)
        x = x.astype(float)
        x = x*0.99
        return x
    
    x['price'] = conversion(x['price'])
    x['adjusted_price'] = conversion(x['adjusted_price'])    
    
    x = x.rename(columns={'price':'price_€','adjusted_price':'adjusted_price_€'})

    x['minimum_nights'] = x['minimum_nights'].fillna(x['minimum_nights'].mean())
    x['maximum_nights'] = x['maximum_nights'].fillna(x['maximum_nights'].mean())

    #change datatypes
    x['minimum_nights'] = x['minimum_nights'].astype(int)
    x['maximum_nights'] = x['maximum_nights'].astype(int)
    
    #deal with date column and extract day, month and year
    x['date'] = pd.to_datetime(x['date'])
    x['day'] = pd.DatetimeIndex(x['date']).day
    x['month'] = pd.DatetimeIndex(x['date']).month
    x['year'] = pd.DatetimeIndex(x['date']).year
    
    x = x.drop(columns=['date'])
    
    x = x[x['available'] == 'F']
    
    
    return x

**Import files and apply the function**

In [4]:
s21 = pd.read_csv('raw_data/AirBnBData/10Sep21/calendar (1).csv.gz', compression = 'gzip')
d21 = pd.read_csv('raw_data/AirBnBData/8Dec21/calendar (1).csv.gz', compression = 'gzip')
m22 = pd.read_csv('raw_data/AirBnBData/10Mar22/calendar (1).csv.gz', compression = 'gzip')
j22 = pd.read_csv('raw_data/AirBnBData/7Jun22/calendar (1).csv.gz', compression = 'gzip')
s21.head(2)

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,6369,2021-09-11,t,$60.00,$60.00,1.0,1125.0
1,6369,2021-09-12,t,$60.00,$60.00,1.0,1125.0


In [5]:
s = calendar(s21)
d = calendar(d21)
m = calendar(m22)
j = calendar(j22)
print('Sep21 Shape:',s.shape)
print('Dec21 Shape:',d.shape)
print('Mar22 Shape:',m.shape)
print('Jun22 Shape:',j.shape)
s.head(2)

Sep21 Shape: (3965673, 9)
Dec21 Shape: (3862130, 9)
Mar22 Shape: (3859610, 9)
Jun22 Shape: (3976048, 9)


Unnamed: 0,listing_id,available,price_€,adjusted_price_€,minimum_nights,maximum_nights,day,month,year
180,6369,F,59.4,59.4,1,1125,10,3,2022
181,6369,F,59.4,59.4,1,1125,11,3,2022


**Concatenate, Drop Duplicates**

In [6]:
year = pd.concat([s,d,m,j], axis = 0, ignore_index = True)
year.shape

(15663461, 9)

In [7]:
year.duplicated().sum()

5308712

In [8]:
##We drop duplicates, as some of the calendar info may well have been in multiple files, this is booking information of the next year, after all. 
year = year.drop_duplicates(keep='last')
year

Unnamed: 0,listing_id,available,price_€,adjusted_price_€,minimum_nights,maximum_nights,day,month,year
0,6369,F,59.40,59.40,1,1125,10,3,2022
1,6369,F,59.40,59.40,1,1125,11,3,2022
2,6369,F,59.40,59.40,1,1125,12,3,2022
3,6369,F,59.40,59.40,1,1125,13,3,2022
4,6369,F,59.40,59.40,1,1125,14,3,2022
...,...,...,...,...,...,...,...,...,...
15663456,310333,F,84.15,84.15,4,1125,5,11,2022
15663457,310333,F,84.15,84.15,4,1125,6,11,2022
15663458,310333,F,84.15,84.15,4,1125,7,11,2022
15663459,310333,F,84.15,84.15,4,1125,8,11,2022


*Export information to .csv - We don't need this in this project, as we will be looking at data on a monthly scale. It could help, however, just in case we want to look further into Airbnb in the future* 

In [9]:
#year.to_csv('AirBnBCalendarBrut.csv', index = False)

**Create different df with month totals (less lines, more measurable in this case)**

In [10]:
df = year.groupby(['listing_id','month'])['price_€'].sum().reset_index()
df

Unnamed: 0,listing_id,month,price_€
0,6369,1,4594.59
1,6369,2,3809.52
2,6369,3,5683.59
3,6369,4,5281.65
4,6369,5,4317.39
...,...,...,...
240034,643414385697746856,9,289.08
240035,643421822212479135,6,975.15
240036,643421822212479135,7,1333.53
240037,643421822212479135,8,378.18


In [11]:
#determine the neccessary columns to be merged and drop any duplicates to avoid a many-to-many join.
year_no_days = year[['listing_id','available','price_€','minimum_nights','maximum_nights','year']]
year_no_days = year_no_days.drop_duplicates()

In [12]:
#join dataframes on  the listing id on a monthly scale. 
final = pd.merge(year_no_days,df, how='inner', on='listing_id')
final = final.rename(columns={'price_€_x':'price_per_night_€','price_€_y':'monthly_earning_€'})
final

Unnamed: 0,listing_id,available,price_per_night_€,minimum_nights,maximum_nights,year,month,monthly_earning_€
0,6369,F,59.4,1,1125,2022,1,4594.59
1,6369,F,59.4,1,1125,2022,2,3809.52
2,6369,F,59.4,1,1125,2022,3,5683.59
3,6369,F,59.4,1,1125,2022,4,5281.65
4,6369,F,59.4,1,1125,2022,5,4317.39
...,...,...,...,...,...,...,...,...
5602927,272279,F,108.9,5,45,2023,7,1806.75
5602928,272279,F,108.9,5,45,2023,9,891.00
5602929,272279,F,108.9,5,45,2023,10,2638.35
5602930,272279,F,108.9,5,45,2023,11,2722.50


In [13]:
#final.to_csv('AirBnBCalendar_Monthly.csv',index = False)

**Create a mean monthly earning per listing to be able to compare to rent prices**

In [14]:
total = final.groupby(['listing_id'])['monthly_earning_€'].mean().reset_index()
total

Unnamed: 0,listing_id,monthly_earning_€
0,6369,4756.867500
1,21853,813.450000
2,23001,1210.000000
3,24805,4127.557500
4,26825,477.887143
...,...,...
24930,643316305721462080,1631.025000
24931,643368304270742048,5874.000000
24932,643414385697746856,1317.937500
24933,643421822212479135,895.620000


In [15]:
#total.to_csv('airbnb_year_rent_av.csv', index = False)