# 2nd Stage Panel Data Cleaning
This second data-cleaning notebook takes the CSV produced by the first round of data cleaning and applies cleaning methods that are less clear-cut from an economic standpoint (i.e., that rest on judgment calls). It also provides some visualizations of the data as cleaned so far.

---------

# 0. Import and Setup Universal Directory

In [2]:
import numpy as np
import pandas as pd
import os
import operator 
import datetime
import quantecon as qe
import matplotlib.pyplot as plt

In [3]:
# Remember the directory the notebook was started from.
cwd1 = os.getcwd()

# Go up one directory level
os.chdir('..')
cwd2 = os.getcwd()

# The parent directory must contain a 'Saved data' folder.
csv_save_path = os.path.join(cwd2, 'Saved data')
# Make the save folder the working directory, so that the CSV files read and
# written in this notebook can be addressed by bare file name.
os.chdir(csv_save_path)

In [None]:
# Preliminary formatting: load the first-stage output with its date columns parsed
dateCols = ['last_scraped', 'host_since', 'first_review', 'last_review', 'scrape_batch']
df = pd.read_csv('1stStageClean_SAN.csv', parse_dates=dateCols, low_memory=False)

# Discard every column whose name matches the pattern 'NORl'
stale_cols = df.filter(regex='NORl').columns
df = df.drop(columns=stale_cols)

-----

# 1. 1st Stage Cleaned Data Visualization

In [None]:
# Sum of List_month within each scrape batch
listings_per_batch = df.groupby('scrape_batch')['List_month'].sum()
dates = listings_per_batch.index
counts = listings_per_batch.values

count_max, count_min, count_std = counts.max(), counts.min(), counts.std()

plt.plot(dates, counts)
plt.ylim(0, count_max + 300)
plt.xticks(rotation="vertical")
plt.show()

# Relative gap between the largest and the smallest batch, to two decimals
relative_spread = count_max/count_min - 1
print(round(relative_spread, 2))

--------------------

# 2. Cleaning Part 1

In [None]:
def create_lagsleads(var, lag_range, df, title):
    """
    Add lagged and led copies of ``var`` to a sorted copy of ``df``.

    The frame is ordered by ``id`` and ``month``. For every step ``k`` from 1
    to ``lag_range`` the within-``id`` shift of ``var`` is stored as
    ``<title>lag<k>`` (the value ``k`` rows earlier) and ``<title>lead<k>``
    (the value ``k`` rows later). Shifts count rows, not calendar months.

    Returns the sorted frame with the new columns; they are appended from the
    farthest lead down to lead1, followed by lag1 up to the farthest lag.
    """
    ordered = df.sort_values(by=['id', 'month'])

    offsets = [step for step in range(-lag_range, lag_range + 1) if step != 0]
    for offset in offsets:
        # A negative shift pulls values from later rows (a lead),
        # a positive shift pulls values from earlier rows (a lag).
        direction = "lead" if offset < 0 else "lag"
        column = title + direction + str(abs(offset))
        ordered.loc[:, column] = ordered.groupby('id')[var].shift(offset)

    return ordered

In [None]:
Confidence_cutoff = 1.960

# Rolling maximum of the review count over (up to) 19 consecutive rows of each
# listing, re-indexed back onto the rows of df.
rolling_peak = df.groupby(['id'])['number_of_reviews'].rolling(19, min_periods=1).max()
df.loc[:, 'corrected_NOR'] = rolling_peak.reset_index(level=0, drop=True)
df = create_lagsleads('corrected_NOR', 12, df, "NOR")
df.loc[:, "NOR_diff"] = df['corrected_NOR'] - df['NORlag1']

# This bound is arbitrary: per-listing mean of NOR_diff plus Confidence_cutoff
# times its standard deviation scaled by the square root of the summed List_month.
diff_by_id = df.groupby('id')['NOR_diff']
listed_total = df.groupby('id')['List_month'].sum()
bounds = diff_by_id.mean() + Confidence_cutoff*diff_by_id.std()/np.sqrt(listed_total)
df.loc[:, "NOR_diff_bound"] = bounds[df['id']].values

In [None]:
def timedelta_formatter(var, frame=None):
    """
    Convert a timedelta-like column to a number of whole days, in place.

    Parameters
    ----------
    var : str
        Name of the column to convert. Its values must be accepted by
        ``pd.to_timedelta`` (for example strings such as ``"12 days"``).
    frame : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
        The column is replaced on the frame itself.
    """
    target = df if frame is None else frame
    # Assign the column outright rather than through ``.loc[:, var]``: the
    # latter writes into the existing column, which can leave an object-dtype
    # column (e.g. strings read from a CSV) as object instead of numeric.
    target[var] = pd.to_timedelta(target[var]).dt.days
    
# Run the formatter over each of the three elapsed-time columns
for deltas in ('days_since_rev', 'days_since_first_rev', 'host_length'):
    timedelta_formatter(deltas)

In [None]:
def test_var_change(var, cutoff, relate, df):
    """
    This function produces ids where the variable of interest changes.
    """
    ops = {'>': operator.gt,
       '<': operator.lt,
       '>=': operator.ge,
       '<=': operator.le,
       '==': operator.eq}
    
    # Take ids and variable of interest and drop any na's
    repetition_arr = np.array(df[['id', var]].dropna().drop_duplicates()) # Need drop_duplicates to identify actual price changes
    counts = np.unique(repetition_arr[:,0], return_counts = True)
    return counts[0][ops[relate](counts[1], cutoff)], counts[1][ops[relate](counts[1], cutoff)]

In [None]:
# Listings whose property_type takes two or more distinct values
change_ids, change_counts = test_var_change(var='property_type', cutoff=2, relate='>=', df=df)

In [None]:
# Row label, per listing, of the first row on which List_month is at its maximum
first_host_ind = df.groupby('id')['List_month'].idxmax()
# Look that label up for every row's id and flag the rows that carry it
first_label_for_row = first_host_ind[df['id']]
df.loc[:, "first_appearance"] = (df.index == first_label_for_row).astype(float)

In [None]:
# This gives the location of the last month hosted
# (the last non-missing month, in current row order, among rows with List_month == 1;
# NOTE(review): this is the latest month only if rows are ordered by month within id)
df_list = df[df['List_month'] == 1]
last = df_list.groupby('id')['month'].last()

# Map each row's id to that listing's last listed month. Unlike the label
# lookup last[df['id']], map() gives NaN for an id that never has
# List_month == 1 instead of raising KeyError; NaN never equals a month, so
# such rows simply get 0.
last_month_for_row = df['id'].map(last)
df.loc[:, 'last_app'] = (df['month'].values == last_month_for_row.values).astype(float)

In [None]:
# Visualization of when properties first and last show up in the data
last_by_batch = df.groupby('scrape_batch')['last_app'].sum()
first_by_batch = df.groupby('scrape_batch')['first_appearance'].sum()

# The first and the last batch are left out of both curves (see the title)
plt.plot(last_by_batch[1:-1], label='Sum last appearance')
plt.plot(first_by_batch[1:-1], label='Sum first appearances')
plt.title('Count of first and last appearances of Airbnbs on platform in San Francisco (excluding first and last month, raw data)')
plt.ylim(90,1200)
plt.xticks(rotation="vertical")
# Without a legend the two labelled curves cannot be told apart
plt.legend()
plt.show()

In [None]:
# Calculate cumulative listings for the host: first appearances per host and
# month, laid out as a host-by-month table, accumulated along the months and
# folded back into a (host_id, month) series.
new_by_host_month = df.groupby(['host_id', 'month'])['first_appearance'].sum()
host_month_table = new_by_host_month.unstack()
host_cumlists = host_month_table.cumsum(axis=1).stack().astype(int)
host_cumlists.name = 'cum_sum'
df = df.join(host_cumlists, on=['host_id', 'month'], rsuffix='_cumsum')

# Calculate some other summary statistics about host holdings: the sum of
# List_month per host and month, per host, and per listing. Each total is
# joined back under 'List_month' plus the given suffix.
holding_specs = (
    (['host_id', 'month'], '_byhost_month'),
    (['host_id'], '_host_overall'),
    (['id'], '_id_overall'),
)
for keys, suffix in holding_specs:
    df = df.join(df.groupby(keys)['List_month'].sum(), on=keys, rsuffix=suffix)

In [None]:
# Identify hotels in my data: 1 when the property type is one of the
# hotel-like categories below, 0 otherwise
hotel_types = [
    "Boutique hotel",
    "Bed and breakfast",
    "Aparthotel",
    "Hotel",
    "Resort",
    "Serviced apartment",
]
df.loc[:, 'hotel_dum'] = np.array(df['property_type'].isin(hotel_types))*1

In [None]:
# Hotel rows per scrape batch; the first batch is left off the plot
hotels_by_batch = df.groupby('scrape_batch')['hotel_dum'].sum()
plt.plot(hotels_by_batch[1:])
plt.title("Count of hotels on Airbnb in San Francisco over time (raw data)")
plt.ylim(0, 1500)
plt.show()

In [None]:
# An entrant row is a listing's first appearance whose first review is less
# than 30 days old and which has fewer than 10 reviews
is_first_row = df['first_appearance'] == 1
recent_first_review = df['days_since_first_rev'] < 30
few_reviews = df['number_of_reviews'] < 10
df.loc[:,"entrant"] =  np.array(is_first_row & recent_first_review & few_reviews)*1

In [None]:
# Entrant rows per scrape batch; the first batch is left off the plot
entrants_by_batch = df.groupby('scrape_batch')['entrant'].sum()
plt.plot(entrants_by_batch[1:])
plt.title("Count of 'entrants' onto San Francisco Airbnb over time (raw data)")
plt.show()

In [None]:
# Sum of List_month per neighbourhood and month, attached to every row as
# 'List_month_byneigh'
neigh_keys = ['neighbourhood', 'month']
neigh_month_total = df.groupby(neigh_keys)['List_month'].sum()
df = df.join(neigh_month_total, on=neigh_keys, rsuffix='_byneigh')

In [None]:
# Sum of List_month per neighbourhood and month; groupby returns the result
# sorted by neighbourhood and then month, so no prior sort is needed.
neigh_month_total = df.groupby(['neighbourhood', 'month'])['List_month'].sum()

# Lag by one observed month *within* each neighbourhood. A plain shift(1) on
# the whole series would hand the first month of a neighbourhood the last
# value of the neighbourhood sorted before it; the grouped shift gives NaN.
# NOTE(review): the lag is the previous month present for the neighbourhood,
# which is month - 1 only when the neighbourhood has no gaps.
neigh_month_lag = neigh_month_total.groupby(level='neighbourhood').shift(1)
df = df.join(neigh_month_lag, on=['neighbourhood', 'month'], rsuffix='_lag_byneigh')

# Kept from the original: month 3 never carries a lag
# (presumably the first month of the panel -- TODO confirm).
df.loc[:,'List_month_lag_byneigh'] =  df['List_month_lag_byneigh'].mask(df['month'] == 3, np.nan)

In [None]:
# Show the neighbourhood totals and their lags side by side for inspection
inspect_cols = ['id', 'month', 'neighbourhood', 'List_month_byneigh', 'List_month_lag_byneigh']
df.sort_values(by=['neighbourhood', 'month'])[inspect_cols]

------

# 3. Drop Indicators 

### No reviews


In [None]:
# Start with no listing flagged for dropping
df.loc[:, 'drop_indicator'] = 0

# Largest NOR_diff of each listing, broadcast to every row of that listing
max_NORdiff = df.groupby('id')['NOR_diff'].max()
df.loc[:, 'max_NORdiff'] = max_NORdiff[df['id']].values

# 1 where the listing's largest NOR_diff is exactly zero
no_revs_ind = (df['max_NORdiff'] == 0).values*1
# A row stays flagged once flagged; otherwise it takes the new flag. The
# elementwise maximum states this directly, where replace({0: array}) relied
# on array-valued replacement that Series.replace does not document.
df['drop_indicator'] = np.maximum(df['drop_indicator'], no_revs_ind)

df['drop_indicator'].sum()

### Extremely low or high prices

In [None]:
# 1st and 99th percentile of price over all rows
price_1per = df.price.quantile(.01)
price_99per = df.price.quantile(.99)

# Lowest price of each listing, broadcast to every row of that listing
min_price_for_row = df.groupby('id')['price'].min()[df['id']].values

low_price = (min_price_for_row < price_1per)*1
# NOTE(review): the high-price rule also uses the listing's *minimum* price,
# so a listing is flagged only if it is never at or below the 99th percentile.
# Confirm this is intended rather than the listing's maximum price.
high_price = (min_price_for_row > price_99per)*1

# A row stays flagged once flagged; otherwise it takes the new flag. The
# elementwise maximum states this directly, where replace({0: array}) relied
# on array-valued replacement that Series.replace does not document.
df['drop_indicator'] = np.maximum(df['drop_indicator'], low_price)
df['drop_indicator'] = np.maximum(df['drop_indicator'], high_price)

df['drop_indicator'].sum()

### Never state a day of availability 

In [None]:
# 1 on every row of a listing whose availability_365 is never above zero
max_avail_for_row = df.groupby('id')['availability_365'].max()[df['id']].values
never_avail = (max_avail_for_row == 0)*1
# A row stays flagged once flagged; otherwise it takes the new flag. The
# elementwise maximum states this directly, where replace({0: array}) relied
# on array-valued replacement that Series.replace does not document.
df['drop_indicator'] = np.maximum(df['drop_indicator'], never_avail)

df['drop_indicator'].sum()

### Minimum nights

In [None]:
# 1 on every row of a listing whose smallest minimum_nights exceeds 30
min_nights_for_row = df.groupby(['id'])['minimum_nights'].min()[df['id']].values
long_term_rental = (min_nights_for_row > 30)*1
# A row stays flagged once flagged; otherwise it takes the new flag. The
# elementwise maximum states this directly, where replace({0: array}) relied
# on array-valued replacement that Series.replace does not document.
df['drop_indicator'] = np.maximum(df['drop_indicator'], long_term_rental)
df['drop_indicator'].sum()

### Hotel indicator

In [None]:
# 1 on every row of a listing that has hotel_dum == 1 on at least one row
ever_hotel_for_row = df.groupby(['id'])['hotel_dum'].max()[df['id']].values
hotel_ind = (ever_hotel_for_row == 1)*1
# A row stays flagged once flagged; otherwise it takes the new flag. The
# elementwise maximum states this directly, where replace({0: array}) relied
# on array-valued replacement that Series.replace does not document.
df['drop_indicator'] = np.maximum(df['drop_indicator'], hotel_ind)

df['drop_indicator'].sum()

In [None]:
# Number of rows flagged for dropping, then number of rows kept
flagged_rows = df['drop_indicator'] == 1
kept_rows = df['drop_indicator'] == 0
print(len(df[flagged_rows]))
print(len(df[kept_rows]))

-----

# 4. Finalize and Save CSV

In [None]:

# Write the panel to the current working directory without the row index;
# datetime values are written as 'YYYY-MM-DD HH:MM:SS'
stage2_csv = '2ndStageClean_SAN.csv'
df.to_csv(stage2_csv, date_format='%Y-%m-%d %H:%M:%S', index=False)