# 1st Stage Panel Data Cleaning

This notebook performs the first round of less controversial data cleaning tasks on the final long form data produced from the aggregate data notebook. These tasks include creating leads and lags, general formatting, filling N/A values, and identifying missing neighborhoods. It then saves the cleaned data into a csv.

# Helpful Links and Documentation
* **Reverse-geocoder:** https://github.com/thampiman/reverse-geocoder, https://pypi.org/project/reverse_geocoder/
* **Filling in missing values:** https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.fillna.html
* **Label encoding:** https://pbpython.com/categorical-encoding.html
* **mpu for distance:** https://stackoverflow.com/questions/41336756/find-the-closest-latitude-and-longitude
* **Filled forward with Numpy:** https://stackoverflow.com/questions/41190852/most-efficient-way-to-forward-fill-nan-values-in-numpy-array
* **Passing an operator to a Python function:** https://stackoverflow.com/questions/18591778/how-to-pass-an-operator-to-a-python-function

In [2]:
import numpy as np
import pandas as pd
import os
import operator # This allows one to pass operators into a Python function
import datetime
import mpu # For distance calculation
from scipy import stats
import quantecon as qe
import reverse_geocoder as rg #for neighborhood identification
import matplotlib.pyplot as plt

_______

# Preliminary Cleaning

In [3]:
km_per_mi = 1.60934 #for conversion

In [47]:
# Universal directory setup
cwd1 = os.getcwd() 

# Go up one directory level
os.chdir('..')
cwd2 = os.getcwd()

# Make sure repository has a 2. Clean data and Saved data folders!
csv_raw_path = cwd2 + '/2. Clean data'
csv_save_path = cwd2 + '/Saved data'
csv_GOOGLE_save = cwd2 + '/Saved data'
# Revert to preliminary directory
os.chdir(cwd1)

In [14]:
# Read concatenated data
os.chdir(csv_raw_path)

listings_df = pd.read_csv('Data_longALL_v1.csv.gz', low_memory=False)

# Switch to other folder for saving data
os.chdir(csv_save_path)

### Destringing prices

In [15]:
def destring_price(var):
    """
    Destrings a passed variable.
    """
    listings_df.loc[:, var] = listings_df[var].replace('[\$,]', '', regex=True).astype(float)

for var in ['price', 'weekly_price', 'monthly_price', 'security_deposit', 'cleaning_fee', 'extra_people']:
    destring_price(var)

### Date formatting


In [5]:
def format_dates(var):
    """
    This function converts date variables into datetime format.
    """
    listings_df.loc[:, var] = pd.to_datetime(listings_df[var])
    
for date_vars in ['last_scraped', 'host_since', 'first_review', 'last_review']:
    format_dates(date_vars)
    
# Set the 'scrape_batch' as a modal date for a file being scraped in a CSV

for m in listings_df['month'].unique():
    listings_df.loc[listings_df['month'] == m, 'scrape_batch'] = listings_df[listings_df['month'] == m]['last_scraped'].mode().values[0]
    listings_df.loc[:, 'scrape_batch'] = pd.to_datetime(listings_df['scrape_batch'])
    
# Create a YR, MO value for the scrape batch, this is largely used for graphing where one needs to aggregate by year-month
listings_df.loc[:,"batch_YRMO"] = pd.to_datetime(listings_df['scrape_batch']).dt.to_period('M')    


def dates_diff(var_name, var1, var2):
    """
    Computes the difference between two date variables and assigns
    the difference to a new variable of given name.
    """
    listings_df.loc[:, var_name] = listings_df[var1] - listings_df[var2]

dates_diff('days_since_rev', 'last_scraped', 'last_review')
dates_diff('days_since_first_rev', 'last_scraped', 'first_review')
dates_diff('host_length', 'last_scraped', 'host_since')

NameError: name 'pd' is not defined

### Leads and lag creation

In [17]:
def create_lagsleads(var, lag_range, df, title):
    
    """
    This function creates lag variables within a given range 
    for a given variable within a given dataframe. The title of these lag 
    variables is specified by title.
    """
    
    df = df.sort_values(by = ['id', 'month'])
    
    for i in range(-lag_range, lag_range + 1):
        
        if i == 0:
            continue
        
        if i < 0:
            df.loc[:, title + "lead" + str(abs(i)) ] = df.groupby('id')[var].shift(i)
                
        if i > 0: 
            df.loc[:, title + "lag" + str(abs(i)) ] = df.groupby('id')[var].shift(i)
            
    return df

listings_df = create_lagsleads('List_month', 12, listings_df, "List") # This would be much faster in numpy...
listings_df = create_lagsleads('availability_60', 12, listings_df, "avail60")
listings_df = create_lagsleads('availability_90', 12, listings_df, "avail90")
listings_df = create_lagsleads('availability_365', 12, listings_df, "avail365")
# listings_df = create_lagsleads('number_of_reviews', 12, listings_df, "NOR") 

In [18]:
# Additional formatting

listings_df = listings_df.reset_index(drop=True)

listings_df.loc[:, 'max_lag'] = listings_df.groupby(['id'])['number_of_reviews'].rolling(4).max().reset_index(level=0, drop=True)
listings_df['max_lag'].fillna(listings_df['number_of_reviews'], inplace=True)
listings_df.loc[:, 'fixed_lag'] = listings_df.groupby('id')['max_lag'].shift(1)
listings_df.loc[:, "NOR_diff"] = listings_df['max_lag'] - listings_df['fixed_lag']


----------

# Exploring missing data

In [2]:
# This function checks what the difference between the number of unique ids and the number of ids paired with some
# property characteristic. The key thing here is to see whether or not a listing trait changes

def checking_missing_data(var):
    """
    This function checks what the difference between the number of unique ids and the 
    number of ids paired with some property characteristic. 
    The key use case is to see whether or not a listing trait changes.
    """
    
    ids = listings_df[['id']].drop_duplicates().dropna()
    ids = np.array(ids)

    paired_ids = listings_df[['id', var]].dropna().drop_duplicates()
    paired_ids = np.array(paired_ids)
    
    if len(ids) == len(paired_ids):
        print('No change in variable')
    else:
        print("Variable changes")
              
#     return set(ids[:,0]) - set(paired_ids[:,0])

In [20]:

def identify_variable_changes(var, cutoff, relate, df):
    """
    This function produces ids where the variable of interest changes.
    """
    ops = {'>': operator.gt,
       '<': operator.lt,
       '>=': operator.ge,
       '<=': operator.le,
       '==': operator.eq}
    
    # Take ids and variable of interest and drop any na's
    repetition_arr = np.array(df[['id', var]].dropna().drop_duplicates()) # Need drop_duplicates to identify actual price changes
    counts = np.unique(repetition_arr[:,0], return_counts = True)
    return counts[0][ops[relate](counts[1], cutoff)], counts[1][ops[relate](counts[1], cutoff)]

In [21]:
change_vars, change_counts = identify_variable_changes('longitude', 7, ">=", listings_df)

In [22]:
change_vars

array([ 472901., 1392204., 7309811., 8053887., 8446662., 9284412.])

In [23]:
listings_df[listings_df['id']==19490974]['zipcode']

Series([], Name: zipcode, dtype: object)

----------

# Filling in N/A's

This section of the code fills missing data in forwards and backwards for values that would be expected to be invariant over time such as fixed property features and host characteristics, such as when the host first started using Airbnb.

In [24]:
# Take the modal zip codes as a property's zip code
modal_zips = listings_df.groupby('id')['zipcode'].agg(lambda x: stats.mode(x)[0][0])
listings_df.loc[:, 'zipcode'] = modal_zips[listings_df['id']].values

### Assumption: Properties do not change features when they are no observed
Forward fill, then back fill. This means older properties have priority. This is an assumption! 

In [25]:
# This loop takes 4 minutes.

qe.util.tic()

#Fill loop
for var in ['host_id', 'host_name', 'host_since', 'host_location', 'property_type', 'room_type', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'first_review', 'instant_bookable']:
    listings_df.loc[:, var] = listings_df.groupby(['id'])[var].fillna(method='ffill', axis=0)
    listings_df.loc[:, var] = listings_df.groupby(['id'])[var].fillna(method='bfill', axis=0)
    
qe.util.toc()

TOC: Elapsed: 0:02:16.61


136.61671566963196

In [26]:
def create_dummies(var):
    """
    Creates a dummy variable for a given variable
    """
    listings_df.loc[:, var] = listings_df[var].astype('category')
    listings_df.loc[:, var + "_dum"] = listings_df[var].cat.codes

In [27]:
categorical_vars = ['host_is_superhost', 'room_type', 'instant_bookable', 'zipcode']

#creates dummy variables for all categories
for cat in categorical_vars:
    create_dummies(cat)

-----------

# Neighborhood categorization
This section attempts to classify each property as belonging to a specific neighborhood.

### Finding appropriate neighborhoods

In [28]:
# How many total listings there are 
print("Total listings:")

print(len(listings_df['neighbourhood'].astype('category')))

print("-------------------------")
print("Active listings with neighborhood reported:")
print(len(listings_df[(listings_df['List_month']==1) & (~listings_df['neighbourhood'].isna())]))

print("-------------------------")
print("Active listings with no neighborhood reported:")
print(len(listings_df[(listings_df['List_month']==1) & (listings_df['neighbourhood'].isna())]))

Total listings:
1790982
-------------------------
Active listings with neighborhood reported:
416799
-------------------------
Active listings with no neighborhood reported:
21792


In [29]:
# This code replaces each listing's neighborhood with its modal neighborhood
modal_neighs = listings_df.groupby('id')['neighbourhood'].agg(lambda x: stats.mode(x)[0][0])
listings_df.loc[:, 'neighbourhood'] = modal_neighs[listings_df['id']].values

# Creates a dummy for whether neighborhood was initially reported
listings_df.loc[:, 'missing_neigh'] = (listings_df['neighbourhood'] == 0)*1

### Average the reported longitude and latitude

In [30]:
id_avg_lat, id_avg_lon = listings_df.groupby('id')['latitude'].mean(), listings_df.groupby('id')['longitude'].mean()

listings_df.loc[:,'avg_lat'] = np.array(id_avg_lat[(listings_df['id'].values)])
listings_df.loc[:, 'avg_lon'] = np.array(id_avg_lon[(listings_df['id'].values)])

In [31]:
# Set-up a dataframe of all of the listings missing a neighborhood
missing_neigh = listings_df[(listings_df['List_month'] == 1) & (listings_df['neighbourhood'] == 0) & (~listings_df['latitude'].isna()) & (~listings_df['longitude'].isna())][['id', 'avg_lat', 'avg_lon']]
missing_neigh = missing_neigh.drop_duplicates()
missing_neigh = missing_neigh.sort_index()
missing_neigh = missing_neigh.reset_index(drop = False)

# Dataframe of all the listings, that are not missing a neighborhood
not_missing_neigh = listings_df[(listings_df['List_month'] == 1) & (listings_df['neighbourhood'] != 0) & (~listings_df['latitude'].isna()) & (~listings_df['longitude'].isna())][['id', 'neighbourhood','avg_lat', 'avg_lon']]
not_missing_neigh = not_missing_neigh.drop_duplicates()

In [32]:
missing_neigh

Unnamed: 0,index,id,avg_lat,avg_lon
0,37526,340586,37.771045,-122.463453
1,46001,464078,37.787714,-122.445258
2,64555,666434,37.790655,-122.406108
3,67109,696724,37.809686,-122.366324
4,67164,696756,37.789434,-122.406579
...,...,...,...,...
1929,1458965,24851531,37.801871,-122.407755
1930,1729955,38073895,37.828790,-122.376610
1931,1790684,40539735,37.801010,-122.415900
1932,1790858,40547706,37.817040,-122.369990


In [33]:
def distance(point1, point2):
    """
    Calculate distance between two points
    """
    return mpu.haversine_distance(point1, point2)

print(distance((30.170165, -97.756954), (30.277500,-97.713975))/km_per_mi)

def closest(data, this_point):
    """
    Applies the distance function to each element in the data, 
    then returns the observation with the lowest distance.
    """
    return min(data, key=lambda x: distance(this_point,x))

7.8474924451354475


In [34]:
coords_with_neigh = np.array(not_missing_neigh[['avg_lat', 'avg_lon']])
coords_no_neigh = np.array(missing_neigh[['avg_lat', 'avg_lon']])

### Neighborhood identification

In [35]:
# This code takes around 8 minutes.

qe.util.tic()

approx_neighs = []

for i in coords_no_neigh:
    approx_neighs.append(not_missing_neigh[(not_missing_neigh['avg_lat'] == closest(tuple(coords_with_neigh), tuple(i))[0]) & (not_missing_neigh['avg_lon'] == closest(tuple(coords_with_neigh), tuple(i))[1])]['neighbourhood'].values[0])

missing_neigh['neighbourhood'] = approx_neighs
missing_neigh.to_csv('approximated_neighs.csv', index=False)

qe.util.toc()

TOC: Elapsed: 0:06:11.31


371.31531143188477

In [36]:
missing_neigh['neighbourhood'] = pd.read_csv('approximated_neighs.csv')['neighbourhood']

In [37]:
listings_df[listings_df['id'] == 8971967]['neighbourhood']

Series([], Name: neighbourhood, dtype: object)

In [38]:
listings_df.loc[:, 'neighbourhood'][missing_neigh['index'].values] = missing_neigh['neighbourhood'].values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_df.loc[:, 'neighbourhood'][missing_neigh['index'].values] = missing_neigh['neighbourhood'].values


In [39]:
listings_df[listings_df['id'] == 8971967]['neighbourhood']

Series([], Name: neighbourhood, dtype: object)

In [40]:
# Replace all of the 0's with NaN's
listings_df.loc[:, 'neighbourhood'] = listings_df['neighbourhood'].replace({0: np.nan})

# Copy the neighborhood over the whole sample
listings_df.loc[:,'neighbourhood'] = listings_df.groupby(['id'])['neighbourhood'].fillna(method='ffill', axis=0)
listings_df.loc[:, 'neighbourhood'] = listings_df.groupby(['id'])['neighbourhood'].fillna(method='bfill', axis=0)

In [41]:
listings_df[listings_df['id'] == 8971967][['neighbourhood', 'avg_lat', 'avg_lon']]

Unnamed: 0,neighbourhood,avg_lat,avg_lon


In [42]:
create_dummies('neighbourhood')

-------

# Calendar Formatting

In [43]:

cal_update = listings_df['calendar_updated'].str.split(" ", n=2, expand=True)
cal_update.columns = ['count', 'units', 'numeric']

cal_update = cal_update[['count', 'units']] # Drop the third column
cal_update.loc[:, "count"] = cal_update['count'].replace({"today": 0, "a":1, "yesterday":0, "never":9999}).astype(float)
cal_update.loc[:, "units"] = cal_update['units'].replace({"days": 1, "None":1, "weeks":7, "months":30, "week":7}).astype(float)
cal_update.loc[:, 'days'] = cal_update['count']*cal_update['units']

cal_update.loc[cal_update['count']==0.0, "days"] = 0.0
cal_update.loc[cal_update['count']==9999, "days"] = 9999

In [44]:
listings_df.loc[:, 'days_since_calup'] = cal_update['days']
del cal_update

In [45]:
len(listings_df[listings_df['NOR_diff'] < 0]) + len(listings_df[listings_df['NOR_diff'] >= 0]) + len(listings_df[listings_df['NOR_diff'].isna()])

1790982

In [46]:
listings_df.head()

Unnamed: 0.1,Unnamed: 0,id,month,List_month,last_scraped,host_id,host_name,host_since,host_location,host_response_time,...,NOR_diff,host_is_superhost_dum,room_type_dum,instant_bookable_dum,zipcode_dum,missing_neigh,avg_lat,avg_lon,neighbourhood_dum,days_since_calup
0,6284,958,0,1,2015-09-02,1169.0,Holly,2008-07-31,"San Francisco, California, United States",within a day,...,,0,0,0,30,0,37.76931,-122.433853,15,0.0
1,37163,958,1,1,2015-11-01,1169.0,Holly,2008-07-31,"San Francisco, California, United States",within a day,...,2.0,0,0,0,30,0,37.76931,-122.433853,15,0.0
2,68042,958,2,1,2015-12-02,1169.0,Holly,2008-07-31,"San Francisco, California, United States",within a day,...,1.0,0,0,0,30,0,37.76931,-122.433853,15,0.0
3,98921,958,3,1,2016-02-02,1169.0,Holly,2008-07-31,"San Francisco, California, United States",within a day,...,3.0,0,0,0,30,0,37.76931,-122.433853,15,0.0
4,129800,958,4,1,2016-04-03,1169.0,Holly,2008-07-31,"San Francisco, California, United States",within a day,...,5.0,0,0,0,30,0,37.76931,-122.433853,15,0.0


____

# Save to csv

In [48]:
# Saves file to the file location, 'csv_GOOGLE_save', specified in universal directory
os.chdir(csv_GOOGLE_save)
listings_df.to_csv('1stStageClean_AUS.csv', index=False, date_format='%Y-%m-%d %H:%M:%S')