# 1st Stage Panel Data Cleaning

This notebook performs the first round of data cleaning tasks on the final long form data produced from the Aggregate Listings Data notebook. These tasks include:

* 1. **Preliminary cleaning:** Includes de-stringing prices, formatting dates and computing variable lags and leads
* 2. **Exploring missing values:** Creates functions for understanding NaN values and fills in time-invariant characteristics of a listing
* 3. **Neighborhood categorization:** Assign neighborhoods to listings based on their GPS coordinates 
* 4. **'Calendar Update' formatting:** Creating numerical measure for how often an Airbnb listing is updated by the property host
* 5. **Additional variable creation:** Additional variables are created for further anaylsis
* 6. **Creating drop criteria for observations:** Primary focus of drop criteria is to drop dormant listings, price outliers and full-time hotels

Once these tasks are complete, the Notebook saves the cleaned data into a compressed csv.gz file.

In [1]:
import numpy as np
import pandas as pd
import os
import operator # This allows one to pass operators into a Python function
import mpu # For distance calculation
from scipy import stats # Used to find modal value of geographic values
import time

In [2]:
# Select city to work with

city_folder = '/united-states_portland'
city_abbrev = 'POR'

In [3]:
# Universal directory setup
cwd1 = os.getcwd() 

# Go up one directory level
os.chdir('..')
cwd2 = os.getcwd()

csv_raw_path = cwd2 + '/1. Download and compile data/'
csv_save_path = cwd2 + '/Saved data/'

# Revert to preliminary directory
os.chdir(cwd1)

In [4]:
# Read concatenated data
os.chdir(csv_raw_path)

listings_df = pd.read_csv(city_abbrev + '_Data_longALL.csv.gz', low_memory=False)

# Switch to other folder for saving data
os.chdir(csv_save_path)

# Show a snapshot of the dataframe
print(listings_df.iloc[:6,:6])

        id  month  List_month last_scraped     host_id       host_name
0  4986792      0           1   2015-09-02   9165660.0         Cynthia
1  3883718      0           1   2015-09-02  13686663.0         Mallory
2  7092722      0           1   2015-09-02  13322905.0           Sarah
3  4475369      0           1   2015-09-02   4006487.0        Kathleen
4  5904142      0           1   2015-09-02  19052765.0  Melanie & Dirk
5  2455288      0           1   2015-09-02  11021785.0             Amy


------
# 1. Preliminary Cleaning

## Destringing prices

In [5]:
def destring_price(var):
    """
    Destrings a passed variable.
    """
    listings_df.loc[:, var] = listings_df[var].replace('[\$,]', '', regex=True).astype(float)

# This loop destrings price variables
for var in ['price', 'weekly_price', 
            'monthly_price', 'security_deposit',
            'cleaning_fee', 'extra_people']:
    
    destring_price(var)

## Date formatting

In [6]:
def format_dates(var):
    """
    This function converts date variables into datetime format.
    """
    listings_df.loc[:, var] = pd.to_datetime(listings_df[var])
    
# ===============================================================    

def dates_diff(var_name, var1, var2):
    """
    Computes the difference between two date variables and assigns
    the difference to a new variable of given name.
    """
    listings_df.loc[:, var_name] = listings_df[var1] - listings_df[var2]

In [7]:
# Create a loop of date formatting
for date_vars in ['last_scraped', 'host_since', 'first_review', 'last_review']:
    format_dates(date_vars)
    
# Set the 'scrape_batch' as a modal date for a file being scraped in a CSV

for m in listings_df['month'].unique():
    listings_df.loc[listings_df['month'] == m, 'scrape_batch'] = listings_df[listings_df['month'] == m]['last_scraped'].mode().values[0]
    listings_df.loc[:, 'scrape_batch'] = pd.to_datetime(listings_df['scrape_batch'])
    
# Create a Year-Month value for the scrape batch, this is largely used for graphing where one needs to aggregate by year-month
listings_df.loc[:,"batch_YRMO"] = pd.to_datetime(listings_df['scrape_batch']).dt.to_period('M')    

# Calculate different date differences
dates_diff('days_since_rev', 'last_scraped', 'last_review')
dates_diff('days_since_first_rev', 'last_scraped', 'first_review')
dates_diff('host_length', 'last_scraped', 'host_since')

In [8]:
def timedelta_formatter(var):
    """
    This function formats the time delta for a passed variable.
    """
    listings_df.loc[:, var] = pd.to_timedelta(listings_df[var]).dt.days
    
for deltas in ['days_since_rev', 'days_since_first_rev', 'host_length']:
    timedelta_formatter(deltas)

## Leads and lag creation

In [9]:
def create_lagsleads(var, lag_range, df, title):
    
    """
    This function creates lag variables within a given range 
    for a given variable within a given dataframe. The title of these lag 
    variables is specified by title.
    """
    
    df = df.sort_values(by = ['id', 'month']).copy()
    
    for i in range(-lag_range, lag_range + 1):
        
        if i == 0:
            continue
        
        if i < 0:
            df.loc[:, title + "lead" + str(abs(i)) ] = df.groupby('id')[var].shift(i).copy()
                
        if i > 0: 
            df.loc[:, title + "lag" + str(abs(i)) ] = df.groupby('id')[var].shift(i).copy()
            
    return df

listings_df = create_lagsleads('List_month', 12, listings_df, "List")
listings_df = create_lagsleads('availability_60', 12, listings_df, "avail60")
listings_df = create_lagsleads('availability_90', 12, listings_df, "avail90")
listings_df = create_lagsleads('availability_365', 12, listings_df, "avail365")

In [10]:
listings_df = listings_df.reset_index(drop=True)

# Count the number of unique months in the concatenated dataset
Nunique_months = len(listings_df['month'].unique())

# Create a clean measure for the number of reviews that prevents reviews from falling over time. This is done 
# so that changes in reviews can be used to get a sense of bookings.
listings_df.loc[:, 'corrected_NOR'] = listings_df.groupby(['id'])['number_of_reviews'].rolling(Nunique_months, min_periods=1).max().reset_index(level=0, drop=True)
listings_df = create_lagsleads('corrected_NOR', 12, listings_df, "NOR") 
listings_df.loc[:, "NOR_diff"] = listings_df['corrected_NOR'] - listings_df['NORlag1']

### Comment on the 'NOR_diff' variable
* The measure for the differences in the number of reviews is imperfect but seems generally reasonable, researchers need to be careful with how it is used. 

* As long as properties appear in consecutive scrapes it seems to work well. If a property does not appear for many scrapes and reviews will sometimes jump significantly when it reappears. 

* When describing the 'NOR_diff' variable, we can observe the maximum number of reviews in a given month is 149! Which is not a reasonable number of reviews for a property to recieve in in 30/31 days. 

* It may be desirable to make an assumption that reviews are evenly across the months in which the property fails to appear in the scrape.

In [11]:
# Describe the 'NOR_diff variable
print(listings_df['NOR_diff'].describe())

# Report 99th percentile of difference in number of reviews
print("----")
print('99th percentile:')
print(np.quantile(listings_df[~(listings_df['NOR_diff'].isna())]['NOR_diff'], 0.99))

count    253504.000000
mean          1.537246
std           3.161950
min           0.000000
25%           0.000000
50%           0.000000
75%           2.000000
max         149.000000
Name: NOR_diff, dtype: float64
----
99th percentile:
14.0


In [12]:
# Example of what the corrected_NOR variable does. Can see that I don't allow the stock of reviews to decline.

print(listings_df[listings_df['id'] == 10886050][['month', 
                                                  'number_of_reviews',
                                                  'corrected_NOR', 'NOR_diff']].iloc[-19:-8])

        month  number_of_reviews  corrected_NOR  NOR_diff
159451     18               51.0           51.0       2.0
159452     19                NaN           51.0       0.0
159453     20                NaN           51.0       0.0
159454     21                NaN           51.0       0.0
159455     22                NaN           51.0       0.0
159456     23               55.0           55.0       4.0
159457     24               51.0           55.0       0.0
159458     25               51.0           55.0       0.0
159459     26               51.0           55.0       0.0
159460     27               59.0           59.0       4.0
159461     28               61.0           61.0       2.0


----------

# 2. Exploring missing data and filling in NaN values

In [13]:
def checking_missing_data(var):
    
    """
    This function checks what the difference between the number of unique ids and the 
    number of ids is paired with some property characteristic. 
    The key use case is to see whether or not a listing trait changes.
    """
    
    ids = listings_df[['id']].drop_duplicates().dropna()
    ids = np.array(ids)

    paired_ids = listings_df[['id', var]].dropna().drop_duplicates()
    paired_ids = np.array(paired_ids)
    
    if len(ids) == len(paired_ids):
        print('No change in variable')
    else:
        print("Variable changes")

In [14]:
def identify_variable_changes(var, cutoff, relate, df):
    
    """
    This function lists ids where the variable of interest changes ("var").
    This can be used for data cleaning purposes.
    """
    
    ops = {'>': operator.gt,
       '<': operator.lt,
       '>=': operator.ge,
       '<=': operator.le,
       '==': operator.eq}
    
    # Take ids and variable of interest and drop any na's
    repetition_arr = np.array(df[['id', var]].dropna().drop_duplicates()) # Need drop_duplicates to identify actual price changes
    counts = np.unique(repetition_arr[:,0], return_counts = True)
    
    return counts[0][ops[relate](counts[1], cutoff)], counts[1][ops[relate](counts[1], cutoff)]

In [15]:
# Example usage of the identify variable changes function

# Store the ids for which the property type changes at least twice
change_ids, change_counts = identify_variable_changes('property_type', 2, '>=', listings_df)

# Unique ids that report changed property types
print("Number of unique ids that changed their official 'property type:'")
print(len(change_ids))

Number of unique ids that changed their official 'property type:'
1637


----------

## Filling in N/A's
This section of the code fills missing data in forwards and backwards for values that would be expected to be invariant over time, such as fixed property features and host characteristics, during times for example when the host first started using Airbnb.

In [16]:
# Take the modal zip codes as a property's zip code
modal_zips = listings_df.groupby('id')['zipcode'].agg(lambda x: stats.mode(x)[0][0])
listings_df.loc[:, 'zipcode'] = modal_zips[listings_df['id']].values

### Assumption: Properties do not change features when they are not observed
Forward fill, then back fill. This means older features have priority.

In [17]:
# Run a loop that fills property characteristics forwards and backwards

my_timer = time.time() # Time it

for var in ['host_id', 'host_name', 'host_since', 'host_location', 'property_type', 'room_type', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'first_review', 'instant_bookable']:
    listings_df.loc[:, var] = listings_df.groupby(['id'])[var].fillna(method='ffill', axis=0)
    listings_df.loc[:, var] = listings_df.groupby(['id'])[var].fillna(method='bfill', axis=0)
    
time_to_run_filler =  time.time() - my_timer

print("Minutes to run filling loop:")
print(time_to_run_filler/60)

Minutes to run filling loop:
1.007814633846283


In [18]:
def create_dummies(var):
    """
    Creates a dummy variable for a given variable
    """
    listings_df.loc[:, var] = listings_df[var].astype('category')
    listings_df.loc[:, var + "_dum"] = listings_df[var].cat.codes

In [19]:
# Creates dummy variables for specified categories

categorical_vars = ['host_is_superhost', 'room_type', 'instant_bookable', 'zipcode']

for cat in categorical_vars:
    create_dummies(cat)

-----------

# 3. Neighborhood categorization
This section assigns each property to a specific neighborhood. There are two primary cases to be worried about:

* 1) **Changing neighborhood:** A property with a changing neighborhood is simply assigned its modal neighborhood. 
* 2) **No neighborhood assigned:** A property without a reported neighborhood is assigned the neighborhood of the closest property that has a neighborhood reported.

## Finding appropriate neighborhoods

In [20]:
print("Total listings:")

print(len(listings_df['neighbourhood'].astype('category')))

print("-------------------------")
print("Active listings with neighborhood reported:")
print(len(listings_df[(listings_df['List_month']==1) & (~listings_df['neighbourhood'].isna())]))

print("-------------------------")
print("Active listings with no neighborhood reported:")
print(len(listings_df[(listings_df['List_month']==1) & (listings_df['neighbourhood'].isna())]))

Total listings:
418359
-------------------------
Active listings with neighborhood reported:
140237
-------------------------
Active listings with no neighborhood reported:
10201


In [21]:
# This code replaces each listing's neighborhood with its modal neighborhood
modal_neighs = listings_df.groupby('id')['neighbourhood'].agg(lambda x: stats.mode(x)[0][0])
listings_df.loc[:, 'neighbourhood'] = modal_neighs[listings_df['id']].values

# Creates a dummy for whether neighborhood was initially reported
listings_df.loc[:, 'missing_neigh'] = (listings_df['neighbourhood'] == 0)*1

## Average the reported longitude and latitude
The longitude and latitude of a given property sometimes changes due to anonymization of Airbnb exact location. We just average these longitudes and latitudes to determine a representative location. 

In [22]:
id_avg_lat, id_avg_lon = listings_df.groupby('id')['latitude'].mean(), listings_df.groupby('id')['longitude'].mean()

listings_df.loc[:,'avg_lat'] = np.array(id_avg_lat[(listings_df['id'].values)])
listings_df.loc[:, 'avg_lon'] = np.array(id_avg_lon[(listings_df['id'].values)])

In [23]:
# Set-up a dataframe of all of the listings missing a neighborhood
missing_neigh = listings_df[(listings_df['List_month'] == 1) & (listings_df['neighbourhood'] == 0) & (~listings_df['latitude'].isna()) & (~listings_df['longitude'].isna())][['id', 'avg_lat', 'avg_lon']]
missing_neigh = missing_neigh.drop_duplicates()
missing_neigh = missing_neigh.sort_index()
missing_neigh = missing_neigh.reset_index(drop = False)

# Dataframe of all the listings, that are not missing a neighborhood
not_missing_neigh = listings_df[(listings_df['List_month'] == 1) & (listings_df['neighbourhood'] != 0) & (~listings_df['latitude'].isna()) & (~listings_df['longitude'].isna())][['id', 'neighbourhood','avg_lat', 'avg_lon']]
not_missing_neigh = not_missing_neigh.drop_duplicates()

# Print first five rows of the missing neighborhood dataframe
missing_neigh.head(5)

Unnamed: 0,index,id,avg_lat,avg_lon
0,1324,67036,45.531952,-122.644825
1,2109,107177,45.496996,-122.745876
2,4516,246398,45.479189,-122.607044
3,5322,281285,45.532832,-122.705484
4,11027,668224,45.532556,-122.777067


In [24]:
km_per_mi = 1.60934 #for distance conversion

In [25]:
def distance(point1, point2):
    """
    Calculate distance between two points
    """
    return mpu.haversine_distance(point1, point2)

def closest(data, this_point):
    """
    Applies the distance function to each element in the data, 
    then returns the observation with the lowest distance.
    """
    return min(data, key=lambda x: distance(this_point,x))

In [26]:
# Test distance function

print(distance((30.170165, -97.756954), (30.277500,-97.713975))/km_per_mi)

7.8474924451354475


In [27]:
coords_with_neigh = np.array(not_missing_neigh[['avg_lat', 'avg_lon']])
coords_no_neigh = np.array(missing_neigh[['avg_lat', 'avg_lon']])

## Neighborhood identification

In [28]:
# Identify neighborhoods for properties with no neighborhood assigned
# NOTE: IF THIS HAS BEEN RUN ONCE, CAN COMMENT OUT AND JUST LOAD IN 'approximated_neighs.csv'

neigh_timer = time.time()

approx_neighs = []

for i in coords_no_neigh:
     approx_neighs.append(not_missing_neigh[(not_missing_neigh['avg_lat'] == closest(tuple(coords_with_neigh), tuple(i))[0]) & (not_missing_neigh['avg_lon'] == closest(tuple(coords_with_neigh), tuple(i))[1])]['neighbourhood'].values[0])

missing_neigh['neighbourhood'] = approx_neighs

# Save the approximate neighborhoods so this code doesn't need to be run again.
missing_neigh.to_csv(city_abbrev + '_approximated_neighs.csv', index=False)

time_to_match = time.time() - neigh_timer

print("Mins to match neighborhoods:")
print(time_to_match/60)

Mins to match neighborhoods:
1.1291101535161336


In [29]:
# If approximated_neighs.csv exists, can load in neighborhoods here.
missing_neigh['neighbourhood'] = pd.read_csv(city_abbrev + '_approximated_neighs.csv')['neighbourhood']

In [30]:
with pd.option_context('mode.chained_assignment',None): # This just suppresses an innocous SettingWithCopy warning

    listings_df.loc[:, 'neighbourhood'][missing_neigh['index'].values] = missing_neigh['neighbourhood'].values.copy()

In [31]:
# Replace all of the 0's with NaN's
listings_df.loc[:, 'neighbourhood'] = listings_df['neighbourhood'].replace({0: np.nan})

# Copy the neighborhood over the whole sample
listings_df.loc[:,'neighbourhood'] = listings_df.groupby(['id'])['neighbourhood'].fillna(method='ffill', axis=0)
listings_df.loc[:, 'neighbourhood'] = listings_df.groupby(['id'])['neighbourhood'].fillna(method='bfill', axis=0)

# Create neighborhood dummies
create_dummies('neighbourhood')

-------

# 4. 'Calendar Update' Formatting

Here we calculate a numeric value for the days since an Airbnb listing's calendar has been updated. This offers a measure for how active the Airbnb property is.

In [32]:
cal_update = listings_df['calendar_updated'].str.split(" ", n=2, expand=True)
cal_update.columns = ['count', 'units', 'numeric']

cal_update = cal_update[['count', 'units']] # Drop the third column
cal_update.loc[:, "count"] = cal_update['count'].replace({"today": 0, "a":1, "yesterday":0, "never":9999}).astype(float)
cal_update.loc[:, "units"] = cal_update['units'].replace({"days": 1, "None":1, "weeks":7, "months":30, "week":7}).astype(float)
cal_update.loc[:, 'days'] = cal_update['count']*cal_update['units']

cal_update.loc[cal_update['count']==0.0, "days"] = 0.0
cal_update.loc[cal_update['count']==9999, "days"] = 9999

# Add the calendar update values to the dataframe
listings_df.loc[:, 'days_since_calup'] = cal_update['days']
del cal_update

In [33]:
# Drop the missing neighborhoods flag
listings_df = listings_df.drop(columns=['missing_neigh'])

# 5. Additional variable creation

## Flag month where a listing is first hosted

In [34]:
first_host_ind = listings_df.groupby('id').List_month.idxmax()
listings_df.loc[:, "first_appearance"] = (listings_df.index == first_host_ind[listings_df['id']]).astype(float)

## Flag month where a listing is last hosted

In [35]:
listings_df_list = listings_df[listings_df['List_month'] == 1]
last = listings_df_list.groupby('id')['month'].last()

listings_df.loc[:, 'last_app'] = (listings_df['month'].values == last[listings_df['id']].values).astype(float)

## Calculate cumulative listings for a given host

In [36]:
host_cumlists = listings_df.groupby(['host_id', 'month'])['first_appearance'].sum().unstack().cumsum(axis=1).stack().astype(int)
host_cumlists.name = 'cum_sum'
listings_df = listings_df.join(host_cumlists, on=['host_id', 'month'], rsuffix='_cumsum')

## Calculate other summary statistics about host holdings

In [37]:
# Host listings per month
listings_df = listings_df.join(listings_df.groupby(['host_id', 'month'])['List_month'].sum(), on=['host_id', 'month'], rsuffix='_byhost_month')

# Host overall listings over the dataset
listings_df = listings_df.join(listings_df.groupby(['host_id'])['List_month'].sum(), on=['host_id'], rsuffix='_host_overall')

# Total times a given property is listed
listings_df = listings_df.join(listings_df.groupby(['id'])['List_month'].sum(), on=['id'], rsuffix='_id_overall')

## Identify hotels in the data

In [38]:
listings_df.loc[:, 'hotel_dum'] = np.array((listings_df['property_type'] == "Boutique hotel") |
                                  (listings_df['property_type'] == "Bed and breakfast") | 
                                  (listings_df['property_type'] == "Boutique hotel") | 
                                  (listings_df['property_type'] == "Aparthotel")| 
                                  (listings_df['property_type'] == "Hotel")| 
                                  (listings_df['property_type'] == "Resort")| 
                                  (listings_df['property_type'] == "Serviced apartment") )*1

## Measure for an entrant to the Airbnb platform

In [39]:
# Create a preliminary measure for an entrant Airbnb listing.
listings_df.loc[:,"entrant"] =  np.array((listings_df['first_appearance'] == 1) &
                                (listings_df['days_since_first_rev'] < 30 ) & 
                                (listings_df['number_of_reviews'] < 10 ))*1

## Calculate listings per neighborhood

In [40]:
# Calculate the number of listings in a neighborhood on the Airbnb platform for a given month
listings_df = listings_df.join(listings_df.groupby(['neighbourhood', 'month'])['List_month'].sum(), 
             on=['neighbourhood', 'month'], rsuffix='_byneigh')

# Calculate the lagged number of listings in a neighborhood on the Airbnb platform for a given month
listings_df = listings_df.join(listings_df.sort_values(by=['neighbourhood', 'month']).groupby(['neighbourhood', 'month'])['List_month'].sum().shift(1), 
             on=['neighbourhood', 'month'], rsuffix='_lag_byneigh')

listings_df.loc[:,'List_month_lag_byneigh'] =  listings_df['List_month_lag_byneigh'].mask(listings_df['month'] == 3, np.nan)

listings_df.sort_values(by=['neighbourhood', 'month'])[['id', 'month', 'neighbourhood', 'List_month_byneigh', 'List_month_lag_byneigh']]

Unnamed: 0,id,month,neighbourhood,List_month_byneigh,List_month_lag_byneigh
11359,681972,0,Alameda,15,
19388,1125125,0,Alameda,15,
19425,1128026,0,Alameda,15,
22496,1268272,0,Alameda,15,
24827,1420478,0,Alameda,15,
...,...,...,...,...,...
406518,29884092,36,Woodstock,93,90.0
407628,30010653,36,Woodstock,93,90.0
408294,30085303,36,Woodstock,93,90.0
413104,31043503,36,Woodstock,93,90.0


# 6. Creating drop criteria for observations

In [41]:
# Create a 0-valued drop indicator. This will be replaced with 1 
# whenever certain conditions are satisfied

listings_df.loc[:, 'drop_indicator'] = 0

## Drop criteria 1: Property *never* earns a review

In [42]:
# Find the maximum difference in reviews by property
max_NORdiff = listings_df.groupby('id')['NOR_diff'].max()
listings_df.loc[:, 'max_NORdiff'] = max_NORdiff[listings_df['id']].values

no_revs_ind = (listings_df['max_NORdiff'] == 0).values*1
listings_df.loc[:, 'drop_indicator'] = listings_df['drop_indicator'].replace({ 0: no_revs_ind})

##  Drop criteria 2: Property price is below 0.1 percentile or above 99.9 percentile

In [43]:
price_01per = listings_df.price.quantile(.001)
price_999per = listings_df.price.quantile(.999)

low_price = (listings_df.groupby('id')['price'].min()[listings_df['id']].values < price_01per)*1
high_price = (listings_df.groupby('id')['price'].min()[listings_df['id']].values > price_999per)*1

listings_df.loc[:, 'drop_indicator'] = listings_df['drop_indicator'].replace({ 0: low_price})
listings_df.loc[:, 'drop_indicator'] = listings_df['drop_indicator'].replace({ 0: high_price})

##  Drop criteria 3: Property *never* lists a day of availability

In [44]:
never_avail = (listings_df.groupby('id')['availability_365'].max()[listings_df['id']].values == 0)*1
listings_df.loc[:, 'drop_indicator'] = listings_df['drop_indicator'].replace({ 0: never_avail})

## Drop criteria 4: Minimum nights is 30 days or more (no longer a short-term rental)

In [45]:
long_term_rental = (listings_df.groupby(['id'])['minimum_nights'].min()[listings_df['id']].values >= 30)*1
listings_df.loc[:, 'drop_indicator'] = listings_df['drop_indicator'].replace({ 0: long_term_rental})

##  Drop criteria 5: Hotel indicator

In [46]:
hotel_ind = (listings_df.groupby(['id'])['hotel_dum'].max()[listings_df['id']].values == 1)*1
listings_df.loc[:, 'drop_indicator'] = listings_df['drop_indicator'].replace({ 0: hotel_ind})

## Drop counts

In [47]:
print("Unique properties flagged for drop:")
print(listings_df.groupby('id')['drop_indicator'].max().values.sum())

print("Unique properties not flagged for drop:")
print((1 - listings_df.groupby('id')['drop_indicator'].max().values).sum())

Unique properties flagged for drop:
2792
Unique properties not flagged for drop:
8515


-------
# -- Save to compressed csv --

In [48]:
os.chdir(csv_save_path)
listings_df.to_csv(city_abbrev + '_1stStageClean.csv.gz', compression='gzip', index=False, date_format='%Y-%m-%d %H:%M:%S')