### Redfin Data Center

Information about housing market is extracted from https://www.redfin.com/news/data-center/

### We extract the data from our S3 bucket

In [1]:
import pandas as pd
import os

In [2]:
from private.s3_aws import access_key, secret_access_key

In [3]:
data= pd.read_csv('/Users/juliacaro/Documents/SoyHenry/GRUPO_07-PF_DS4/airflow-docker/datasets/raw_data/city_market_tracker.tsv',sep='\t')

In [None]:
data = pd.read_csv(f"s3://rawdatagrupo07/city_market_tracker.tsv",
    storage_options={
        "key": access_key,
        "secret": secret_access_key
    },sep='\t'
)

Data and metric definitions: https://www.redfin.com/news/data-center-metrics-definitions/#:~:text=Percent%20off%20market%20in%20two,an%20the%20lastest%20listing%20price.

In [4]:
data.head()

Unnamed: 0,period_begin,period_end,period_duration,region_type,region_type_id,table_id,is_seasonally_adjusted,region,city,state,...,sold_above_list_yoy,price_drops,price_drops_mom,price_drops_yoy,off_market_in_two_weeks,off_market_in_two_weeks_mom,off_market_in_two_weeks_yoy,parent_metro_region,parent_metro_region_metro_code,last_updated
0,2012-02-01,2012-02-29,30,place,6,12812,f,"Neptune City, NJ",Neptune City,New Jersey,...,-0.147619,,,,0.0,0.0,0.0,"New Brunswick, NJ",35154,2022-10-16 14:38:47
1,2015-11-01,2015-11-30,30,place,6,17415,f,"St. Marys, GA",St. Marys,Georgia,...,0.0,,,,0.0,0.0,0.0,"St. Marys, GA",41220,2022-10-16 14:38:47
2,2022-09-01,2022-09-30,30,place,6,15302,f,"Salem, MA",Salem,Massachusetts,...,0.066496,0.266667,-0.09697,-0.075439,0.285714,0.022556,-0.082707,"Boston, MA",14454,2022-10-16 14:38:47
3,2019-09-01,2019-09-30,30,place,6,10370,f,"Lakeside, OR",Lakeside,Oregon,...,0.0,0.4,0.066667,0.092308,0.0,-0.5,0.0,"Coos Bay, OR",18300,2022-10-16 14:38:47
4,2022-07-01,2022-07-31,30,place,6,20239,f,"Waterville, ME",Waterville,Maine,...,-0.5,0.444444,0.28655,0.256944,0.25,-0.083333,-0.178571,"Augusta, ME",12300,2022-10-16 14:38:47


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4374893 entries, 0 to 4374892
Data columns (total 58 columns):
 #   Column                          Dtype  
---  ------                          -----  
 0   period_begin                    object 
 1   period_end                      object 
 2   period_duration                 int64  
 3   region_type                     object 
 4   region_type_id                  int64  
 5   table_id                        int64  
 6   is_seasonally_adjusted          object 
 7   region                          object 
 8   city                            object 
 9   state                           object 
 10  state_code                      object 
 11  property_type                   object 
 12  property_type_id                int64  
 13  median_sale_price               float64
 14  median_sale_price_mom           float64
 15  median_sale_price_yoy           float64
 16  median_list_price               float64
 17  median_list_price_mom      

In [47]:
data.columns

Index(['period_begin', 'period_end', 'period_duration', 'region_type',
       'region_type_id', 'table_id', 'is_seasonally_adjusted', 'region',
       'city', 'state', 'state_code', 'property_type', 'property_type_id',
       'median_sale_price', 'median_sale_price_mom', 'median_sale_price_yoy',
       'median_list_price', 'median_list_price_mom', 'median_list_price_yoy',
       'median_ppsf', 'median_ppsf_mom', 'median_ppsf_yoy', 'median_list_ppsf',
       'median_list_ppsf_mom', 'median_list_ppsf_yoy', 'homes_sold',
       'homes_sold_mom', 'homes_sold_yoy', 'pending_sales',
       'pending_sales_mom', 'pending_sales_yoy', 'new_listings',
       'new_listings_mom', 'new_listings_yoy', 'inventory', 'inventory_mom',
       'inventory_yoy', 'months_of_supply', 'months_of_supply_mom',
       'months_of_supply_yoy', 'median_dom', 'median_dom_mom',
       'median_dom_yoy', 'avg_sale_to_list', 'avg_sale_to_list_mom',
       'avg_sale_to_list_yoy', 'sold_above_list', 'sold_above_list_mom',
 

In [6]:
data.period_begin = pd.to_datetime(data.period_begin)
data.period_end = pd.to_datetime(data.period_end)

In [7]:
data = data.rename(columns = {'period_begin':'PeriodBegin','period_end':'PeriodEnd','city':'City','state_code':'State','homes_sold':'HomesSold','homes_sold_mom':'HomesSold_mom', 'homes_sold_yoy':'HomesSold_yoy','inventory':'Inventory','inventory_mom':'Inventory_mom','inventory_yoy':'Inventory_yoy'})

We generate a table with the total of sales and the inventory during a given period and their differences year over year and month over month.

In [9]:
homes_sold_total = data[['PeriodBegin','PeriodEnd','City', 'State','HomesSold','HomesSold_mom', 'HomesSold_yoy','Inventory', 'Inventory_mom','Inventory_yoy']]

The dataset ***cities.csv*** has the city ids. This data is merged with the previous data to get the corresponding ids for each city.

In [10]:
cities = pd.read_csv(f"s3://cleandatagrupo07/cities.csv",
        storage_options={
            "key": access_key,
            "secret": secret_access_key
        }
    )


In [11]:
homes_sold_total_id = pd.merge(homes_sold_total, cities, how = 'inner', on=['State','City'])

#### Homes sold by month

In [25]:
homes_sold_by_month = homes_sold_total_id.copy()

In [26]:
homes_sold_by_month['Year'] = homes_sold_by_month.PeriodBegin.dt.year

In [27]:
homes_sold_by_month['Month'] = homes_sold_by_month.PeriodBegin.dt.month

In [28]:
homes_sold_by_month = homes_sold_by_month.groupby(['Unique_City_ID','Year','Month'],as_index=False).mean(numeric_only=True)

In [29]:
homes_sold_by_month.fillna(0,inplace=True)

In [30]:
homes_sold_by_month.rename(columns={'Unique_City_ID':'City'},inplace=True)

In [31]:
homes_sold_by_month.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1221967 entries, 0 to 1221966
Data columns (total 9 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   City           1221967 non-null  object 
 1   Year           1221967 non-null  int64  
 2   Month          1221967 non-null  int64  
 3   HomesSold      1221967 non-null  float64
 4   HomesSold_mom  1221967 non-null  float64
 5   HomesSold_yoy  1221967 non-null  float64
 6   Inventory      1221967 non-null  float64
 7   Inventory_mom  1221967 non-null  float64
 8   Inventory_yoy  1221967 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 83.9+ MB


In [12]:
homes_sold_total_id.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3335673 entries, 0 to 3335672
Data columns (total 12 columns):
 #   Column          Dtype         
---  ------          -----         
 0   PeriodBegin     datetime64[ns]
 1   PeriodEnd       datetime64[ns]
 2   City            object        
 3   State           object        
 4   HomesSold       float64       
 5   HomesSold_mom   float64       
 6   HomesSold_yoy   float64       
 7   Inventory       float64       
 8   Inventory_mom   float64       
 9   Inventory_yoy   float64       
 10  Unique_City_ID  object        
 11  County          object        
dtypes: datetime64[ns](2), float64(6), object(4)
memory usage: 330.8+ MB


In [13]:
homes_sold_total_id.columns

Index(['PeriodBegin', 'PeriodEnd', 'City', 'State', 'HomesSold',
       'HomesSold_mom', 'HomesSold_yoy', 'Inventory', 'Inventory_mom',
       'Inventory_yoy', 'Unique_City_ID', 'County'],
      dtype='object')

We filter the dataset just to keep information of 2022.

In [55]:
homes_sold_total_2022 = homes_sold_total_id[homes_sold_total_id['PeriodBegin'].dt.year==2022]

In [56]:
homes_sold_total_2022.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 264239 entries, 6 to 3335672
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   PeriodBegin     264239 non-null  datetime64[ns]
 1   PeriodEnd       264239 non-null  datetime64[ns]
 2   City            264239 non-null  object        
 3   State           264239 non-null  object        
 4   HomesSold       263804 non-null  float64       
 5   HomesSold_mom   223887 non-null  float64       
 6   HomesSold_yoy   223333 non-null  float64       
 7   Inventory       237641 non-null  float64       
 8   Inventory_mom   207924 non-null  float64       
 9   Inventory_yoy   204949 non-null  float64       
 10  Unique_City_ID  264239 non-null  object        
 11  County          264239 non-null  object        
dtypes: datetime64[ns](2), float64(6), object(4)
memory usage: 26.2+ MB


In [57]:
homes_sold_total_2022.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  homes_sold_total_2022.dropna(inplace=True)


We generate a table with the percent of listings that dropped their price in a given time period in 2022, the listing price for the given month and the differences year over year and month over month.

In [11]:
data = data.rename(columns = {'price_drops':'PriceDrops','price_drops_mom':'PriceDrops_mom','price_drops_yoy':'PriceDrops_yoy','median_list_price':'MedianListPrice'})

In [12]:
price_drops = data[['PeriodBegin','PeriodEnd','City', 'State','PriceDrops', 'PriceDrops_mom','PriceDrops_yoy','MedianListPrice']]


In [13]:
price_drops_2022 = price_drops[price_drops.PeriodBegin.dt.year==2022]

In [14]:
price_drops_id_2022 = pd.merge(price_drops_2022, cities, how = 'inner', on=['State','City'])

In [15]:
price_drops_id_2022.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 264239 entries, 0 to 264238
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   PeriodBegin      264239 non-null  datetime64[ns]
 1   PeriodEnd        264239 non-null  datetime64[ns]
 2   City             264239 non-null  object        
 3   State            264239 non-null  object        
 4   PriceDrops       168866 non-null  float64       
 5   PriceDrops_mom   135313 non-null  float64       
 6   PriceDrops_yoy   122596 non-null  float64       
 7   MedianListPrice  229924 non-null  float64       
 8   Unique_City_ID   264239 non-null  object        
 9   County           264239 non-null  object        
dtypes: datetime64[ns](2), float64(4), object(4)
memory usage: 22.2+ MB


In [16]:
price_drops_id_2022.dropna(inplace=True)

In [17]:
price_drops_id_2022.PriceDrops = price_drops_id_2022.PriceDrops*100
price_drops_id_2022.PriceDrops_mom = price_drops_id_2022.PriceDrops_mom*100
price_drops_id_2022.PriceDrops_yoy = price_drops_id_2022.PriceDrops_yoy*100