In [2]:
import pandas as pd
import numpy as np

# Pandas intro

Looking at Sacramento Real Estate data

Tasks:
- check the data types
- check the column names
- Rename "sq__ft" to "sq_ft"
- Title-case "city" (e.g. "SACRAMENTO" → "Sacramento")
- Turn the sale date into "YYYY-MM-DD" format

In [4]:
# Load the raw transactions CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("data/Sacramentorealestatetransactions.csv")

In [5]:
# Preview the first five rows to sanity-check the load.
df.head()

Unnamed: 0,street,city,zip,state,beds,baths,sq__ft,type,sale_date,price,latitude,longitude
0,3526 HIGH ST,SACRAMENTO,95838,CA,2,1,836,Residential,Wed May 21 00:00:00 EDT 2008,59222,38.631913,-121.434879
1,51 OMAHA CT,SACRAMENTO,95823,CA,3,1,1167,Residential,Wed May 21 00:00:00 EDT 2008,68212,38.478902,-121.431028
2,2796 BRANCH ST,SACRAMENTO,95815,CA,2,1,796,Residential,Wed May 21 00:00:00 EDT 2008,68880,38.618305,-121.443839
3,2805 JANETTE WAY,SACRAMENTO,95815,CA,2,1,852,Residential,Wed May 21 00:00:00 EDT 2008,69307,38.616835,-121.439146
4,6001 MCMAHON DR,SACRAMENTO,95824,CA,2,1,797,Residential,Wed May 21 00:00:00 EDT 2008,81900,38.51947,-121.435768


In [8]:
# read_csv hands back a pandas DataFrame.
type(df)

pandas.core.frame.DataFrame

In [6]:
# Selecting a single column by label (single brackets) returns a Series.
df["street"]

0             3526 HIGH ST
1              51 OMAHA CT
2           2796 BRANCH ST
3         2805 JANETTE WAY
4          6001 MCMAHON DR
              ...         
980     9169 GARLINGTON CT
981        6932 RUSKUT WAY
982      7933 DAFFODIL WAY
983       8304 RED FOX WAY
984    3882 YELLOWSTONE LN
Name: street, Length: 985, dtype: object

In [7]:
# Confirm that single-label selection yields a Series, not a DataFrame.
type(df["street"])

pandas.core.series.Series

In [9]:
# Passing a list of labels (double brackets) keeps the result a DataFrame, even for one column.
df[["street"]]

Unnamed: 0,street
0,3526 HIGH ST
1,51 OMAHA CT
2,2796 BRANCH ST
3,2805 JANETTE WAY
4,6001 MCMAHON DR
...,...
980,9169 GARLINGTON CT
981,6932 RUSKUT WAY
982,7933 DAFFODIL WAY
983,8304 RED FOX WAY


In [10]:
# Attribute-style access returns the same Series as df["street"].
df.street

0             3526 HIGH ST
1              51 OMAHA CT
2           2796 BRANCH ST
3         2805 JANETTE WAY
4          6001 MCMAHON DR
              ...         
980     9169 GARLINGTON CT
981        6932 RUSKUT WAY
982      7933 DAFFODIL WAY
983       8304 RED FOX WAY
984    3882 YELLOWSTONE LN
Name: street, Length: 985, dtype: object

In [11]:
# Attribute access (df.street) is not recommended: it cannot address column names
# that contain white space. Use df["street"] instead.

# Data Cleaning

In [12]:
# Work on an independent deep copy so the raw frame `df` stays untouched.
df_copy = df.copy(deep=True)

In [13]:
# Don't write `df_copy = df`: that only creates a second name for the same DataFrame,
# so any change made through df_copy would also change df.

In [14]:
# Task: check the data types. Note that sale_date is stored as object (text), not as a datetime.
df_copy.dtypes

street        object
city          object
zip            int64
state         object
beds           int64
baths          int64
sq__ft         int64
type          object
sale_date     object
price          int64
latitude     float64
longitude    float64
dtype: object

In [16]:
# Preview the copy before cleaning; it matches the raw frame.
df_copy.head()

Unnamed: 0,street,city,zip,state,beds,baths,sq__ft,type,sale_date,price,latitude,longitude
0,3526 HIGH ST,SACRAMENTO,95838,CA,2,1,836,Residential,Wed May 21 00:00:00 EDT 2008,59222,38.631913,-121.434879
1,51 OMAHA CT,SACRAMENTO,95823,CA,3,1,1167,Residential,Wed May 21 00:00:00 EDT 2008,68212,38.478902,-121.431028
2,2796 BRANCH ST,SACRAMENTO,95815,CA,2,1,796,Residential,Wed May 21 00:00:00 EDT 2008,68880,38.618305,-121.443839
3,2805 JANETTE WAY,SACRAMENTO,95815,CA,2,1,852,Residential,Wed May 21 00:00:00 EDT 2008,69307,38.616835,-121.439146
4,6001 MCMAHON DR,SACRAMENTO,95824,CA,2,1,797,Residential,Wed May 21 00:00:00 EDT 2008,81900,38.51947,-121.435768


In [17]:
# Task: check the column names. Note the double underscore in 'sq__ft'.
df_copy.columns

Index(['street', 'city', 'zip', 'state', 'beds', 'baths', 'sq__ft', 'type',
       'sale_date', 'price', 'latitude', 'longitude'],
      dtype='object')

In [18]:
# Column labels live in a pandas Index object rather than a plain list.
type(df_copy.columns)

pandas.core.indexes.base.Index

In [19]:
# An Index can be converted to an ordinary Python list.
list(df_copy.columns)

['street',
 'city',
 'zip',
 'state',
 'beds',
 'baths',
 'sq__ft',
 'type',
 'sale_date',
 'price',
 'latitude',
 'longitude']

In [20]:
# Iterating over the Index in a list comprehension yields the same labels.
[col for col in df_copy.columns]

['street',
 'city',
 'zip',
 'state',
 'beds',
 'baths',
 'sq__ft',
 'type',
 'sale_date',
 'price',
 'latitude',
 'longitude']

In [21]:
# Each label is a plain str, so string methods such as .replace() are available on it.
type([col for col in df_copy.columns][0])

str

In [22]:
# Dry run: build the cleaned labels without assigning them to the frame yet.
[col.replace("__", "_") for col in df_copy.columns]

['street',
 'city',
 'zip',
 'state',
 'beds',
 'baths',
 'sq_ft',
 'type',
 'sale_date',
 'price',
 'latitude',
 'longitude']

In [23]:
# Apply the rename for real: literal (non-regex) replacement of "__" with "_" on every
# column label, i.e. the vectorised form of the list comprehension previewed above.
df_copy.columns = df_copy.columns.str.replace("__", "_", regex=False)

In [24]:
# Verify the rename: 'sq__ft' should now read 'sq_ft'.
df_copy.columns

Index(['street', 'city', 'zip', 'state', 'beds', 'baths', 'sq_ft', 'type',
       'sale_date', 'price', 'latitude', 'longitude'],
      dtype='object')

In [25]:
# Sanity check: renaming must not change the number of columns.
len(df_copy.columns)

12

## Title-case "city"

In [28]:
# Check the type of the first city value; it is a str, so the .str accessor applies.
type(df_copy["city"][0])

str

In [29]:
# Inspect the raw values: city names are stored in upper case.
df_copy["city"]

0           SACRAMENTO
1           SACRAMENTO
2           SACRAMENTO
3           SACRAMENTO
4           SACRAMENTO
            ...       
980         SACRAMENTO
981         SACRAMENTO
982     CITRUS HEIGHTS
983          ELK GROVE
984    EL DORADO HILLS
Name: city, Length: 985, dtype: object

In [30]:
# Dry run: .str.title() returns a title-cased Series and leaves the frame unchanged.
df_copy["city"].str.title()

0           Sacramento
1           Sacramento
2           Sacramento
3           Sacramento
4           Sacramento
            ...       
980         Sacramento
981         Sacramento
982     Citrus Heights
983          Elk Grove
984    El Dorado Hills
Name: city, Length: 985, dtype: object

In [31]:
# Store the title-cased names back into the column.
df_copy["city"] = df_copy["city"].str.title()

In [32]:
# Confirm the column now holds title-cased names.
df_copy["city"]

0           Sacramento
1           Sacramento
2           Sacramento
3           Sacramento
4           Sacramento
            ...       
980         Sacramento
981         Sacramento
982     Citrus Heights
983          Elk Grove
984    El Dorado Hills
Name: city, Length: 985, dtype: object

## Turn 'sale_date' into a 'YYYY-MM-DD' format

In [33]:
# Inspect the raw strings: weekday, month, day, time, timezone abbreviation, year.
df_copy["sale_date"]

0      Wed May 21 00:00:00 EDT 2008
1      Wed May 21 00:00:00 EDT 2008
2      Wed May 21 00:00:00 EDT 2008
3      Wed May 21 00:00:00 EDT 2008
4      Wed May 21 00:00:00 EDT 2008
                   ...             
980    Thu May 15 00:00:00 EDT 2008
981    Thu May 15 00:00:00 EDT 2008
982    Thu May 15 00:00:00 EDT 2008
983    Thu May 15 00:00:00 EDT 2008
984    Thu May 15 00:00:00 EDT 2008
Name: sale_date, Length: 985, dtype: object

In [37]:
# Convert "Wed May 21 00:00:00 EDT 2008" style strings to "YYYY-MM-DD" text.
# The timezone abbreviation is stripped before parsing: "EDT" has no fixed meaning for
# the parser, so it is either ignored or resolved via the machine's local zone (a
# behaviour pandas has deprecated), which makes the parse environment-dependent.
# Only the calendar date as written is wanted, so no timezone conversion is applied.
# Values already in "YYYY-MM-DD" form pass through unchanged, so the cell can be re-run.
df_copy["sale_date"] = (
    df_copy["sale_date"]
    .str.replace(r" [A-Z]{2,5} (?=\d{4}$)", " ", regex=True)  # drop the " EDT " before the year
    .pipe(pd.to_datetime)
    .dt.strftime("%Y-%m-%d")
)

In [38]:
# Final check: sq_ft renamed, city title-cased, sale_date in YYYY-MM-DD form.
df_copy.head()

Unnamed: 0,street,city,zip,state,beds,baths,sq_ft,type,sale_date,price,latitude,longitude
0,3526 HIGH ST,Sacramento,95838,CA,2,1,836,Residential,2008-05-21,59222,38.631913,-121.434879
1,51 OMAHA CT,Sacramento,95823,CA,3,1,1167,Residential,2008-05-21,68212,38.478902,-121.431028
2,2796 BRANCH ST,Sacramento,95815,CA,2,1,796,Residential,2008-05-21,68880,38.618305,-121.443839
3,2805 JANETTE WAY,Sacramento,95815,CA,2,1,852,Residential,2008-05-21,69307,38.616835,-121.439146
4,6001 MCMAHON DR,Sacramento,95824,CA,2,1,797,Residential,2008-05-21,81900,38.51947,-121.435768


# Recap

The same cleaning steps as above, condensed and applied to a freshly loaded frame.

In [39]:
# Step 1: read the raw CSV into a fresh frame (same file as `df` above)
df_1 = pd.read_csv("data/Sacramentorealestatetransactions.csv")

In [40]:
# Step 2: collapse double underscores in the column labels ("sq__ft" -> "sq_ft")
df_1.columns = df_1.columns.str.replace("__", "_", regex=False)

In [41]:
# Step 3: title-case the city names (e.g. "SACRAMENTO" -> "Sacramento")
df_1["city"] = df_1["city"].str.title()

In [48]:
# Step 4: rewrite sale_date as "YYYY-MM-DD" text.
# The timezone abbreviation ("EDT") is stripped before parsing: the parser cannot map it
# to a fixed offset, so it is either ignored or resolved via the machine's local zone (a
# behaviour pandas has deprecated), which makes the parse environment-dependent.
# Only the calendar date as written is wanted, so no timezone conversion is applied.
# Values already in "YYYY-MM-DD" form pass through unchanged, so the cell can be re-run.
df_1["sale_date"] = (
    df_1["sale_date"]
    .str.replace(r" [A-Z]{2,5} (?=\d{4}$)", " ", regex=True)  # drop the " EDT " before the year
    .pipe(pd.to_datetime)
    .dt.strftime("%Y-%m-%d")
)

In [45]:
# Uncomment to export the cleaned frame as CSV (index=False leaves out the row index):
#df_1.to_csv("data/clean_sacramento.csv", index=False)

In [46]:
# Uncomment to export the cleaned frame as a pickle file instead:
#df_1.to_pickle("data/clean_sacramento.pkl")