# Intro to pandas

## Sacramento Real Estate Transactions dataset cleaning

In [3]:
import pandas as pd

## Steps

1. Load the csv file
2. Run some transformations
    - Title-case the `city` column
    - Convert numerical columns to `int` and `float`
    - Rename `sq__ft` to `sq_m` and convert its values from square feet to square metres
    - Convert `sale_date` to format 'YYYY-MM-DD'
    
    
**1. Load data**

In [5]:
type(pd.read_csv("data/sacramento_data.csv"))

pandas.core.frame.DataFrame

In [8]:
# Load the raw CSV into `df`. Later cells work on copies (`df.copy()`), so `df`
# itself keeps the data exactly as it was read from disk.
df = pd.read_csv("data/sacramento_data.csv")

In [23]:
df.head()

Unnamed: 0,street,city,zip,state,beds,baths,sq__ft,type,sale_date,price,latitude,longitude
0,3526 HIGH ST,SACRAMENTO,95838,CA,2,1,836,Residential,Wed May 21 00:00:00 EDT 2008,59222,38.631913,-121.434879
1,51 OMAHA CT,SACRAMENTO,95823,CA,3,1,1167,Residential,Wed May 21 00:00:00 EDT 2008,68212,38.478902,-121.431028
2,2796 BRANCH ST,SACRAMENTO,95815,CA,2,1,796,Residential,Wed May 21 00:00:00 EDT 2008,68880,38.618305,-121.443839
3,2805 JANETTE WAY,SACRAMENTO,95815,CA,2,1,852,Residential,Wed May 21 00:00:00 EDT 2008,69307,38.616835,-121.439146
4,6001 MCMAHON DR,SACRAMENTO,95824,CA,2,1,797,Residential,Wed May 21 00:00:00 EDT 2008,81900,38.51947,-121.435768


In [24]:
df["street"]

0             3526 HIGH ST
1              51 OMAHA CT
2           2796 BRANCH ST
3         2805 JANETTE WAY
4          6001 MCMAHON DR
              ...         
980     9169 GARLINGTON CT
981        6932 RUSKUT WAY
982      7933 DAFFODIL WAY
983       8304 RED FOX WAY
984    3882 YELLOWSTONE LN
Name: street, Length: 985, dtype: object

In [27]:
type(df["street"])

pandas.core.series.Series

In [28]:
df["street"].head()

0        3526 HIGH ST
1         51 OMAHA CT
2      2796 BRANCH ST
3    2805 JANETTE WAY
4     6001 MCMAHON DR
Name: street, dtype: object

In [29]:
df[["street"]]

Unnamed: 0,street
0,3526 HIGH ST
1,51 OMAHA CT
2,2796 BRANCH ST
3,2805 JANETTE WAY
4,6001 MCMAHON DR
...,...
980,9169 GARLINGTON CT
981,6932 RUSKUT WAY
982,7933 DAFFODIL WAY
983,8304 RED FOX WAY


In [30]:
type(df[["street"]])

pandas.core.frame.DataFrame

**Start the cleaning process**

Create a copy first

In [31]:
df_clean = df.copy()

**1. Rename `sq__ft` to `sq_m`**

In [33]:
df_clean.columns

Index(['street', 'city', 'zip', 'state', 'beds', 'baths', 'sq__ft', 'type',
       'sale_date', 'price', 'latitude', 'longitude'],
      dtype='object')

In [34]:
type(df_clean.columns)

pandas.core.indexes.base.Index

In [35]:
list(df_clean.columns)

['street',
 'city',
 'zip',
 'state',
 'beds',
 'baths',
 'sq__ft',
 'type',
 'sale_date',
 'price',
 'latitude',
 'longitude']

In [37]:
['sq_m' if col == 'sq__ft' else col for col in df_clean.columns]

['street',
 'city',
 'zip',
 'state',
 'beds',
 'baths',
 'sq_m',
 'type',
 'sale_date',
 'price',
 'latitude',
 'longitude']

In [38]:
# Build the replacement label list first, then assign it: setting `.columns`
# relabels df_clean itself, swapping only the 'sq__ft' label for 'sq_m'.
renamed_columns = [
    "sq_m" if name == "sq__ft" else name
    for name in df_clean.columns
]
df_clean.columns = renamed_columns

In [39]:
df_clean.head()

Unnamed: 0,street,city,zip,state,beds,baths,sq_m,type,sale_date,price,latitude,longitude
0,3526 HIGH ST,SACRAMENTO,95838,CA,2,1,836,Residential,Wed May 21 00:00:00 EDT 2008,59222,38.631913,-121.434879
1,51 OMAHA CT,SACRAMENTO,95823,CA,3,1,1167,Residential,Wed May 21 00:00:00 EDT 2008,68212,38.478902,-121.431028
2,2796 BRANCH ST,SACRAMENTO,95815,CA,2,1,796,Residential,Wed May 21 00:00:00 EDT 2008,68880,38.618305,-121.443839
3,2805 JANETTE WAY,SACRAMENTO,95815,CA,2,1,852,Residential,Wed May 21 00:00:00 EDT 2008,69307,38.616835,-121.439146
4,6001 MCMAHON DR,SACRAMENTO,95824,CA,2,1,797,Residential,Wed May 21 00:00:00 EDT 2008,81900,38.51947,-121.435768


Alternatively:

In [41]:
# reset
df_clean = df.copy()

In [42]:
df_clean.head()

Unnamed: 0,street,city,zip,state,beds,baths,sq__ft,type,sale_date,price,latitude,longitude
0,3526 HIGH ST,SACRAMENTO,95838,CA,2,1,836,Residential,Wed May 21 00:00:00 EDT 2008,59222,38.631913,-121.434879
1,51 OMAHA CT,SACRAMENTO,95823,CA,3,1,1167,Residential,Wed May 21 00:00:00 EDT 2008,68212,38.478902,-121.431028
2,2796 BRANCH ST,SACRAMENTO,95815,CA,2,1,796,Residential,Wed May 21 00:00:00 EDT 2008,68880,38.618305,-121.443839
3,2805 JANETTE WAY,SACRAMENTO,95815,CA,2,1,852,Residential,Wed May 21 00:00:00 EDT 2008,69307,38.616835,-121.439146
4,6001 MCMAHON DR,SACRAMENTO,95824,CA,2,1,797,Residential,Wed May 21 00:00:00 EDT 2008,81900,38.51947,-121.435768


In [48]:
df_clean.rename(columns={"sq__ft": "sq_m"})

Unnamed: 0,street,city,zip,state,beds,baths,sq_m,type,sale_date,price,latitude,longitude
0,3526 HIGH ST,SACRAMENTO,95838,CA,2,1,836,Residential,Wed May 21 00:00:00 EDT 2008,59222,38.631913,-121.434879
1,51 OMAHA CT,SACRAMENTO,95823,CA,3,1,1167,Residential,Wed May 21 00:00:00 EDT 2008,68212,38.478902,-121.431028
2,2796 BRANCH ST,SACRAMENTO,95815,CA,2,1,796,Residential,Wed May 21 00:00:00 EDT 2008,68880,38.618305,-121.443839
3,2805 JANETTE WAY,SACRAMENTO,95815,CA,2,1,852,Residential,Wed May 21 00:00:00 EDT 2008,69307,38.616835,-121.439146
4,6001 MCMAHON DR,SACRAMENTO,95824,CA,2,1,797,Residential,Wed May 21 00:00:00 EDT 2008,81900,38.519470,-121.435768
...,...,...,...,...,...,...,...,...,...,...,...,...
980,9169 GARLINGTON CT,SACRAMENTO,95829,CA,4,3,2280,Residential,Thu May 15 00:00:00 EDT 2008,232425,38.457679,-121.359620
981,6932 RUSKUT WAY,SACRAMENTO,95823,CA,3,2,1477,Residential,Thu May 15 00:00:00 EDT 2008,234000,38.499893,-121.458890
982,7933 DAFFODIL WAY,CITRUS HEIGHTS,95610,CA,3,2,1216,Residential,Thu May 15 00:00:00 EDT 2008,235000,38.708824,-121.256803
983,8304 RED FOX WAY,ELK GROVE,95758,CA,4,2,1685,Residential,Thu May 15 00:00:00 EDT 2008,235301,38.417000,-121.397424


In [49]:
df_clean

Unnamed: 0,street,city,zip,state,beds,baths,sq__ft,type,sale_date,price,latitude,longitude
0,3526 HIGH ST,SACRAMENTO,95838,CA,2,1,836,Residential,Wed May 21 00:00:00 EDT 2008,59222,38.631913,-121.434879
1,51 OMAHA CT,SACRAMENTO,95823,CA,3,1,1167,Residential,Wed May 21 00:00:00 EDT 2008,68212,38.478902,-121.431028
2,2796 BRANCH ST,SACRAMENTO,95815,CA,2,1,796,Residential,Wed May 21 00:00:00 EDT 2008,68880,38.618305,-121.443839
3,2805 JANETTE WAY,SACRAMENTO,95815,CA,2,1,852,Residential,Wed May 21 00:00:00 EDT 2008,69307,38.616835,-121.439146
4,6001 MCMAHON DR,SACRAMENTO,95824,CA,2,1,797,Residential,Wed May 21 00:00:00 EDT 2008,81900,38.519470,-121.435768
...,...,...,...,...,...,...,...,...,...,...,...,...
980,9169 GARLINGTON CT,SACRAMENTO,95829,CA,4,3,2280,Residential,Thu May 15 00:00:00 EDT 2008,232425,38.457679,-121.359620
981,6932 RUSKUT WAY,SACRAMENTO,95823,CA,3,2,1477,Residential,Thu May 15 00:00:00 EDT 2008,234000,38.499893,-121.458890
982,7933 DAFFODIL WAY,CITRUS HEIGHTS,95610,CA,3,2,1216,Residential,Thu May 15 00:00:00 EDT 2008,235000,38.708824,-121.256803
983,8304 RED FOX WAY,ELK GROVE,95758,CA,4,2,1685,Residential,Thu May 15 00:00:00 EDT 2008,235301,38.417000,-121.397424


In [50]:
# `rename` hands back a new frame (the previous cell shows df_clean still has
# 'sq__ft'), so rebind the name to keep the renamed result.
column_renames = {"sq__ft": "sq_m"}
df_clean = df_clean.rename(columns=column_renames)

In [51]:
df_clean.columns

Index(['street', 'city', 'zip', 'state', 'beds', 'baths', 'sq_m', 'type',
       'sale_date', 'price', 'latitude', 'longitude'],
      dtype='object')

In [52]:
a = [1,2,3]

In [53]:
a.pop()

3

In [54]:
a

[1, 2]

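Calling the `.pop()` method on a list is an in-place operation: it changes the list itself instead of returning a modified copy. `DataFrame.rename` only behaves this way when `inplace=True` is passed, as shown next.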

In [55]:
df_clean = df.copy()

In [61]:
# With inplace=True the rename is applied to df_clean itself, so the result is
# not assigned to anything (this cell displays no output).
df_clean.rename(columns={"sq__ft": "sq_m"}, inplace=True)

In [60]:
df_clean

Unnamed: 0,street,city,zip,state,beds,baths,sq_m,type,sale_date,price,latitude,longitude
0,3526 HIGH ST,SACRAMENTO,95838,CA,2,1,836,Residential,Wed May 21 00:00:00 EDT 2008,59222,38.631913,-121.434879
1,51 OMAHA CT,SACRAMENTO,95823,CA,3,1,1167,Residential,Wed May 21 00:00:00 EDT 2008,68212,38.478902,-121.431028
2,2796 BRANCH ST,SACRAMENTO,95815,CA,2,1,796,Residential,Wed May 21 00:00:00 EDT 2008,68880,38.618305,-121.443839
3,2805 JANETTE WAY,SACRAMENTO,95815,CA,2,1,852,Residential,Wed May 21 00:00:00 EDT 2008,69307,38.616835,-121.439146
4,6001 MCMAHON DR,SACRAMENTO,95824,CA,2,1,797,Residential,Wed May 21 00:00:00 EDT 2008,81900,38.519470,-121.435768
...,...,...,...,...,...,...,...,...,...,...,...,...
980,9169 GARLINGTON CT,SACRAMENTO,95829,CA,4,3,2280,Residential,Thu May 15 00:00:00 EDT 2008,232425,38.457679,-121.359620
981,6932 RUSKUT WAY,SACRAMENTO,95823,CA,3,2,1477,Residential,Thu May 15 00:00:00 EDT 2008,234000,38.499893,-121.458890
982,7933 DAFFODIL WAY,CITRUS HEIGHTS,95610,CA,3,2,1216,Residential,Thu May 15 00:00:00 EDT 2008,235000,38.708824,-121.256803
983,8304 RED FOX WAY,ELK GROVE,95758,CA,4,2,1685,Residential,Thu May 15 00:00:00 EDT 2008,235301,38.417000,-121.397424


---

### Title `city` column

In [66]:
df_clean["city"].str.title()

0           Sacramento
1           Sacramento
2           Sacramento
3           Sacramento
4           Sacramento
            ...       
980         Sacramento
981         Sacramento
982     Citrus Heights
983          Elk Grove
984    El Dorado Hills
Name: city, Length: 985, dtype: object

Title-case the `city` column using a vectorized string operation (`.str.title()` is applied to the whole column at once, with no explicit loop).

In [72]:
# Store the title-cased values back into the column; the preview cell above
# only displayed them without changing df_clean.
df_clean["city"] = df_clean["city"].str.title()

In [74]:
df_clean.head()

Unnamed: 0,street,city,zip,state,beds,baths,sq_m,type,sale_date,price,latitude,longitude
0,3526 HIGH ST,Sacramento,95838,CA,2,1,836,Residential,Wed May 21 00:00:00 EDT 2008,59222,38.631913,-121.434879
1,51 OMAHA CT,Sacramento,95823,CA,3,1,1167,Residential,Wed May 21 00:00:00 EDT 2008,68212,38.478902,-121.431028
2,2796 BRANCH ST,Sacramento,95815,CA,2,1,796,Residential,Wed May 21 00:00:00 EDT 2008,68880,38.618305,-121.443839
3,2805 JANETTE WAY,Sacramento,95815,CA,2,1,852,Residential,Wed May 21 00:00:00 EDT 2008,69307,38.616835,-121.439146
4,6001 MCMAHON DR,Sacramento,95824,CA,2,1,797,Residential,Wed May 21 00:00:00 EDT 2008,81900,38.51947,-121.435768


### Convert data types

In [76]:
df_clean.dtypes

street        object
city          object
zip            int64
state         object
beds           int64
baths          int64
sq_m           int64
type          object
sale_date     object
price          int64
latitude     float64
longitude    float64
dtype: object

In [77]:
type(df_clean.dtypes)

pandas.core.series.Series

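Pandas automatically inferred sensible data types for the numeric columns (`int64` and `float64`). Note that `sale_date` was read as `object` (plain strings), not as a date type; it is converted in a later section.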

However, if I wanted to change it, how would I do it?

In [87]:
df_clean["latitude"].astype(int)

0      38
1      38
2      38
3      38
4      38
       ..
980    38
981    38
982    38
983    38
984    38
Name: latitude, Length: 985, dtype: int64

### Convert sale_date to format 'YYYY-MM-DD'

In [91]:
df_clean["sale_date"]

0      Wed May 21 00:00:00 EDT 2008
1      Wed May 21 00:00:00 EDT 2008
2      Wed May 21 00:00:00 EDT 2008
3      Wed May 21 00:00:00 EDT 2008
4      Wed May 21 00:00:00 EDT 2008
                   ...             
980    Thu May 15 00:00:00 EDT 2008
981    Thu May 15 00:00:00 EDT 2008
982    Thu May 15 00:00:00 EDT 2008
983    Thu May 15 00:00:00 EDT 2008
984    Thu May 15 00:00:00 EDT 2008
Name: sale_date, Length: 985, dtype: object

In [93]:
pd.to_datetime(df_clean["sale_date"])

0     2008-05-21
1     2008-05-21
2     2008-05-21
3     2008-05-21
4     2008-05-21
         ...    
980   2008-05-15
981   2008-05-15
982   2008-05-15
983   2008-05-15
984   2008-05-15
Name: sale_date, Length: 985, dtype: datetime64[ns]


In [94]:
pd.to_datetime(df_clean["sale_date"])[0]

Timestamp('2008-05-21 00:00:00')

In [96]:
type(pd.to_datetime(df_clean["sale_date"])[0])

pandas._libs.tslibs.timestamps.Timestamp

In [98]:
# Overwrite the string column with the parsed values; the preview above shows
# the result as datetime64[ns], with the "EDT" suffix no longer displayed.
# NOTE(review): how the "EDT" abbreviation is interpreted may depend on the
# pandas version — confirm before relying on time-zone information.
df_clean["sale_date"] = pd.to_datetime(df_clean["sale_date"])



In [100]:
df_clean.dtypes

street               object
city                 object
zip                   int64
state                object
beds                  int64
baths                 int64
sq_m                  int64
type                 object
sale_date    datetime64[ns]
price                 int64
latitude            float64
longitude           float64
dtype: object

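### Convert `sq_m` values from square feet to square metres

The column was renamed to `sq_m` earlier, but its values are still in square feet.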

In [104]:
df_clean["sq_m"] / 10.764

0       77.666295
1      108.416945
2       73.950204
3       79.152731
4       74.043107
          ...    
980    211.817168
981    137.216648
982    112.969156
983    156.540320
984    126.532887
Name: sq_m, Length: 985, dtype: float64

In [105]:
# Square feet per square metre (the same rounded factor used in the preview cell above).
SQ_FT_PER_SQ_M = 10.764

# Compute the metric area from the untouched `sq__ft` column of the raw frame `df`
# instead of dividing `df_clean["sq_m"]` by itself. The original form shrank the
# values again on every re-run of this cell; this form always gives the same result.
# df_clean was created with df.copy(), so both frames share the same row index.
df_clean["sq_m"] = df["sq__ft"] / SQ_FT_PER_SQ_M

In [110]:
# Keep the stored areas to two decimal places.
df_clean["sq_m"] = df_clean["sq_m"].round(2)

In [115]:
# Write the cleaned frame to a separate file so the raw CSV is left untouched;
# index=False leaves the row index out of the output.
df_clean.to_csv("data/sacramento_clean.csv", index=False)