In [1]:
import pandas as pd

### Sacramento pandas
1. Load the csv
2. run some transformations
    - Capitalize 'city'
    - convert to int:
        - 'zip'
        - 'beds'
        - 'baths'
        - 'sq__ft'
        - 'price'
    - convert to float:
        - 'latitude'
        - 'longitude'
    - rename 'sq__ft' to 'sq_ft'
    - convert 'sale_date' into format: "YYYY-MM-DD"

In [2]:
# Load the raw transactions file; the path is relative to the notebook's working directory.
df = pd.read_csv("Sacramentorealestatetransactions.csv")

In [3]:
# Preview the first five rows of the raw frame.
df.head(5)

Unnamed: 0,street,city,zip,state,beds,baths,sq__ft,type,sale_date,price,latitude,longitude
0,3526 HIGH ST,SACRAMENTO,95838,CA,2,1,836,Residential,Wed May 21 00:00:00 EDT 2008,59222,38.631913,-121.434879
1,51 OMAHA CT,SACRAMENTO,95823,CA,3,1,1167,Residential,Wed May 21 00:00:00 EDT 2008,68212,38.478902,-121.431028
2,2796 BRANCH ST,SACRAMENTO,95815,CA,2,1,796,Residential,Wed May 21 00:00:00 EDT 2008,68880,38.618305,-121.443839
3,2805 JANETTE WAY,SACRAMENTO,95815,CA,2,1,852,Residential,Wed May 21 00:00:00 EDT 2008,69307,38.616835,-121.439146
4,6001 MCMAHON DR,SACRAMENTO,95824,CA,2,1,797,Residential,Wed May 21 00:00:00 EDT 2008,81900,38.51947,-121.435768


In [6]:
# Selecting with a single label (single brackets): check which type comes back.
type(df['street'])

pandas.core.series.Series

I'm a Series!

In [7]:
# First five values of the 'street' column, selected with a single label.
df['street'].head(5)

0        3526 HIGH ST
1         51 OMAHA CT
2      2796 BRANCH ST
3    2805 JANETTE WAY
4     6001 MCMAHON DR
Name: street, dtype: object

In [8]:
# Selecting with a list of labels (double brackets): check which type comes back.
type(df[['street']])

pandas.core.frame.DataFrame

In [9]:
# Same column as above, but selected with a one-element list of labels.
df[['street']].head(5)

Unnamed: 0,street
0,3526 HIGH ST
1,51 OMAHA CT
2,2796 BRANCH ST
3,2805 JANETTE WAY
4,6001 MCMAHON DR


In [10]:
# Work on a copy so every cleaning step below modifies `df_clean`
# and the raw frame `df` stays as it was loaded.
df_clean = df.copy()

### 1. Replace `__` in `sq__ft` with `_`

In [11]:
# The column labels are exposed through the `columns` attribute.
df_clean.columns

Index(['street', 'city', 'zip', 'state', 'beds', 'baths', 'sq__ft', 'type',
       'sale_date', 'price', 'latitude', 'longitude'],
      dtype='object')

In [12]:
# Wrapping the columns in list() gives a plain Python list of the labels.
list(df_clean.columns)

['street',
 'city',
 'zip',
 'state',
 'beds',
 'baths',
 'sq__ft',
 'type',
 'sale_date',
 'price',
 'latitude',
 'longitude']

In [13]:
# Check the concrete type of the `columns` attribute.
type(df_clean.columns)

pandas.core.indexes.base.Index

In [14]:
# The columns object is iterable: each iteration yields one column label.
for i in df_clean.columns:
    print(i)

street
city
zip
state
beds
baths
sq__ft
type
sale_date
price
latitude
longitude


In [15]:
# Collapse every double underscore in the column labels to a single one
# ('sq__ft' -> 'sq_ft'); labels without "__" are left as they are.
# regex=False makes this a literal substring replacement.
df_clean.columns = df_clean.columns.str.replace("__", "_", regex=False)

In [16]:
# Preview the frame to confirm the column label changed.
df_clean.head(5)

Unnamed: 0,street,city,zip,state,beds,baths,sq_ft,type,sale_date,price,latitude,longitude
0,3526 HIGH ST,SACRAMENTO,95838,CA,2,1,836,Residential,Wed May 21 00:00:00 EDT 2008,59222,38.631913,-121.434879
1,51 OMAHA CT,SACRAMENTO,95823,CA,3,1,1167,Residential,Wed May 21 00:00:00 EDT 2008,68212,38.478902,-121.431028
2,2796 BRANCH ST,SACRAMENTO,95815,CA,2,1,796,Residential,Wed May 21 00:00:00 EDT 2008,68880,38.618305,-121.443839
3,2805 JANETTE WAY,SACRAMENTO,95815,CA,2,1,852,Residential,Wed May 21 00:00:00 EDT 2008,69307,38.616835,-121.439146
4,6001 MCMAHON DR,SACRAMENTO,95824,CA,2,1,797,Residential,Wed May 21 00:00:00 EDT 2008,81900,38.51947,-121.435768


### 2. Capitalize the 'city' column

In [17]:
# Read from df_clean itself rather than the raw `df`: `df` is rebound to an
# unrelated frame further down the notebook, so depending on it here makes
# this cell fail (or misalign) when it is re-run out of order.
# str.title() upper-cases the first letter of every word, so a multi-word
# city name keeps a capital on each word (str.capitalize() would lower-case
# everything after the first letter of the whole string).
df_clean['city'] = df_clean['city'].str.title()

In [18]:
# Preview the frame to check the new casing of 'city' (head() defaults to five rows).
df_clean.head()

Unnamed: 0,street,city,zip,state,beds,baths,sq_ft,type,sale_date,price,latitude,longitude
0,3526 HIGH ST,Sacramento,95838,CA,2,1,836,Residential,Wed May 21 00:00:00 EDT 2008,59222,38.631913,-121.434879
1,51 OMAHA CT,Sacramento,95823,CA,3,1,1167,Residential,Wed May 21 00:00:00 EDT 2008,68212,38.478902,-121.431028
2,2796 BRANCH ST,Sacramento,95815,CA,2,1,796,Residential,Wed May 21 00:00:00 EDT 2008,68880,38.618305,-121.443839
3,2805 JANETTE WAY,Sacramento,95815,CA,2,1,852,Residential,Wed May 21 00:00:00 EDT 2008,69307,38.616835,-121.439146
4,6001 MCMAHON DR,Sacramento,95824,CA,2,1,797,Residential,Wed May 21 00:00:00 EDT 2008,81900,38.51947,-121.435768


### 3. Convert column types

In [19]:
# Inspect the dtype pandas inferred for each column.
df_clean.dtypes

street        object
city          object
zip            int64
state         object
beds           int64
baths          int64
sq_ft          int64
type          object
sale_date     object
price          int64
latitude     float64
longitude    float64
dtype: object

##### Pandas already inferred the right types when using pd.read_csv().
##### How would I change it if I wanted to?

In [20]:
# astype() returns a new Series; the result is not assigned, so df_clean is unchanged.
# Casting float -> int drops the fractional part (38.63... becomes 38).
df_clean['latitude'].astype(int)

0      38
1      38
2      38
3      38
4      38
       ..
980    38
981    38
982    38
983    38
984    38
Name: latitude, Length: 985, dtype: int64

### 4. Convert weird date string into 'YYYY-MM-DD' representation

In [23]:
# Parse strings such as "Wed May 21 00:00:00 EDT 2008" into datetime64 values.
# An explicit format replaces `infer_datetime_format`, which is deprecated in
# pandas 2.x, and avoids the per-row dateutil fallback (and its timezone
# warnings) triggered by the "EDT" abbreviation. "EDT" is matched as literal
# text, so the result is timezone-naive and a row carrying any other
# abbreviation raises instead of being silently mis-parsed.
# Midnight timestamps display as YYYY-MM-DD; use
# df_clean["sale_date"].dt.strftime("%Y-%m-%d") if plain strings are needed.
df_clean["sale_date"] = pd.to_datetime(
    df_clean["sale_date"],
    format="%a %b %d %H:%M:%S EDT %Y",
)

In [24]:
# Persist the cleaned frame next to the notebook in two formats.
# The pickle keeps dtypes (including the parsed dates) exactly as they are.
df_clean.to_pickle("clean_sacramento.pkl")
# index=False: the row index is just the default 0..n-1 counter; writing it
# would add an unnamed first column that comes back as "Unnamed: 0" on reload.
df_clean.to_csv("clean_sacramento.csv", index=False)

### General pandas stuff, plus a little bit of numpy

In [25]:
import numpy as np

In [26]:
# Draw a 1-D array of 5 random floats from the half-open interval [0.0, 1.0).
# No seed is set, so the values differ on every run.
np.random.random_sample((5,))

array([0.19631511, 0.16732279, 0.53471967, 0.81024961, 0.97323313])

In [27]:
# The shape tuple controls the result's dimensions: here 3 rows by 2 columns.
np.random.random_sample((3, 2))

array([[0.51088162, 0.75615828],
       [0.7265686 , 0.13379729],
       [0.31584311, 0.71959571]])

In [28]:
# Build a 10 x 4 toy frame of random floats, one column per label below.
# NOTE: this rebinds `df`, which until now referred to the raw Sacramento frame.
col_names = [
    "banana_density",
    "banana_mass",
    "apple_density",
    "apple_mass",
]
df = pd.DataFrame(
    data=np.random.random_sample(size=(10, 4)),
    columns=col_names,
)

In [29]:
# Display the whole toy frame (only 10 rows, so no head() needed).
df

Unnamed: 0,banana_density,banana_mass,apple_density,apple_mass
0,0.785977,0.667365,0.746387,0.777638
1,0.232206,0.506532,0.064825,0.040395
2,0.889128,0.020756,0.63502,0.086881
3,0.63316,0.256072,0.556503,0.52187
4,0.738725,0.852042,0.946617,0.533175
5,0.52431,0.085193,0.071486,0.926304
6,0.454528,0.391825,0.35808,0.341557
7,0.507302,0.515186,0.838678,0.479325
8,0.927702,0.100503,0.519008,0.173575
9,0.860125,0.91674,0.688232,0.430108


### Subsetting
##### 1. Columns

In [30]:
# Select several columns at once by passing a list of labels.
# NOTE(review): "apple_mass" is listed twice, so that column is returned twice
# in the result — confirm the repetition is intentional.
apples = ["apple_density", "apple_mass", "apple_mass"]
df[apples]

Unnamed: 0,apple_density,apple_mass,apple_mass.1
0,0.746387,0.777638,0.777638
1,0.064825,0.040395,0.040395
2,0.63502,0.086881,0.086881
3,0.556503,0.52187,0.52187
4,0.946617,0.533175,0.533175
5,0.071486,0.926304,0.926304
6,0.35808,0.341557,0.341557
7,0.838678,0.479325,0.479325
8,0.519008,0.173575,0.173575
9,0.688232,0.430108,0.430108


In [31]:
# Same selection with the list written inline: the outer brackets index the
# frame, the inner brackets are the list of labels.
df[["apple_density", "apple_mass", "apple_mass"]]

Unnamed: 0,apple_density,apple_mass,apple_mass.1
0,0.746387,0.777638,0.777638
1,0.064825,0.040395,0.040395
2,0.63502,0.086881,0.086881
3,0.556503,0.52187,0.52187
4,0.946617,0.533175,0.533175
5,0.071486,0.926304,0.926304
6,0.35808,0.341557,0.341557
7,0.838678,0.479325,0.479325
8,0.519008,0.173575,0.173575
9,0.688232,0.430108,0.430108


In [32]:
# Sorting
# sort_values returns a new, sorted frame; the result is not assigned here,
# so `df` keeps its original row order. Rows keep their original index labels.

df.sort_values("banana_density", ascending = False)

Unnamed: 0,banana_density,banana_mass,apple_density,apple_mass
8,0.927702,0.100503,0.519008,0.173575
2,0.889128,0.020756,0.63502,0.086881
9,0.860125,0.91674,0.688232,0.430108
0,0.785977,0.667365,0.746387,0.777638
4,0.738725,0.852042,0.946617,0.533175
3,0.63316,0.256072,0.556503,0.52187
5,0.52431,0.085193,0.071486,0.926304
7,0.507302,0.515186,0.838678,0.479325
6,0.454528,0.391825,0.35808,0.341557
1,0.232206,0.506532,0.064825,0.040395


In [34]:
# Ten random floats in [0.0, 1.0) as a 1-D array, later used as a new column.
some_new_column = np.random.random_sample(size=10)
some_new_column

array([0.77562064, 0.84419233, 0.55763352, 0.39798726, 0.46243954,
       0.82707826, 0.54558493, 0.40242167, 0.05692707, 0.36291078])

In [35]:
# Adding a column
# Assigning a NumPy array is positional: element k goes to the k-th row,
# so the array length must match the number of rows.

df["mango_mass"] = some_new_column

In [36]:
# Wrapping the array in a Series attaches a default 0..n-1 index to the values.
pd.Series(some_new_column)

0    0.775621
1    0.844192
2    0.557634
3    0.397987
4    0.462440
5    0.827078
6    0.545585
7    0.402422
8    0.056927
9    0.362911
dtype: float64

In [37]:
# The frame now carries the extra mango_mass column.
df

Unnamed: 0,banana_density,banana_mass,apple_density,apple_mass,mango_mass
0,0.785977,0.667365,0.746387,0.777638,0.775621
1,0.232206,0.506532,0.064825,0.040395,0.844192
2,0.889128,0.020756,0.63502,0.086881,0.557634
3,0.63316,0.256072,0.556503,0.52187,0.397987
4,0.738725,0.852042,0.946617,0.533175,0.46244
5,0.52431,0.085193,0.071486,0.926304,0.827078
6,0.454528,0.391825,0.35808,0.341557,0.545585
7,0.507302,0.515186,0.838678,0.479325,0.402422
8,0.927702,0.100503,0.519008,0.173575,0.056927
9,0.860125,0.91674,0.688232,0.430108,0.362911


In [39]:
# Keep the sorted result under a new name; rows retain their original index labels.
df_new = df.sort_values("banana_density", ascending = False)

In [40]:
# Assigning a Series aligns on index labels, not on position: the value with
# label k lands on the row labelled k, even though df_new's rows are in sorted
# (shuffled-label) order. Contrast with assigning the raw array, which is positional.
df_new["mango_mass"] = pd.Series(some_new_column)

In [41]:
# Inspect the result: mango_mass values follow the index labels, not the row order.
df_new

Unnamed: 0,banana_density,banana_mass,apple_density,apple_mass,mango_mass
8,0.927702,0.100503,0.519008,0.173575,0.056927
2,0.889128,0.020756,0.63502,0.086881,0.557634
9,0.860125,0.91674,0.688232,0.430108,0.362911
0,0.785977,0.667365,0.746387,0.777638,0.775621
4,0.738725,0.852042,0.946617,0.533175,0.46244
3,0.63316,0.256072,0.556503,0.52187,0.397987
5,0.52431,0.085193,0.071486,0.926304,0.827078
7,0.507302,0.515186,0.838678,0.479325,0.402422
6,0.454528,0.391825,0.35808,0.341557,0.545585
1,0.232206,0.506532,0.064825,0.040395,0.844192


In [44]:
# Renumber the rows 0..n-1 in their current order; drop=True discards the old
# labels instead of keeping them as an extra column.
df_new = df_new.reset_index(drop = True)

In [45]:
# Same rows as before, now labelled 0..9 from top to bottom.
df_new

Unnamed: 0,banana_density,banana_mass,apple_density,apple_mass,mango_mass
0,0.927702,0.100503,0.519008,0.173575,0.056927
1,0.889128,0.020756,0.63502,0.086881,0.557634
2,0.860125,0.91674,0.688232,0.430108,0.362911
3,0.785977,0.667365,0.746387,0.777638,0.775621
4,0.738725,0.852042,0.946617,0.533175,0.46244
5,0.63316,0.256072,0.556503,0.52187,0.397987
6,0.52431,0.085193,0.071486,0.926304,0.827078
7,0.507302,0.515186,0.838678,0.479325,0.402422
8,0.454528,0.391825,0.35808,0.341557,0.545585
9,0.232206,0.506532,0.064825,0.040395,0.844192


In [46]:
# Method chain wrapped in parentheses so each step sits on its own line:
# sort ascending by mango_mass, then renumber the rows.
df_super_new = (df_new
                .sort_values("mango_mass")
                .reset_index(drop = True))

In [47]:
# Display the frame sorted by mango_mass with a fresh 0..9 index.
df_super_new

Unnamed: 0,banana_density,banana_mass,apple_density,apple_mass,mango_mass
0,0.927702,0.100503,0.519008,0.173575,0.056927
1,0.860125,0.91674,0.688232,0.430108,0.362911
2,0.63316,0.256072,0.556503,0.52187,0.397987
3,0.507302,0.515186,0.838678,0.479325,0.402422
4,0.738725,0.852042,0.946617,0.533175,0.46244
5,0.454528,0.391825,0.35808,0.341557,0.545585
6,0.889128,0.020756,0.63502,0.086881,0.557634
7,0.785977,0.667365,0.746387,0.777638,0.775621
8,0.52431,0.085193,0.071486,0.926304,0.827078
9,0.232206,0.506532,0.064825,0.040395,0.844192


In [48]:
# Comparing a column to a scalar is element-wise and yields a boolean Series (a mask).
df_super_new["banana_mass"] < 0.5

0     True
1    False
2     True
3    False
4    False
5     True
6     True
7    False
8     True
9    False
Name: banana_mass, dtype: bool

In [49]:
# Indexing the frame with a boolean Series keeps only the rows where the mask is True.
df_super_new[df_super_new["banana_mass"] < 0.5]

Unnamed: 0,banana_density,banana_mass,apple_density,apple_mass,mango_mass
0,0.927702,0.100503,0.519008,0.173575,0.056927
2,0.63316,0.256072,0.556503,0.52187,0.397987
5,0.454528,0.391825,0.35808,0.341557,0.545585
6,0.889128,0.020756,0.63502,0.086881,0.557634
8,0.52431,0.085193,0.071486,0.926304,0.827078


In [50]:
# A plain list of booleans also works as a row mask; it has one entry per row (10 here).
df_super_new[[True, False, False, True, False, False, True, False, False, True]]

Unnamed: 0,banana_density,banana_mass,apple_density,apple_mass,mango_mass
0,0.927702,0.100503,0.519008,0.173575,0.056927
3,0.507302,0.515186,0.838678,0.479325,0.402422
6,0.889128,0.020756,0.63502,0.086881,0.557634
9,0.232206,0.506532,0.064825,0.040395,0.844192


In [53]:
# Row filter and column selection in one .loc call:
#   rows    -> mango_mass above 0.1 AND apple_density below 0.9
#   columns -> banana_mass and apple_density
# Each comparison is parenthesised because & binds tighter than > and <.
df_super_new.loc[
    (df_super_new["mango_mass"] > .1) & (df_super_new["apple_density"] < .9),
    ["banana_mass", "apple_density"],
]

Unnamed: 0,banana_mass,apple_density
1,0.91674,0.688232
2,0.256072,0.556503
3,0.515186,0.838678
5,0.391825,0.35808
6,0.020756,0.63502
7,0.667365,0.746387
8,0.085193,0.071486
9,0.506532,0.064825
