# Intro to Pandas

In [1]:
import pandas as pd

# Sacramento pandas

1. Load the csv
2. Run some transformations

    - Capitalize 'city'
    - Convert to int:
        - 'zip'
        - 'beds'
        - 'baths'
        - 'sq__ft'
        - 'price'
    - Convert to float:
        - 'latitude'
        - 'longitude'
    - Rename 'sq__ft' to 'sq_ft'
    - Convert 'sale_date' into the format "YYYY-MM-DD"


In [2]:
# Read the raw Sacramento transactions file (expected in the working directory).
raw_csv_name = "Sacramentorealestatetransactions.csv"
df = pd.read_csv(raw_csv_name)

In [5]:
# Peek at the first five rows (five is head()'s default).
df.head()

Unnamed: 0,street,city,zip,state,beds,baths,sq__ft,type,sale_date,price,latitude,longitude
0,3526 HIGH ST,SACRAMENTO,95838,CA,2,1,836,Residential,Wed May 21 00:00:00 EDT 2008,59222,38.631913,-121.434879
1,51 OMAHA CT,SACRAMENTO,95823,CA,3,1,1167,Residential,Wed May 21 00:00:00 EDT 2008,68212,38.478902,-121.431028
2,2796 BRANCH ST,SACRAMENTO,95815,CA,2,1,796,Residential,Wed May 21 00:00:00 EDT 2008,68880,38.618305,-121.443839
3,2805 JANETTE WAY,SACRAMENTO,95815,CA,2,1,852,Residential,Wed May 21 00:00:00 EDT 2008,69307,38.616835,-121.439146
4,6001 MCMAHON DR,SACRAMENTO,95824,CA,2,1,797,Residential,Wed May 21 00:00:00 EDT 2008,81900,38.51947,-121.435768


In [9]:
# Single brackets with one column label select that column as a Series.
type(df['street'])

pandas.core.series.Series

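I'm a Series! Indexing with a single column label (`df['street']`) returns a `pandas.Series`, whereas indexing with a list of labels (`df[['street']]`) returns a `DataFrame`.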

In [10]:
# First five values of the 'street' Series (five is head()'s default).
df["street"].head()

0        3526 HIGH ST
1         51 OMAHA CT
2      2796 BRANCH ST
3    2805 JANETTE WAY
4     6001 MCMAHON DR
Name: street, dtype: object

In [12]:
# Double brackets pass a *list* of labels, so the result stays a DataFrame
# even when the list holds a single column.
type(df[['street']])

pandas.core.frame.DataFrame

In [13]:
# First five rows of the one-column DataFrame (five is head()'s default).
df[["street"]].head()

Unnamed: 0,street
0,3526 HIGH ST
1,51 OMAHA CT
2,2796 BRANCH ST
3,2805 JANETTE WAY
4,6001 MCMAHON DR


In [19]:
# Clean on an independent deep copy so the frame loaded above stays untouched.
df_clean = df.copy(deep=True)

## 1. Replace "__" in 'sq__ft' with "_"

In [20]:
# The column labels live in an Index object.
df_clean.columns

Index(['street', 'city', 'zip', 'state', 'beds', 'baths', 'sq__ft', 'type',
       'sale_date', 'price', 'latitude', 'longitude'],
      dtype='object')

In [21]:
# The same labels as a plain Python list.
df_clean.columns.tolist()

['street',
 'city',
 'zip',
 'state',
 'beds',
 'baths',
 'sq__ft',
 'type',
 'sale_date',
 'price',
 'latitude',
 'longitude']

In [15]:
# Confirm what kind of object holds the column labels.
type(df_clean.columns)

pandas.core.indexes.base.Index

In [22]:
# Iterating over the column Index yields each label in order.
for column_label in df_clean.columns:
    print(column_label)

street
city
zip
state
beds
baths
sq__ft
type
sale_date
price
latitude
longitude


In [25]:
# Collapse every literal "__" in the column labels to "_" (sq__ft -> sq_ft),
# using the vectorised string accessor on the Index instead of a Python loop.
df_clean.columns = df_clean.columns.str.replace("__", "_", regex=False)

In [26]:
# Check the rename: the header should now read sq_ft.
df_clean.head()

Unnamed: 0,street,city,zip,state,beds,baths,sq_ft,type,sale_date,price,latitude,longitude
0,3526 HIGH ST,SACRAMENTO,95838,CA,2,1,836,Residential,Wed May 21 00:00:00 EDT 2008,59222,38.631913,-121.434879
1,51 OMAHA CT,SACRAMENTO,95823,CA,3,1,1167,Residential,Wed May 21 00:00:00 EDT 2008,68212,38.478902,-121.431028
2,2796 BRANCH ST,SACRAMENTO,95815,CA,2,1,796,Residential,Wed May 21 00:00:00 EDT 2008,68880,38.618305,-121.443839
3,2805 JANETTE WAY,SACRAMENTO,95815,CA,2,1,852,Residential,Wed May 21 00:00:00 EDT 2008,69307,38.616835,-121.439146
4,6001 MCMAHON DR,SACRAMENTO,95824,CA,2,1,797,Residential,Wed May 21 00:00:00 EDT 2008,81900,38.51947,-121.435768


## 2. Capitalize the 'city' column

In [30]:
# Title-case every word of the city name. `.str.capitalize()` upper-cases only
# the first character of the whole string and lower-cases the rest, so a
# multi-word name such as "ELK GROVE" would come out as "Elk grove".
df_clean['city'] = df_clean['city'].str.title()

In [40]:
# Check the result of the city transformation on the first rows.
df_clean.head()

Unnamed: 0,street,city,zip,state,beds,baths,sq_ft,type,sale_date,price,latitude,longitude
0,3526 HIGH ST,Sacramento,95838,CA,2,1,836,Residential,Wed May 21 00:00:00 EDT 2008,59222,38.631913,-121.434879
1,51 OMAHA CT,Sacramento,95823,CA,3,1,1167,Residential,Wed May 21 00:00:00 EDT 2008,68212,38.478902,-121.431028
2,2796 BRANCH ST,Sacramento,95815,CA,2,1,796,Residential,Wed May 21 00:00:00 EDT 2008,68880,38.618305,-121.443839
3,2805 JANETTE WAY,Sacramento,95815,CA,2,1,852,Residential,Wed May 21 00:00:00 EDT 2008,69307,38.616835,-121.439146
4,6001 MCMAHON DR,Sacramento,95824,CA,2,1,797,Residential,Wed May 21 00:00:00 EDT 2008,81900,38.51947,-121.435768


## 3. Convert column types

In [34]:
# One dtype per column; text columns are reported as `object`.
df_clean.dtypes

street        object
city          object
zip            int64
state         object
beds           int64
baths          int64
sq_ft          int64
type          object
sale_date     object
price          int64
latitude     float64
longitude    float64
dtype: object

Pandas already inferred the right types when using pd.read_csv().


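How would I change a column's type if I wanted to? Use `.astype()`, which returns a converted copy and leaves the original column unchanged: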

In [39]:
# Demo only: the converted Series is displayed, not stored back into df_clean.
# Casting float -> int drops the fractional part (38.631913 becomes 38).
df_clean['latitude'].astype(int)

0      38
1      38
2      38
3      38
4      38
       ..
980    38
981    38
982    38
983    38
984    38
Name: latitude, Length: 985, dtype: int64

## 4. Convert weird date string into 'YYYY-MM-DD' representation

In [42]:
# Raw values look like "Wed May 21 00:00:00 EDT 2008". Only the calendar date is
# needed, and the "EDT" abbreviation is not a timezone the parser can resolve
# reliably, so strip it and parse the remainder with an explicit format instead
# of relying on the deprecated `infer_datetime_format` guessing.
# The dtype guard keeps the cell safe to re-run once the column is converted.
if not pd.api.types.is_datetime64_any_dtype(df_clean["sale_date"]):
    sale_date_text = df_clean["sale_date"].str.replace(
        r"\s[A-Z]{3,4}(?=\s\d{4}$)", "", regex=True
    )
    df_clean["sale_date"] = pd.to_datetime(sale_date_text, format="%a %b %d %H:%M:%S %Y")

In [45]:
# Persist the cleaned frame: the pickle preserves dtypes, the CSV is the
# portable copy.
df_clean.to_pickle("clean_sacramento.pkl")
# index=False: the index is only the default 0..N-1 row counter; writing it
# would add a nameless extra column that reads back as "Unnamed: 0".
df_clean.to_csv("clean_sacramento.csv", index=False)

## General pandas stuff, plus a little bit of numpy

In [46]:
import numpy as np

In [49]:
# Five uniform random floats in [0, 1) as a 1-D array.
np.random.random_sample(size=5)

array([0.96508561, 0.52115325, 0.02012193, 0.79319021, 0.10121299])

In [50]:
# Same call with a 2-D shape: 3 rows by 2 columns of uniform random floats.
np.random.random_sample(size=(3, 2))

array([[0.86798315, 0.63401669],
       [0.04835692, 0.13852062],
       [0.48915355, 0.63238242]])

In [55]:
# Build a 10x4 toy frame of uniform random floats in [0, 1).
# A seeded Generator makes the values (and every output derived from them
# below) reproducible on Restart & Run All, and avoids touching NumPy's global
# random state.
# NOTE: this rebinds `df`, which previously held the Sacramento data.
col_names = ["banana_density", "banana_mass", "apple_density", "apple_mass"]
fruit_rng = np.random.default_rng(42)
df = pd.DataFrame(fruit_rng.random((10, len(col_names))), columns=col_names)

In [56]:
# Only 10 rows, so display the whole frame.
df

Unnamed: 0,banana_density,banana_mass,apple_density,apple_mass
0,0.115217,0.579536,0.570065,0.639582
1,0.801474,0.369596,0.885262,0.581397
2,0.099973,0.292971,0.697579,0.659084
3,0.985773,0.802603,0.650774,0.380358
4,0.218095,0.622017,0.729822,0.676884
5,0.37695,0.361918,0.438711,0.856053
6,0.103614,0.347819,0.62777,0.815891
7,0.920128,0.868313,0.35356,0.216111
8,0.059206,0.46822,0.017453,0.891227
9,0.669779,0.696239,0.270674,0.996573


## Subsetting

### 1. Columns

In [59]:
# Select columns by a list of labels; a label listed twice is returned twice.
apples = ["apple_density", "apple_mass", "apple_mass"]
df.loc[:, apples]

Unnamed: 0,apple_density,apple_mass,apple_mass.1
0,0.570065,0.639582,0.639582
1,0.885262,0.581397,0.581397
2,0.697579,0.659084,0.659084
3,0.650774,0.380358,0.380358
4,0.729822,0.676884,0.676884
5,0.438711,0.856053,0.856053
6,0.62777,0.815891,0.815891
7,0.35356,0.216111,0.216111
8,0.017453,0.891227,0.891227
9,0.270674,0.996573,0.996573


In [62]:
# Same selection with the label list written inline: the outer brackets index
# the frame, the inner brackets are the Python list of labels.
df[["apple_density", "apple_mass", "apple_mass"]]

Unnamed: 0,apple_density,apple_mass,apple_mass.1
0,0.570065,0.639582,0.639582
1,0.885262,0.581397,0.581397
2,0.697579,0.659084,0.659084
3,0.650774,0.380358,0.380358
4,0.729822,0.676884,0.676884
5,0.438711,0.856053,0.856053
6,0.62777,0.815891,0.815891
7,0.35356,0.216111,0.216111
8,0.017453,0.891227,0.891227
9,0.270674,0.996573,0.996573


In [66]:
# Sorting: rows ordered from highest to lowest banana_density.
# The sorted frame is only displayed; `df` itself keeps its original order.
df.sort_values(by="banana_density", ascending=False)

Unnamed: 0,banana_density,banana_mass,apple_density,apple_mass
3,0.985773,0.802603,0.650774,0.380358
7,0.920128,0.868313,0.35356,0.216111
1,0.801474,0.369596,0.885262,0.581397
9,0.669779,0.696239,0.270674,0.996573
5,0.37695,0.361918,0.438711,0.856053
4,0.218095,0.622017,0.729822,0.676884
0,0.115217,0.579536,0.570065,0.639582
6,0.103614,0.347819,0.62777,0.815891
2,0.099973,0.292971,0.697579,0.659084
8,0.059206,0.46822,0.017453,0.891227


In [72]:
# Ten uniform random floats in [0, 1) to use as a new column.
# Drawn from a seeded Generator so the values are reproducible across runs.
some_new_column = np.random.default_rng(7).random(10)
some_new_column

array([0.50452397, 0.07582705, 0.44090607, 0.58965361, 0.26251021,
       0.73751552, 0.18216337, 0.76155813, 0.65674264, 0.83288635])

In [74]:
# Adding a column: a plain ndarray carries no index, so its values are matched
# to the rows by position.
df = df.assign(mango_mass=some_new_column)

In [77]:
# Wrapping the array in a Series attaches a default 0..9 integer index.
pd.Series(some_new_column)

0    0.504524
1    0.075827
2    0.440906
3    0.589654
4    0.262510
5    0.737516
6    0.182163
7    0.761558
8    0.656743
9    0.832886
dtype: float64

In [75]:
# Display the frame, now including the mango_mass column.
df

Unnamed: 0,banana_density,banana_mass,apple_density,apple_mass,mango_mass
0,0.115217,0.579536,0.570065,0.639582,0.504524
1,0.801474,0.369596,0.885262,0.581397,0.075827
2,0.099973,0.292971,0.697579,0.659084,0.440906
3,0.985773,0.802603,0.650774,0.380358,0.589654
4,0.218095,0.622017,0.729822,0.676884,0.26251
5,0.37695,0.361918,0.438711,0.856053,0.737516
6,0.103614,0.347819,0.62777,0.815891,0.182163
7,0.920128,0.868313,0.35356,0.216111,0.761558
8,0.059206,0.46822,0.017453,0.891227,0.656743
9,0.669779,0.696239,0.270674,0.996573,0.832886


In [79]:
# Keep the descending-by-banana_density ordering in its own frame; the rows
# retain their original index labels.
df_new = df.sort_values(by="banana_density", ascending=False)

In [83]:
# Unlike a bare array, a Series has an index (0..9 here), so the assignment is
# aligned on index labels rather than on row position.
mango_series = pd.Series(some_new_column)
df_new["mango_mass"] = mango_series

In [84]:
# Each mango_mass value followed its index label, not its position in the sorted frame.
df_new

Unnamed: 0,banana_density,banana_mass,apple_density,apple_mass,mango_mass
3,0.985773,0.802603,0.650774,0.380358,0.589654
7,0.920128,0.868313,0.35356,0.216111,0.761558
1,0.801474,0.369596,0.885262,0.581397,0.075827
9,0.669779,0.696239,0.270674,0.996573,0.832886
5,0.37695,0.361918,0.438711,0.856053,0.737516
4,0.218095,0.622017,0.729822,0.676884,0.26251
0,0.115217,0.579536,0.570065,0.639582,0.504524
6,0.103614,0.347819,0.62777,0.815891,0.182163
2,0.099973,0.292971,0.697579,0.659084,0.440906
8,0.059206,0.46822,0.017453,0.891227,0.656743


In [86]:
# Renumber the rows 0..N-1; drop=True discards the old labels instead of
# keeping them as a column. The result is only displayed, not stored.
df_new.reset_index(drop=True)

Unnamed: 0,banana_density,banana_mass,apple_density,apple_mass,mango_mass
0,0.985773,0.802603,0.650774,0.380358,0.589654
1,0.920128,0.868313,0.35356,0.216111,0.761558
2,0.801474,0.369596,0.885262,0.581397,0.075827
3,0.669779,0.696239,0.270674,0.996573,0.832886
4,0.37695,0.361918,0.438711,0.856053,0.737516
5,0.218095,0.622017,0.729822,0.676884,0.26251
6,0.115217,0.579536,0.570065,0.639582,0.504524
7,0.103614,0.347819,0.62777,0.815891,0.182163
8,0.099973,0.292971,0.697579,0.659084,0.440906
9,0.059206,0.46822,0.017453,0.891227,0.656743


In [87]:
# Store the renumbered frame back under the same name to make the change stick.
df_new = df_new.reset_index(drop=True)

In [88]:
# The index now runs 0..9 in the sorted order.
df_new

Unnamed: 0,banana_density,banana_mass,apple_density,apple_mass,mango_mass
0,0.985773,0.802603,0.650774,0.380358,0.589654
1,0.920128,0.868313,0.35356,0.216111,0.761558
2,0.801474,0.369596,0.885262,0.581397,0.075827
3,0.669779,0.696239,0.270674,0.996573,0.832886
4,0.37695,0.361918,0.438711,0.856053,0.737516
5,0.218095,0.622017,0.729822,0.676884,0.26251
6,0.115217,0.579536,0.570065,0.639582,0.504524
7,0.103614,0.347819,0.62777,0.815891,0.182163
8,0.099973,0.292971,0.697579,0.659084,0.440906
9,0.059206,0.46822,0.017453,0.891227,0.656743


In [94]:
# Re-sort ascending by mango_mass, then renumber the rows 0..N-1.
ordered_by_mango = df_new.sort_values(by="mango_mass")
df_super_new = ordered_by_mango.reset_index(drop=True)

In [95]:
# Display the frame sorted by mango_mass with a fresh 0..9 index.
df_super_new

Unnamed: 0,banana_density,banana_mass,apple_density,apple_mass,mango_mass
0,0.801474,0.369596,0.885262,0.581397,0.075827
1,0.103614,0.347819,0.62777,0.815891,0.182163
2,0.218095,0.622017,0.729822,0.676884,0.26251
3,0.099973,0.292971,0.697579,0.659084,0.440906
4,0.115217,0.579536,0.570065,0.639582,0.504524
5,0.985773,0.802603,0.650774,0.380358,0.589654
6,0.059206,0.46822,0.017453,0.891227,0.656743
7,0.37695,0.361918,0.438711,0.856053,0.737516
8,0.920128,0.868313,0.35356,0.216111,0.761558
9,0.669779,0.696239,0.270674,0.996573,0.832886


In [99]:
# Element-wise comparison: one True/False per row, carrying the same index.
df_super_new["banana_mass"].lt(0.5)

0     True
1     True
2    False
3     True
4    False
5    False
6     True
7     True
8    False
9    False
Name: banana_mass, dtype: bool

In [98]:
# Keep only the rows where the boolean mask is True.
is_light_banana = df_super_new["banana_mass"] < 0.5
df_super_new.loc[is_light_banana]

Unnamed: 0,banana_density,banana_mass,apple_density,apple_mass,mango_mass
0,0.115217,0.579536,0.570065,0.639582,0.504524
1,0.801474,0.369596,0.885262,0.581397,0.075827
3,0.985773,0.802603,0.650774,0.380358,0.589654
6,0.103614,0.347819,0.62777,0.815891,0.182163
7,0.920128,0.868313,0.35356,0.216111,0.761558


In [101]:
# A plain Python list of booleans (one entry per row) works as a row mask too.
keep_row = [True, False, False, True, False, False, True, False, False, True]
df_super_new.loc[keep_row]

Unnamed: 0,banana_density,banana_mass,apple_density,apple_mass,mango_mass
0,0.115217,0.579536,0.570065,0.639582,0.504524
3,0.985773,0.802603,0.650774,0.380358,0.589654
6,0.103614,0.347819,0.62777,0.815891,0.182163
9,0.669779,0.696239,0.270674,0.996573,0.832886


In [113]:
# Combine two row conditions with & (each side parenthesised), then pick the
# rows and the two columns in a single .loc lookup.
row_filter = (df_super_new["mango_mass"] > .1) & (df_super_new["apple_density"] < .9)
df_super_new.loc[row_filter, ["banana_mass", "apple_density"]]

Unnamed: 0,banana_mass,apple_density
1,0.347819,0.62777
2,0.622017,0.729822
3,0.292971,0.697579
4,0.579536,0.570065
5,0.802603,0.650774
6,0.46822,0.017453
7,0.361918,0.438711
8,0.868313,0.35356
9,0.696239,0.270674
