In [1]:
import pandas as pd

## Use `.apply` to send every value in a column to a function

You can use `.apply` on a single column to pass each of its values to a function, one at a time. This is useful when cleaning up data: converting formats, altering values, etc.

In [2]:
# What does our data look like? Load the CSV and preview the first five rows.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where this volume is mounted. Consider a configurable data directory.
path = '/Volumes/HDD/data/HousingSalesKC/kc_house_data.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [37]:
# Lookup table: bedroom count -> household label (used to build the `desc` column).
names_dict = {
    3: 'colleage',
    4: 'family',
    2: 'couples',
    1: 'single',
}

In [38]:
# Label each row by looking up its bedroom count in names_dict.
# Bedroom counts that are not keys of names_dict come out as NaN (Series.map
# behaviour for dict mappers).
df['desc'] = df['bedrooms'].map(names_dict)

In [11]:
# Get rid of $ and , in a money value (e.g. the SAL-RATE column), then convert it to a float
def money_to_float(money_str):
    """Convert a currency-formatted value such as "$1,234.50" to a float.

    Parameters
    ----------
    money_str : str, int, float or None
        Usually a string that may contain "$" signs and "," thousands
        separators. Numeric values are passed through unchanged (as float),
        so cells that pandas already parsed as numbers, or missing cells
        represented as NaN, do not crash the conversion.

    Returns
    -------
    float
        The numeric value; NaN when the input is None or NaN.

    Raises
    ------
    ValueError
        If the string is not a valid number once "$" and "," are removed.
    """
    # None has no .replace(); treat it as a missing value.
    if money_str is None:
        return float("nan")
    # Missing cells in a pandas column arrive as float NaN, and already-numeric
    # cells arrive as int/float; neither supports str.replace.
    if isinstance(money_str, (int, float)):
        return float(money_str)
    return float(money_str.replace("$", "").replace(",", ""))

#df['SAL-RATE'].apply(money_to_float)

In [4]:
# Save the result in a new column
# NOTE(review): the previews of `df` in this notebook do not show a 'SAL-RATE'
# column (some columns are elided with "..."), so this lookup presumably raises
# KeyError on a fresh Restart & Run All. Confirm which dataframe this cell was
# written for.
df['salary'] = df['SAL-RATE'].apply(money_to_float)

In [26]:
# Cast the household labels to strings.
# NOTE(review): astype(str) may turn missing labels (NaN) into the literal
# string 'nan' depending on the pandas version; confirm that is intended.
df['desc'] = df['desc'].astype(str)

In [39]:
# Copy the labels into a new column, swapping the exact value "family" for
# "Family"; every other label is carried over unchanged.
capitalised = df['desc'].replace(to_replace="family", value="Family")
df['desc2'] = capitalised
df['desc2']

0        colleage
1        colleage
2         couples
3          Family
4        colleage
           ...   
21608    colleage
21609      Family
21610     couples
21611    colleage
21612     couples
Name: desc2, Length: 21613, dtype: object

In [12]:
# Take a peek at the whole frame (the display is truncated to the first and last rows)
df

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,desc
0,7129300520,20141013T000000,221900.0,3,1.00,1180,5650,1.0,0,0,...,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,colleage
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639,colleage
2,5631500400,20150225T000000,180000.0,2,1.00,770,10000,1.0,0,0,...,770,0,1933,0,98028,47.7379,-122.233,2720,8062,couples
3,2487200875,20141209T000000,604000.0,4,3.00,1960,5000,1.0,0,0,...,1050,910,1965,0,98136,47.5208,-122.393,1360,5000,family
4,1954400510,20150218T000000,510000.0,3,2.00,1680,8080,1.0,0,0,...,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,colleage
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,263000018,20140521T000000,360000.0,3,2.50,1530,1131,3.0,0,0,...,1530,0,2009,0,98103,47.6993,-122.346,1530,1509,colleage
21609,6600060120,20150223T000000,400000.0,4,2.50,2310,5813,2.0,0,0,...,2310,0,2014,0,98146,47.5107,-122.362,1830,7200,family
21610,1523300141,20140623T000000,402101.0,2,0.75,1020,1350,2.0,0,0,...,1020,0,2009,0,98144,47.5944,-122.299,1020,2007,couples
21611,291310100,20150116T000000,400000.0,3,2.50,1600,2388,2.0,0,0,...,1600,0,2004,0,98027,47.5345,-122.069,1410,1287,colleage


## Use `.apply` with `axis=1` to send every single row to a function

You can also send an **entire row at a time** instead of just a single column. Use this if you need to use **multiple columns to get a result**.

In [17]:
# Build a small example frame from a list of dictionaries: each dictionary
# becomes one row and its keys become the column names.
rectangles = [
    {'height': 40, 'width': 10},
    {'height': 20, 'width': 9},
    {'height': 3.4, 'width': 4},
]
df2 = pd.DataFrame(rectangles)

df2

Unnamed: 0,height,width
0,40.0,10
1,20.0,9
2,3.4,4


In [18]:
# Multiply a row's height by its width to get the area
def calculate_area(row):
    """Return the area for one row.

    `row` is anything indexable by the keys 'height' and 'width'
    (for example a DataFrame row passed in by `.apply(..., axis=1)`).
    """
    height, width = row['height'], row['width']
    return height * width

# axis=1 hands calculate_area one row at a time; the result is a Series of areas.
df2.apply(calculate_area, axis=1)

0    400.0
1    180.0
2     13.6
dtype: float64

In [19]:
# Assign the row-wise result back to the frame as a new 'area' column, then show it
df2['area'] = df2.apply(calculate_area, axis=1)
df2

Unnamed: 0,height,width,area
0,40.0,10,400.0
1,20.0,9,180.0
2,3.4,4,13.6
