# Pandas
[Official website](https://pandas.pydata.org/)

In [13]:
from platform import python_version

print(f"Python notebook version: {python_version()}")


Python notebook version: 3.8.2


In [14]:
import numpy as np
import pandas as pd

In [15]:
# Check versions
print(f'Numpy library version: {np.__version__}')
print(f'Pandas library version: {pd.__version__}')

Numpy library version: 1.18.4
Pandas library version: 1.0.3


## Load data

Dataframe: index and series

In [16]:
# Create a DataFrame from a dictionary
df1 = pd.DataFrame([{'c1':10, 'c2':100}, {'c1':11,'c2':110}, {'c1':12,'c2':120}])
df1

Unnamed: 0,c1,c2
0,10,100
1,11,110
2,12,120


In [17]:
# Specifing columns
df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                   columns=['a', 'b', 'c'])
df2

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [18]:
# Define data sequence
dates = pd.date_range("today", periods=6)
# Create random number dimensinal matrix by np
num_arr = np.random.randn(6,4)
# Column could be anything, not only numbers
columns = ["a", "b", "c", "d"]
# Create data frame
df3 = pd.DataFrame(num_arr, index=dates, columns=columns)
df3

Unnamed: 0,a,b,c,d
2020-05-05 04:55:35.356593,-1.090768,1.120754,1.349866,-0.281359
2020-05-06 04:55:35.356593,-0.203348,1.0222,-1.365128,-1.21027
2020-05-07 04:55:35.356593,0.283643,1.644119,1.152541,0.106228
2020-05-08 04:55:35.356593,-1.856182,-1.56747,-0.502503,-0.794417
2020-05-09 04:55:35.356593,-0.868879,0.461857,0.997237,0.77824
2020-05-10 04:55:35.356593,0.948191,1.908986,0.916235,0.513582


In [19]:
# Show index
df3.index

DatetimeIndex(['2020-05-05 04:55:35.356593', '2020-05-06 04:55:35.356593',
               '2020-05-07 04:55:35.356593', '2020-05-08 04:55:35.356593',
               '2020-05-09 04:55:35.356593', '2020-05-10 04:55:35.356593'],
              dtype='datetime64[ns]', freq='D')

In [20]:
# and colums
df3.columns

Index(['a', 'b', 'c', 'd'], dtype='object')

In [21]:
# As Pandas DataFrame from a file: csv, xslx, txt
df = pd.read_csv("housing.csv")
df.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [22]:
# Show columns types
df.dtypes

longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
ocean_proximity        object
dtype: object

In [23]:
# Show some statistics
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [24]:
# Show the head of the table
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [25]:
# Or the tail
df.tail()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND
20639,-121.24,39.37,16.0,2785.0,616.0,1387.0,530.0,2.3886,89400.0,INLAND


In [26]:
# Check null
df3.isnull()

Unnamed: 0,a,b,c,d
2020-05-05 04:55:35.356593,False,False,False,False
2020-05-06 04:55:35.356593,False,False,False,False
2020-05-07 04:55:35.356593,False,False,False,False
2020-05-08 04:55:35.356593,False,False,False,False
2020-05-09 04:55:35.356593,False,False,False,False
2020-05-10 04:55:35.356593,False,False,False,False


## Read

In [27]:
# Read headers
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [28]:
# Auto integer index
df.index

RangeIndex(start=0, stop=20640, step=1)

In [29]:
# Read the firsrt ten row of a column
df["longitude"].iloc[0:10]

0   -122.23
1   -122.22
2   -122.24
3   -122.25
4   -122.25
5   -122.25
6   -122.25
7   -122.25
8   -122.26
9   -122.25
Name: longitude, dtype: float64

In [30]:
# Read a row by index location method
df.iloc[19]

longitude              -122.27
latitude                 37.84
housing_median_age          52
total_rooms               1503
total_bedrooms             298
population                 690
households                 275
median_income           2.6033
median_house_value      162900
ocean_proximity       NEAR BAY
Name: 19, dtype: object

## Iterate

In [31]:
for index, row in df1.iterrows():
    print(index, row["c1"])

0 10
1 11
2 12


## Filter

In [32]:
# Slicing dataframe
df.iloc[1:20]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
5,-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,NEAR BAY
6,-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591,299200.0,NEAR BAY
7,-122.25,37.84,52.0,3104.0,687.0,1157.0,647.0,3.12,241400.0,NEAR BAY
8,-122.26,37.84,42.0,2555.0,665.0,1206.0,595.0,2.0804,226700.0,NEAR BAY
9,-122.25,37.84,52.0,3549.0,707.0,1551.0,714.0,3.6912,261100.0,NEAR BAY
10,-122.26,37.85,52.0,2202.0,434.0,910.0,402.0,3.2031,281500.0,NEAR BAY


In [33]:
# and filter
df.iloc[1:20].loc[df["housing_median_age"] == 21.0]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY


In [34]:
# Locate group of row of specific one
df.loc[(df["ocean_proximity"] == "NEAR BAY") | (df["ocean_proximity"] == "INLAND")]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [35]:
# String like ...
df.loc[df["ocean_proximity"].str.contains("LAND")]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
954,-121.92,37.64,46.0,1280.0,209.0,512.0,208.0,5.1406,315600.0,INLAND
957,-121.90,37.66,18.0,7397.0,1137.0,3126.0,1115.0,6.4994,323000.0,INLAND
965,-121.88,37.68,23.0,2234.0,270.0,854.0,286.0,7.3330,337200.0,INLAND
967,-121.88,37.67,16.0,4070.0,624.0,1543.0,577.0,6.5214,311500.0,INLAND
968,-121.88,37.67,25.0,2244.0,301.0,937.0,324.0,6.4524,296900.0,INLAND
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [36]:
# String not like ... (not ! => ~)
df.loc[~df["ocean_proximity"].str.contains("LAND")]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20502,-118.68,34.33,45.0,121.0,25.0,67.0,27.0,2.9821,325000.0,<1H OCEAN
20503,-118.75,34.33,27.0,534.0,85.0,243.0,77.0,8.2787,330000.0,<1H OCEAN
20504,-118.73,34.29,11.0,5451.0,736.0,2526.0,752.0,7.3550,343900.0,<1H OCEAN
20505,-118.72,34.29,22.0,3266.0,529.0,1595.0,494.0,6.0368,248000.0,<1H OCEAN


In [37]:
import re
# Use regular expressions, ignoring capital cases (flags) if you want
df.loc[df["ocean_proximity"].str.contains("bay|ocean", regex=True, flags=re.I)]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20502,-118.68,34.33,45.0,121.0,25.0,67.0,27.0,2.9821,325000.0,<1H OCEAN
20503,-118.75,34.33,27.0,534.0,85.0,243.0,77.0,8.2787,330000.0,<1H OCEAN
20504,-118.73,34.29,11.0,5451.0,736.0,2526.0,752.0,7.3550,343900.0,<1H OCEAN
20505,-118.72,34.29,22.0,3266.0,529.0,1595.0,494.0,6.0368,248000.0,<1H OCEAN


## Sort

In [38]:
# One arguments
df.sort_values(["housing_median_age"], ascending=False)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
18881,-122.25,38.10,52.0,248.0,86.0,173.0,69.0,2.3000,109400.0,NEAR BAY
1671,-122.21,38.06,52.0,2735.0,559.0,1076.0,487.0,3.6154,155700.0,NEAR BAY
19116,-122.65,38.23,52.0,1735.0,347.0,712.0,343.0,3.1711,200800.0,<1H OCEAN
19516,-121.01,37.64,52.0,201.0,35.0,74.0,22.0,1.3036,75000.0,INLAND
16200,-121.27,37.95,52.0,1318.0,308.0,1368.0,310.0,1.8261,54600.0,INLAND
...,...,...,...,...,...,...,...,...,...,...
12077,-117.64,33.87,2.0,17470.0,2727.0,5964.0,1985.0,6.2308,257900.0,<1H OCEAN
18972,-122.00,38.23,1.0,2062.0,343.0,872.0,268.0,5.2636,191300.0,INLAND
12286,-116.95,33.86,1.0,6.0,2.0,8.0,2.0,1.6250,55000.0,INLAND
3130,-117.95,35.08,1.0,83.0,15.0,32.0,15.0,4.8750,141700.0,INLAND


In [39]:
# Or more
df.sort_values(["housing_median_age", "ocean_proximity"], ascending=[True, False])

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
3130,-117.95,35.08,1.0,83.0,15.0,32.0,15.0,4.8750,141700.0,INLAND
12286,-116.95,33.86,1.0,6.0,2.0,8.0,2.0,1.6250,55000.0,INLAND
18972,-122.00,38.23,1.0,2062.0,343.0,872.0,268.0,5.2636,191300.0,INLAND
19536,-120.93,37.65,1.0,2254.0,328.0,402.0,112.0,4.2500,189200.0,INLAND
15444,-117.26,33.19,2.0,2629.0,509.0,1044.0,522.0,4.2361,158500.0,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...,...
19266,-122.72,38.44,52.0,188.0,62.0,301.0,72.0,0.9437,129200.0,<1H OCEAN
19341,-122.87,38.62,52.0,1514.0,348.0,767.0,354.0,2.1903,160000.0,<1H OCEAN
19342,-122.86,38.61,52.0,1753.0,380.0,982.0,380.0,3.4013,183300.0,<1H OCEAN
20141,-119.06,34.36,52.0,1409.0,359.0,981.0,304.0,2.7951,199300.0,<1H OCEAN


## Aggregation

In [40]:
# Distinct value in a column
df["ocean_proximity"].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [41]:
# Group by and how
df.groupby(["ocean_proximity"]).mean().sort_values("median_income")

Unnamed: 0_level_0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
ocean_proximity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ISLAND,-118.354,33.358,42.4,1574.6,420.4,668.0,276.6,2.74442,380440.0
INLAND,-119.73299,36.731829,24.271867,2717.742787,533.881619,1391.046252,477.447565,3.208996,124805.392001
NEAR OCEAN,-119.332555,34.738439,29.347254,2583.700903,538.615677,1354.008653,501.244545,4.005785,249433.977427
NEAR BAY,-122.260694,37.801057,37.730131,2493.58952,514.182819,1230.317467,488.616157,4.172885,259212.31179
<1H OCEAN,-118.847766,34.560577,29.279225,2628.343586,546.539185,1520.290499,517.744965,4.230682,240084.285464


In [42]:
df.groupby(["ocean_proximity"]).std()

Unnamed: 0_level_0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
ocean_proximity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
<1H OCEAN,1.588888,1.467127,11.644453,2160.463696,427.911417,1185.848357,392.280718,2.001223,106124.292213
INLAND,1.90095,2.116073,12.01802,2385.831111,446.117778,1168.670126,392.252095,1.437465,70007.908494
ISLAND,0.070569,0.040866,13.164346,707.545264,169.320111,301.691067,113.200265,0.44418,80559.561816
NEAR BAY,0.147004,0.185434,13.070385,1830.817022,367.887605,885.899035,350.598369,2.017427,122818.537064
NEAR OCEAN,2.327307,2.275386,11.840371,1990.72476,376.320045,1005.563166,344.445256,2.010558,122477.145927


## Transformation

In [43]:
# Transpose
df3.T

Unnamed: 0,2020-05-05 04:55:35.356593,2020-05-06 04:55:35.356593,2020-05-07 04:55:35.356593,2020-05-08 04:55:35.356593,2020-05-09 04:55:35.356593,2020-05-10 04:55:35.356593
a,-1.090768,-0.203348,0.283643,-1.856182,-0.868879,0.948191
b,1.120754,1.0222,1.644119,-1.56747,0.461857,1.908986
c,1.349866,-1.365128,1.152541,-0.502503,0.997237,0.916235
d,-0.281359,-1.21027,0.106228,-0.794417,0.77824,0.513582


In [44]:
# Create new colums
df["value_by_income"] = df["median_house_value"] / df["median_income"]
# Show subset
df[["value_by_income", "median_house_value", "median_income"]]

Unnamed: 0,value_by_income,median_house_value,median_income
0,54365.060299,452600.0,8.3252
1,43185.486785,358500.0,8.3014
2,48515.997465,352100.0,7.2574
3,60480.941327,341300.0,5.6431
4,88970.932349,342200.0,3.8462
...,...,...,...
20635,50054.476703,78100.0,1.5603
20636,30154.881101,77100.0,2.5568
20637,54294.117647,92300.0,1.7000
20638,45362.039417,84700.0,1.8672


In [45]:
# Drop a column
df = df.drop(columns=["value_by_income"])
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [46]:
# Update our dataset filtering it. Reset indexes (reset_index)
df = df.loc[ (df["housing_median_age"] >= 10) & (df["housing_median_age"] <= 20)].reset_index(drop=True)
# Updated dataset 4988x10
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.29,37.81,20.0,835.0,161.0,290.0,133.0,2.4830,137500.0,NEAR BAY
1,-122.28,37.81,17.0,1237.0,462.0,762.0,439.0,0.9241,177500.0,NEAR BAY
2,-122.28,37.81,19.0,1207.0,243.0,721.0,207.0,1.1111,108300.0,NEAR BAY
3,-122.28,37.81,17.0,924.0,289.0,609.0,289.0,1.5000,162500.0,NEAR BAY
4,-122.27,37.81,10.0,875.0,348.0,546.0,330.0,0.7600,162500.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
4983,-121.45,39.26,15.0,2319.0,416.0,1047.0,385.0,3.1250,115600.0,INLAND
4984,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
4985,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
4986,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [47]:
# Change values
df.loc[df["ocean_proximity"] == "<1H OCEAN", "ocean_proximity"] = "< OCEAN"
# Check filter by
df.loc[df["ocean_proximity"].str.contains("<", regex=True)]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
60,-121.96,37.58,15.0,3575.0,597.0,1777.0,559.0,5.7192,283500.0,< OCEAN
61,-121.98,37.58,20.0,4126.0,1031.0,2079.0,975.0,3.6832,216900.0,< OCEAN
66,-121.99,37.56,18.0,5505.0,1005.0,2641.0,971.0,5.0000,269700.0,< OCEAN
67,-121.99,37.56,20.0,6462.0,1294.0,3288.0,1235.0,4.3393,231200.0,< OCEAN
68,-121.97,37.56,13.0,8918.0,1823.0,4518.0,1772.0,4.8052,254000.0,< OCEAN
...,...,...,...,...,...,...,...,...,...,...
4940,-118.68,34.27,16.0,4637.0,941.0,2476.0,878.0,4.0568,225200.0,< OCEAN
4941,-118.67,34.27,15.0,3221.0,659.0,1390.0,607.0,3.5313,191800.0,< OCEAN
4942,-118.67,34.27,10.0,3753.0,678.0,1859.0,660.0,4.9946,204600.0,< OCEAN
4943,-118.71,34.30,20.0,1586.0,187.0,699.0,209.0,6.5483,335000.0,< OCEAN


#### Operations with missing values

In [48]:
# Dataframe with null
df5 = pd.DataFrame(np.array([[1, 2, 3], [np.nan, 5, 6], [7, 8, 9]]),
                   columns=['a', 'b', 'c'])
df5.isnull()

Unnamed: 0,a,b,c
0,False,False,False
1,True,False,False
2,False,False,False


In [49]:
# Calculate mean and fill null values
df2.fillna(df2["a"].mean())

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [50]:
# Could drop this rows
df6 = pd.DataFrame(np.array([[1, 2, 3], [np.nan, 5, 6], [7, 8, 9]]),
                   columns=['a', 'b', 'c'])
df6.dropna(how="any")

Unnamed: 0,a,b,c
0,1.0,2.0,3.0
2,7.0,8.0,9.0


#### Remove duplicates

In [51]:
data = {"Name": ["James", "Alice", "Phil", "James"],
        "Age": [24, 28, 40, 24],
        "Sex": ["Male", "Female", "Male", "Male"]}
df_duplicates = pd.DataFrame(data)
df_duplicates

Unnamed: 0,Name,Age,Sex
0,James,24,Male
1,Alice,28,Female
2,Phil,40,Male
3,James,24,Male


In [52]:
df_without_duplicates = df_duplicates.drop_duplicates()
df_without_duplicates

Unnamed: 0,Name,Age,Sex
0,James,24,Male
1,Alice,28,Female
2,Phil,40,Male
