In [1]:
# Turn on the `greedy` option of IPython's tab-completer (IPCompleter) for this kernel session.
%config IPCompleter.greedy=True

In [2]:
import pandas as pd

# Load the income table; the path is relative to the notebook's working directory.
income = pd.read_csv(filepath_or_buffer='Data/income.csv')


### Filtering


In [3]:
# Keep only the rows whose "Index" column equals "C" (boolean-mask indexing).

income[income["Index"] == "C"]

# Alternatively, pass the same mask to .loc together with an explicit "all columns" slice.

income.loc[income["Index"] == "C", :]

Unnamed: 0,Index,State,Y2002,Y2003,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013,Y2014,Y2015
4,C,California,1685349,1675807,1889570,1480280,1735069,1812546,1487315,1663809,1624509,1639670,1921845,1156536,1388461,1644607
5,C,Colorado,1343824,1878473,1886149,1236697,1871471,1814218,1875146,1752387,1913275,1665877,1491604,1178355,1383978,1330736
6,C,Connecticut,1610512,1232844,1181949,1518933,1841266,1976976,1764457,1972730,1968730,1945524,1228529,1582249,1503156,1718072


In [4]:
# State names of the rows whose "Index" is "C": pick the column directly inside .loc.

income.loc[income["Index"].eq("C"), "State"]

# Alternatively, filter the rows first and then take the "State" column of the result.

income.loc[income["Index"].eq("C"), :]["State"]

4     California
5       Colorado
6    Connecticut
Name: State, dtype: object

In [5]:
# Rows with Index "M" whose 2008 income (Y2008) is greater than 1500000.
# Each condition is parenthesised because & binds more tightly than == and >.

income.loc[(income["Index"] == "M") & (income["Y2008"] > 1500000), :]

Unnamed: 0,Index,State,Y2002,Y2003,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013,Y2014,Y2015
21,M,Massachusetts,1647582,1686259,1620601,1777250,1531641,1380529,1978904,1567651,1761048,1658538,1482203,1731917,1669749,1963337
22,M,Michigan,1295635,1149931,1601027,1340716,1729449,1567494,1990431,1575185,1267626,1274673,1709853,1815596,1965196,1646634
25,M,Missouri,1221316,1858368,1773451,1573967,1374863,1486197,1735099,1800620,1164202,1425363,1800052,1698105,1767835,1996005


In [6]:
# Rows whose Index is either "K" or "M": OR together two equality masks.

income.loc[income["Index"].eq("K") | income["Index"].eq("M"), :]

# Alternatively, isin() tests membership in a list of values with a single call.

income.loc[income["Index"].isin(["K", "M"]), :]


Unnamed: 0,Index,State,Y2002,Y2003,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013,Y2014,Y2015
16,K,Kansas,1509054,1290700,1522230,1532094,1104256,1863278,1949478,1561528,1550433,1465812,1882929,1410249,1930090,1385528
17,K,Kentucky,1813878,1448846,1800760,1250524,1137913,1911227,1301848,1956681,1350895,1512894,1916616,1878271,1722762,1913350
19,M,Maine,1582720,1678622,1208496,1912040,1438549,1330014,1295877,1969163,1627262,1706080,1437088,1318546,1116792,1529233
20,M,Maryland,1579713,1404700,1849798,1397738,1310270,1789128,1112765,1967225,1486246,1872327,1175819,1314343,1979529,1569566
21,M,Massachusetts,1647582,1686259,1620601,1777250,1531641,1380529,1978904,1567651,1761048,1658538,1482203,1731917,1669749,1963337
22,M,Michigan,1295635,1149931,1601027,1340716,1729449,1567494,1990431,1575185,1267626,1274673,1709853,1815596,1965196,1646634
23,M,Minnesota,1729921,1675204,1903907,1561839,1985692,1148621,1328133,1890633,1995304,1575533,1910216,1972021,1515366,1864553
24,M,Mississippi,1983285,1292558,1631325,1943311,1354579,1731643,1428291,1568049,1383227,1629132,1988270,1907777,1649668,1991232
25,M,Missouri,1221316,1858368,1773451,1573967,1374863,1486197,1735099,1800620,1164202,1425363,1800052,1698105,1767835,1996005
26,M,Montana,1877154,1540099,1332722,1273327,1625721,1983568,1251742,1592690,1350619,1520064,1185225,1465705,1110394,1125903


In [7]:
# query() evaluates a string expression in which column names can be used directly,
# so the data frame does not have to be repeated for every column reference.

income.query(expr="Y2005>1900000 & Y2009>1800000")

Unnamed: 0,Index,State,Y2002,Y2003,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013,Y2014,Y2015
19,M,Maine,1582720,1678622,1208496,1912040,1438549,1330014,1295877,1969163,1627262,1706080,1437088,1318546,1116792,1529233
39,R,Rhode Island,1501744,1942942,1266657,1961923,1835983,1234040,1151409,1993136,1983569,1781016,1909119,1531212,1990412,1611730
43,T,Texas,1520591,1310777,1957713,1907326,1873544,1655483,1785986,1827503,1447457,1978374,1882532,1698698,1646508,1705322


### Dealing with missing values


In [8]:
import numpy as np

# Small example frame with one missing value (the cost of Wheat) for the cells below.
mydata = dict(
    Crop=['Rice', 'Wheat', 'Barley', 'Maize'],
    Yield=[1010, 1025.2, 1404.2, 1251.7],
    cost=[102, np.nan, 20, 68],
)
crops = pd.DataFrame(data=mydata)
crops


Unnamed: 0,Crop,Yield,cost
0,Rice,1010.0,102.0
1,Wheat,1025.2,
2,Barley,1404.2,20.0
3,Maize,1251.7,68.0


In [9]:
# isna() (isnull() is an alias for it) gives True where a value is NaN;
# notna() / notnull() gives False for those same positions.

crops.isna()

Unnamed: 0,Crop,Yield,cost
0,False,False,False
1,False,False,True
2,False,False,False
3,False,False,False


In [10]:
# Element-wise "is present" mask: False only where the value is NaN (notnull() is an alias of notna()).
crops.notna()

Unnamed: 0,Crop,Yield,cost
0,True,True,True
1,True,True,False
2,True,True,True
3,True,True,True


In [11]:
# Number of missing values in each column (True counts as 1 when summed down the rows).

crops.isna().sum(axis=0)

Crop     0
Yield    0
cost     1
dtype: int64

In [12]:
# Rows in which `cost` is missing.

crops.loc[crops["cost"].isna(), :]

Unnamed: 0,Crop,Yield,cost
1,Wheat,1025.2,


In [13]:
# Crop names of the rows in which `cost` is missing.

crops.loc[crops["cost"].isna(), "Crop"]

1    Wheat
Name: Crop, dtype: object

In [14]:
# Crop names of the rows in which `cost` is present (not missing).

crops.loc[crops["cost"].notna(), "Crop"]

0      Rice
2    Barley
3     Maize
Name: Crop, dtype: object

In [15]:
# dropna(how="any") drops every row that has a missing value in at least one column.
# inplace defaults to False, so a new frame is returned and `crops` is left as it was.

crops.dropna(axis=0, how="any")  # not displayed: a cell shows only its last expression
crops.dropna(axis=0, how="any").shape

(3, 3)

In [16]:
# how="all" drops a row only when every element of that row is missing.

crops.dropna(axis=0, how="all").shape

(4, 3)

In [17]:
# `subset` restricts the missing-value check to the listed columns.
# how="any": drop a row if 'Yield' or 'cost' is missing (result not displayed; only the last expression is).
# how="all": drop a row only if both 'Yield' and 'cost' are missing.

crops.dropna(how="any", subset=['Yield', 'cost']).shape
crops.dropna(how="all", subset=['Yield', 'cost']).shape

(4, 3)

In [18]:
# Replace the missing `cost` values with the placeholder 'Unknown'.
# The filled column is assigned back explicitly: calling fillna(..., inplace=True) on the
# selection crops['cost'] acts on an intermediate object, which is deprecated and does not
# update `crops` when pandas Copy-on-Write is active.
# Mixing a string into the numeric column turns it into an object column.
crops['cost'] = crops['cost'].fillna(value='Unknown')
crops

Unnamed: 0,Crop,Yield,cost
0,Rice,1010.0,102
1,Wheat,1025.2,Unknown
2,Barley,1404.2,20
3,Maize,1251.7,68


### Duplicates

In [19]:
# Example purchases containing repeated (Items, Price) rows for the duplicate-handling cells.
data = pd.DataFrame(
    [
        ("TV", 10000),
        ("Washing Machine", 50000),
        ("Mobile", 20000),
        ("TV", 10000),
        ("TV", 10000),
        ("Washing Machine", 40000),
        ("Washing Machine", 50000),
    ],
    columns=["Items", "Price"],
)
data

Unnamed: 0,Items,Price
0,TV,10000
1,Washing Machine,50000
2,Mobile,20000
3,TV,10000
4,TV,10000
5,Washing Machine,40000
6,Washing Machine,50000


In [20]:
# duplicated() returns a boolean Series that is True for every row
# that repeats an earlier row.

data[data.duplicated()]

Unnamed: 0,Items,Price
3,TV,10000
4,TV,10000
6,Washing Machine,50000


In [21]:
# keep='first' is the default: the first occurrence counts as the unique value
# and its later repetitions are flagged as duplicates.

data[data.duplicated(keep="first")]

Unnamed: 0,Items,Price
3,TV,10000
4,TV,10000
6,Washing Machine,50000


In [22]:
# With keep='last' the last occurrence counts as the unique value
# and all of its earlier repetitions are flagged as duplicates.

data[data.duplicated(keep="last")]

Unnamed: 0,Items,Price
0,TV,10000
1,Washing Machine,50000
3,TV,10000


In [23]:
# With keep=False (the boolean, not a string) every occurrence of a repeated
# observation is flagged as a duplicate.

data.loc[data.duplicated(keep=False), :]

Unnamed: 0,Items,Price
0,TV,10000
1,Washing Machine,50000
3,TV,10000
4,TV,10000
6,Washing Machine,50000


In [24]:
# drop_duplicates() returns a new frame by default (inplace=False).
# keep='first', keep='last' and keep=False (boolean) mean the same as in duplicated().

data.drop_duplicates(keep="first")  # `data` unchanged; result not displayed (only the last expression is)
data.drop_duplicates(inplace=True, keep=False)  # modifies `data` itself: every repeated row is removed
data


Unnamed: 0,Items,Price
2,Mobile,20000
5,Washing Machine,40000
