# Data Frames

Why use Pandas?

Data scientists make use of Pandas in Python for its following advantages:

    Easily handles missing data
    It uses Series for one-dimensional data structure and DataFrame for multi-dimensional data structure
    It provides an efficient way to slice the data
    It provides a flexible way to merge, concatenate or reshape the data
    It includes a powerful time series tool to work with
    
In a nutshell, Pandas is a useful library in data analysis. It can be used to perform data manipulation and analysis. Pandas provide powerful and easy-to-use data structures, as well as the means to quickly perform operations on these structures.


In [1]:
import pandas as pd
import numpy as np

# Creating a DataFrame

In [3]:
data=[1,3,5,7,9,18]
data

[1, 3, 5, 7, 9, 18]

In [4]:
pd.DataFrame(data,columns=["column1"])

Unnamed: 0,column1
0,1
1,3
2,5
3,7
4,9
5,18


In [5]:
pd.Series(data=data , name="column1")

0     1
1     3
2     5
3     7
4     9
5    18
Name: column1, dtype: int64

# Creating a DataFrame Using a Numpy Arrays

In [6]:
data=np.arange(1,24,2).reshape(3,4)

In [7]:
data

array([[ 1,  3,  5,  7],
       [ 9, 11, 13, 15],
       [17, 19, 21, 23]])

In [8]:
df=pd.DataFrame(data=data, columns=["var1","var2","var3","var4"])
df

Unnamed: 0,var1,var2,var3,var4
0,1,3,5,7
1,9,11,13,15
2,17,19,21,23


# Creating a DataFrame Using a Dictionary

In [9]:
s1=np.random.randint(2,10,size=4)
s2=np.random.randint(3,10,size=4)
s3=np.random.randint(4,15,size=4)

In [10]:
s1

array([5, 3, 6, 9])

In [11]:
s2

array([8, 8, 4, 9])

In [12]:
s3

array([14,  5,  5, 11])

In [13]:
myDict={"var1":s1,"var2":s2,"var3":s3}

In [14]:
df=pd.DataFrame(myDict)

In [15]:
df

Unnamed: 0,var1,var2,var3
0,5,8,14
1,3,8,5
2,6,4,5
3,9,9,11


# The Examination of Some Attributes on Data

In [17]:
df.head(2)

Unnamed: 0,var1,var2,var3
0,5,8,14
1,3,8,5


In [18]:
df.tail(2)

Unnamed: 0,var1,var2,var3
2,6,4,5
3,9,9,11


In [19]:
df.sample()

Unnamed: 0,var1,var2,var3
3,9,9,11


In [20]:
df.sample(2)

Unnamed: 0,var1,var2,var3
2,6,4,5
1,3,8,5


In [21]:
df.columns

Index(['var1', 'var2', 'var3'], dtype='object')

In [23]:
for i in df.columns:
    print(i)

var1
var2
var3


In [24]:
for i in df.index:
    print(i)

0
1
2
3


In [25]:
for i in df.columns:
    print(df[i].mean())

5.75
7.25
8.75


In [26]:
df.index

RangeIndex(start=0, stop=4, step=1)

In [27]:
[i for i in df.index]

[0, 1, 2, 3]

In [28]:
[i for i in df.columns]

['var1', 'var2', 'var3']

In [29]:
df.columns=["new1","new2","new3"]
df

Unnamed: 0,new1,new2,new3
0,5,8,14
1,3,8,5
2,6,4,5
3,9,9,11


In [30]:
df.index=["a","b","c","d"]
df

Unnamed: 0,new1,new2,new3
a,5,8,14
b,3,8,5
c,6,4,5
d,9,9,11


In [31]:
df.rename(columns={"new1":"a","new2":"b"})

Unnamed: 0,a,b,new3
a,5,8,14
b,3,8,5
c,6,4,5
d,9,9,11


In [33]:
df.rename(index={"a":1,"b":2})

Unnamed: 0,new1,new2,new3
1,5,8,14
2,3,8,5
c,6,4,5
d,9,9,11


In [34]:
df

Unnamed: 0,new1,new2,new3
a,5,8,14
b,3,8,5
c,6,4,5
d,9,9,11


In [35]:
df.shape

(4, 3)

In [36]:
df.size

12

In [38]:
type(df)

pandas.core.frame.DataFrame

In [40]:
len(df)

4

In [41]:
df.ndim

2

In [42]:
type(df.values)

numpy.ndarray

In [43]:
type(df["new1"])

pandas.core.series.Series

In [44]:
"new2" in df

True

In [45]:
"new5" in df 

False

# Indexing, Slicing & Selection

In [46]:
from numpy.random import randn

In [47]:

np.random.seed(101)
df=pd.DataFrame(randn(5,4) ,index="A B C D E".split(), columns="W X Y Z".split())
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [48]:
df["Y"]

A    0.907969
B   -0.848077
C    0.528813
D   -0.933237
E    2.605967
Name: Y, dtype: float64

In [49]:
df.Y

A    0.907969
B   -0.848077
C    0.528813
D   -0.933237
E    2.605967
Name: Y, dtype: float64

In [50]:
type(df["Y"])

pandas.core.series.Series

In [52]:
df[["Y"]]

Unnamed: 0,Y
A,0.907969
B,-0.848077
C,0.528813
D,-0.933237
E,2.605967


In [53]:
type(df[["Y"]])

pandas.core.frame.DataFrame

In [54]:
df["B":"D"]

Unnamed: 0,W,X,Y,Z
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [55]:
df["A":"C"][["Y","Z"]]

Unnamed: 0,Y,Z
A,0.907969,0.503826
B,-0.848077,0.605965
C,0.528813,-0.589001


# Creating a New Column

In [59]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [60]:
df["new1"]=df["X"]*df["Y"]

In [61]:
df

Unnamed: 0,W,X,Y,Z,new1
A,2.70685,0.628133,0.907969,0.503826,0.570325
B,0.651118,-0.319318,-0.848077,0.605965,0.270806
C,-2.018168,0.740122,0.528813,-0.589001,0.391387
D,0.188695,-0.758872,-0.933237,0.955057,0.708208
E,0.190794,1.978757,2.605967,0.683509,5.156577


In [62]:
df["new2"]=np.arange(5)
df

Unnamed: 0,W,X,Y,Z,new1,new2
A,2.70685,0.628133,0.907969,0.503826,0.570325,0
B,0.651118,-0.319318,-0.848077,0.605965,0.270806,1
C,-2.018168,0.740122,0.528813,-0.589001,0.391387,2
D,0.188695,-0.758872,-0.933237,0.955057,0.708208,3
E,0.190794,1.978757,2.605967,0.683509,5.156577,4


# Removing Columns

In [63]:
df.drop("new2",axis=1)

Unnamed: 0,W,X,Y,Z,new1
A,2.70685,0.628133,0.907969,0.503826,0.570325
B,0.651118,-0.319318,-0.848077,0.605965,0.270806
C,-2.018168,0.740122,0.528813,-0.589001,0.391387
D,0.188695,-0.758872,-0.933237,0.955057,0.708208
E,0.190794,1.978757,2.605967,0.683509,5.156577


In [64]:
df

Unnamed: 0,W,X,Y,Z,new1,new2
A,2.70685,0.628133,0.907969,0.503826,0.570325,0
B,0.651118,-0.319318,-0.848077,0.605965,0.270806,1
C,-2.018168,0.740122,0.528813,-0.589001,0.391387,2
D,0.188695,-0.758872,-0.933237,0.955057,0.708208,3
E,0.190794,1.978757,2.605967,0.683509,5.156577,4


In [66]:
df.drop(["new1","new2"],axis=1)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [67]:
df

Unnamed: 0,W,X,Y,Z,new1,new2
A,2.70685,0.628133,0.907969,0.503826,0.570325,0
B,0.651118,-0.319318,-0.848077,0.605965,0.270806,1
C,-2.018168,0.740122,0.528813,-0.589001,0.391387,2
D,0.188695,-0.758872,-0.933237,0.955057,0.708208,3
E,0.190794,1.978757,2.605967,0.683509,5.156577,4


In [68]:
df.drop(["new1","new2"],axis=1, inplace=True)
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


# Removing Rows

In [69]:
df.drop("C",axis=0)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [70]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [71]:
df.drop(index=["B"])

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [72]:
df_temp=df.drop("C",axis=0)
df_temp

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [73]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


# Selecting Rows and Columns using .loc[ ] and iloc[ ]

In [74]:
data=np.random.randint(1,40,size=(8,4))
df=pd.DataFrame(data, columns=["var1","var2","var3","var4"])
df


Unnamed: 0,var1,var2,var3,var4
0,8,11,39,10
1,19,8,16,1
2,13,18,12,16
3,34,30,25,37
4,20,36,31,11
5,21,28,9,23
6,27,24,38,23
7,10,3,19,29


In [75]:
df.loc[4]

var1    20
var2    36
var3    31
var4    11
Name: 4, dtype: int32

In [76]:
df.loc[[4]]

Unnamed: 0,var1,var2,var3,var4
4,20,36,31,11


In [77]:
df.loc[2:5]

Unnamed: 0,var1,var2,var3,var4
2,13,18,12,16
3,34,30,25,37
4,20,36,31,11
5,21,28,9,23


In [78]:
df.iloc[2:5]

Unnamed: 0,var1,var2,var3,var4
2,13,18,12,16
3,34,30,25,37
4,20,36,31,11


In [79]:
df.index="a b c d e f g h".split()
df

Unnamed: 0,var1,var2,var3,var4
a,8,11,39,10
b,19,8,16,1
c,13,18,12,16
d,34,30,25,37
e,20,36,31,11
f,21,28,9,23
g,27,24,38,23
h,10,3,19,29


In [80]:
df.iloc[1:4]

Unnamed: 0,var1,var2,var3,var4
b,19,8,16,1
c,13,18,12,16
d,34,30,25,37


In [83]:
df.loc["c":"g"]

Unnamed: 0,var1,var2,var3,var4
c,13,18,12,16
d,34,30,25,37
e,20,36,31,11
f,21,28,9,23
g,27,24,38,23


In [84]:
df.loc["d","var3"]

25

In [85]:
df.iloc[3,2]

25

In [86]:
df.loc["d":"g","var2"]

d    30
e    36
f    28
g    24
Name: var2, dtype: int32

In [87]:
df.loc["d":"g"][["var3"]]

Unnamed: 0,var3
d,25
e,31
f,9
g,38


In [88]:
df.loc["d":"g"][["var2","var3"]]

Unnamed: 0,var2,var3
d,30,25
e,36,31
f,28,9
g,24,38


In [89]:
df.iloc[ 2:5 , [2]]

Unnamed: 0,var3
c,12
d,25
e,31


In [90]:
df.iloc[2:5][["var3"]]

Unnamed: 0,var3
c,12
d,25
e,31


In [91]:
df

Unnamed: 0,var1,var2,var3,var4
a,8,11,39,10
b,19,8,16,1
c,13,18,12,16
d,34,30,25,37
e,20,36,31,11
f,21,28,9,23
g,27,24,38,23
h,10,3,19,29


In [92]:
df.loc["a","var1"]

8

In [93]:
df.loc[["a"],["var1"]]

Unnamed: 0,var1
a,8


In [94]:
df.loc[["a","c"],["var1","var3"]]

Unnamed: 0,var1,var3
a,8,39
c,13,12


In [95]:
df.iloc[[0,2],[2,0]]

Unnamed: 0,var3,var1
a,39,8
c,12,13


# Conditional Selection

In [96]:
df

Unnamed: 0,var1,var2,var3,var4
a,8,11,39,10
b,19,8,16,1
c,13,18,12,16
d,34,30,25,37
e,20,36,31,11
f,21,28,9,23
g,27,24,38,23
h,10,3,19,29


In [97]:
df > 10

Unnamed: 0,var1,var2,var3,var4
a,False,True,True,False
b,True,False,True,False
c,True,True,True,True
d,True,True,True,True
e,True,True,True,True
f,True,True,False,True
g,True,True,True,True
h,False,False,True,True


In [98]:
df[df > 10]

Unnamed: 0,var1,var2,var3,var4
a,,11.0,39.0,
b,19.0,,16.0,
c,13.0,18.0,12.0,16.0
d,34.0,30.0,25.0,37.0
e,20.0,36.0,31.0,11.0
f,21.0,28.0,,23.0
g,27.0,24.0,38.0,23.0
h,,,19.0,29.0


In [99]:
df[df["var1"] > 10 ]

Unnamed: 0,var1,var2,var3,var4
b,19,8,16,1
c,13,18,12,16
d,34,30,25,37
e,20,36,31,11
f,21,28,9,23
g,27,24,38,23


In [100]:
df[df["var1"] > 10][["var2","var3"]]

Unnamed: 0,var2,var3
b,8,16
c,18,12
d,30,25
e,36,31
f,28,9
g,24,38


# Two or More Conditional Statements

In [101]:
df

Unnamed: 0,var1,var2,var3,var4
a,8,11,39,10
b,19,8,16,1
c,13,18,12,16
d,34,30,25,37
e,20,36,31,11
f,21,28,9,23
g,27,24,38,23
h,10,3,19,29


In [102]:
df[(df["var1"] > 10) & (df["var1"] < 20)]

Unnamed: 0,var1,var2,var3,var4
b,19,8,16,1
c,13,18,12,16


In [105]:
df[(df["a":"e"] > 20)]

Unnamed: 0,var1,var2,var3,var4
a,,,39.0,
b,,,,
c,,,,
d,34.0,30.0,25.0,37.0
e,,36.0,31.0,
f,,,,
g,,,,
h,,,,


In [107]:
df["a":"e"]>10

Unnamed: 0,var1,var2,var3,var4
a,False,True,True,False
b,True,False,True,False
c,True,True,True,True
d,True,True,True,True
e,True,True,True,True


In [113]:
df[(df["a":"e"] < 10 ) | (df["var1":"var3"] < 10)]

Unnamed: 0,var1,var2,var3,var4
a,8.0,,,
b,,8.0,,1.0
c,,,,
d,,,,
e,,,,
f,,,,
g,,,,
h,,,,


In [114]:
df

Unnamed: 0,var1,var2,var3,var4
a,8,11,39,10
b,19,8,16,1
c,13,18,12,16
d,34,30,25,37
e,20,36,31,11
f,21,28,9,23
g,27,24,38,23
h,10,3,19,29


In [119]:
deneme=df["a":"c"][["var1","var2"]]

In [120]:
deneme

Unnamed: 0,var1,var2
a,8,11
b,19,8
c,13,18


In [121]:
deneme.mean()

var1    13.333333
var2    12.333333
dtype: float64

In [127]:
deneme.max()

var1    19
var2    18
dtype: int32

In [128]:
deneme.min()

var1    8
var2    8
dtype: int32

# Conditional Selection Using .loc[ ] and .iloc[ ]

In [103]:
df.loc[(df["var1"] > 10),["var2","var3"]]

Unnamed: 0,var2,var3
b,8,16
c,18,12
d,30,25
e,36,31
f,28,9
g,24,38


In [129]:
df.loc[((df["var1"] < 10 ) | (df["var1"] > 30 )), ["var2","var3"]]

Unnamed: 0,var2,var3
a,11,39
d,30,25


# reset_index() & set_index()

In [130]:
df

Unnamed: 0,var1,var2,var3,var4
a,8,11,39,10
b,19,8,16,1
c,13,18,12,16
d,34,30,25,37
e,20,36,31,11
f,21,28,9,23
g,27,24,38,23
h,10,3,19,29


In [131]:
df.reset_index()

Unnamed: 0,index,var1,var2,var3,var4
0,a,8,11,39,10
1,b,19,8,16,1
2,c,13,18,12,16
3,d,34,30,25,37
4,e,20,36,31,11
5,f,21,28,9,23
6,g,27,24,38,23
7,h,10,3,19,29


In [132]:
df.reset_index(drop=True)

Unnamed: 0,var1,var2,var3,var4
0,8,11,39,10
1,19,8,16,1
2,13,18,12,16
3,34,30,25,37
4,20,36,31,11
5,21,28,9,23
6,27,24,38,23
7,10,3,19,29


In [133]:
df.reset_index(drop=True , inplace=True)
df

Unnamed: 0,var1,var2,var3,var4
0,8,11,39,10
1,19,8,16,1
2,13,18,12,16
3,34,30,25,37
4,20,36,31,11
5,21,28,9,23
6,27,24,38,23
7,10,3,19,29


In [134]:
df.set_index("var4")

Unnamed: 0_level_0,var1,var2,var3
var4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10,8,11,39
1,19,8,16
16,13,18,12
37,34,30,25
11,20,36,31
23,21,28,9
23,27,24,38
29,10,3,19


In [135]:
df.set_index("var4", inplace=True)

In [136]:
df

Unnamed: 0_level_0,var1,var2,var3
var4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10,8,11,39
1,19,8,16
16,13,18,12
37,34,30,25
11,20,36,31
23,21,28,9
23,27,24,38
29,10,3,19


# Multi-Index & Index Hierarchy

In [137]:
outside=["M1","M1","M1","M2","M2","M2","M3","M3","M3"]
inside = [1,2,3,1,2,3,5,6,7]
multi_index = list(zip(outside,inside))
multi_index

[('M1', 1),
 ('M1', 2),
 ('M1', 3),
 ('M2', 1),
 ('M2', 2),
 ('M2', 3),
 ('M3', 5),
 ('M3', 6),
 ('M3', 7)]

In [138]:
hier_index= pd.MultiIndex.from_tuples(multi_index)
hier_index

MultiIndex([('M1', 1),
            ('M1', 2),
            ('M1', 3),
            ('M2', 1),
            ('M2', 2),
            ('M2', 3),
            ('M3', 5),
            ('M3', 6),
            ('M3', 7)],
           )

In [139]:
np.random.seed(101)
df=pd.DataFrame(np.random.randn(9,4), index = hier_index , columns=["A","B","C","D"])
df

Unnamed: 0,Unnamed: 1,A,B,C,D
M1,1,2.70685,0.628133,0.907969,0.503826
M1,2,0.651118,-0.319318,-0.848077,0.605965
M1,3,-2.018168,0.740122,0.528813,-0.589001
M2,1,0.188695,-0.758872,-0.933237,0.955057
M2,2,0.190794,1.978757,2.605967,0.683509
M2,3,0.302665,1.693723,-1.706086,-1.159119
M3,5,-0.134841,0.390528,0.166905,0.184502
M3,6,0.807706,0.07296,0.638787,0.329646
M3,7,-0.497104,-0.75407,-0.943406,0.484752


In [140]:
df.index.names

FrozenList([None, None])

In [146]:
df.index.names=["Group","Num"]
df.index.names


FrozenList(['Group', 'Num'])

In [160]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M1,1,2.70685,0.628133,0.907969,0.503826
M1,2,0.651118,-0.319318,-0.848077,0.605965
M1,3,-2.018168,0.740122,0.528813,-0.589001
M2,1,0.188695,-0.758872,-0.933237,0.955057
M2,2,0.190794,1.978757,2.605967,0.683509
M2,3,0.302665,1.693723,-1.706086,-1.159119
M3,5,-0.134841,0.390528,0.166905,0.184502
M3,6,0.807706,0.07296,0.638787,0.329646
M3,7,-0.497104,-0.75407,-0.943406,0.484752


In [147]:
df.index

MultiIndex([('M1', 1),
            ('M1', 2),
            ('M1', 3),
            ('M2', 1),
            ('M2', 2),
            ('M2', 3),
            ('M3', 5),
            ('M3', 6),
            ('M3', 7)],
           names=['Group', 'Num'])

In [148]:
df.index.levels

FrozenList([['M1', 'M2', 'M3'], [1, 2, 3, 5, 6, 7]])

In [149]:
df.index.get_level_values(0)

Index(['M1', 'M1', 'M1', 'M2', 'M2', 'M2', 'M3', 'M3', 'M3'], dtype='object', name='Group')

In [150]:
df.index.get_level_values("Group")

Index(['M1', 'M1', 'M1', 'M2', 'M2', 'M2', 'M3', 'M3', 'M3'], dtype='object', name='Group')

In [161]:
df.index.get_level_values(1)

Int64Index([1, 2, 3, 1, 2, 3, 5, 6, 7], dtype='int64', name='Num')

In [162]:
df.index.get_level_values("Num")

Int64Index([1, 2, 3, 1, 2, 3, 5, 6, 7], dtype='int64', name='Num')

In [163]:
df["A"]

Group  Num
M1     1      2.706850
       2      0.651118
       3     -2.018168
M2     1      0.188695
       2      0.190794
       3      0.302665
M3     5     -0.134841
       6      0.807706
       7     -0.497104
Name: A, dtype: float64

In [151]:
df[["A"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,A
Group,Num,Unnamed: 2_level_1
M1,1,2.70685
M1,2,0.651118
M1,3,-2.018168
M2,1,0.188695
M2,2,0.190794
M2,3,0.302665
M3,5,-0.134841
M3,6,0.807706
M3,7,-0.497104


In [152]:
df[["A","B"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
M1,1,2.70685,0.628133
M1,2,0.651118,-0.319318
M1,3,-2.018168,0.740122
M2,1,0.188695,-0.758872
M2,2,0.190794,1.978757
M2,3,0.302665,1.693723
M3,5,-0.134841,0.390528
M3,6,0.807706,0.07296
M3,7,-0.497104,-0.75407


In [181]:
df.loc[["M1"],"A":"C"]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1,1,2.70685,0.628133,0.907969
M1,2,0.651118,-0.319318,-0.848077
M1,3,-2.018168,0.740122,0.528813


In [202]:
df.loc["M1":"M2","A":"C"]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1,1,2.70685,0.628133,0.907969
M1,2,0.651118,-0.319318,-0.848077
M1,3,-2.018168,0.740122,0.528813
M2,1,0.188695,-0.758872,-0.933237
M2,2,0.190794,1.978757,2.605967
M2,3,0.302665,1.693723,-1.706086


In [203]:
df.loc["M1":,"B":"D"]

Unnamed: 0_level_0,Unnamed: 1_level_0,B,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1,1,0.628133,0.907969,0.503826
M1,2,-0.319318,-0.848077,0.605965
M1,3,0.740122,0.528813,-0.589001
M2,1,-0.758872,-0.933237,0.955057
M2,2,1.978757,2.605967,0.683509
M2,3,1.693723,-1.706086,-1.159119
M3,5,0.390528,0.166905,0.184502
M3,6,0.07296,0.638787,0.329646
M3,7,-0.75407,-0.943406,0.484752


In [154]:
df.loc["M1"]

Unnamed: 0_level_0,A,B,C,D
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2.70685,0.628133,0.907969,0.503826
2,0.651118,-0.319318,-0.848077,0.605965
3,-2.018168,0.740122,0.528813,-0.589001


In [155]:
df.loc[("M1",2)]

A    0.651118
B   -0.319318
C   -0.848077
D    0.605965
Name: (M1, 2), dtype: float64

In [156]:
df.loc[[("M1",2)]]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M1,2,0.651118,-0.319318,-0.848077,0.605965


In [157]:
df.loc["M1","A":"C"]

Unnamed: 0_level_0,A,B,C
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2.70685,0.628133,0.907969
2,0.651118,-0.319318,-0.848077
3,-2.018168,0.740122,0.528813


In [158]:
df.loc[[("M1",2)], "A":"C"]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1,2,0.651118,-0.319318,-0.848077


In [197]:
df.loc["M1":"M2"]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M1,1,2.70685,0.628133,0.907969,0.503826
M1,2,0.651118,-0.319318,-0.848077,0.605965
M1,3,-2.018168,0.740122,0.528813,-0.589001
M2,1,0.188695,-0.758872,-0.933237,0.955057
M2,2,0.190794,1.978757,2.605967,0.683509
M2,3,0.302665,1.693723,-1.706086,-1.159119


In [204]:
df.loc[[("M1",3)],"C":]

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
M1,3,0.528813,-0.589001


In [205]:
df.loc["M1":"M2"]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M1,1,2.70685,0.628133,0.907969,0.503826
M1,2,0.651118,-0.319318,-0.848077,0.605965
M1,3,-2.018168,0.740122,0.528813,-0.589001
M2,1,0.188695,-0.758872,-0.933237,0.955057
M2,2,0.190794,1.978757,2.605967,0.683509
M2,3,0.302665,1.693723,-1.706086,-1.159119


In [206]:
df.loc[("M1",2):("M2",3)]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M1,2,0.651118,-0.319318,-0.848077,0.605965
M1,3,-2.018168,0.740122,0.528813,-0.589001
M2,1,0.188695,-0.758872,-0.933237,0.955057
M2,2,0.190794,1.978757,2.605967,0.683509
M2,3,0.302665,1.693723,-1.706086,-1.159119


In [209]:
df.loc[("M1",2):"M2"]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M1,2,0.651118,-0.319318,-0.848077,0.605965
M1,3,-2.018168,0.740122,0.528813,-0.589001
M2,1,0.188695,-0.758872,-0.933237,0.955057
M2,2,0.190794,1.978757,2.605967,0.683509
M2,3,0.302665,1.693723,-1.706086,-1.159119


In [210]:
df.loc[[("M2",3), ("M3", 5)]]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M2,3,0.302665,1.693723,-1.706086,-1.159119
M3,5,-0.134841,0.390528,0.166905,0.184502


# Some Other Useful Methods with Iris Dataset

In [212]:
import seaborn as sns

In [213]:
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

In [214]:
df=sns.load_dataset("iris")

In [215]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [216]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [218]:
df.shape

(150, 5)

In [219]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [220]:
df.sample(4)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
30,4.8,3.1,1.6,0.2,setosa
66,5.6,3.0,4.5,1.5,versicolor
117,7.7,3.8,6.7,2.2,virginica
96,5.7,2.9,4.2,1.3,versicolor


In [221]:
df.describe(include="all")

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
count,150.0,150.0,150.0,150.0,150
unique,,,,,3
top,,,,,setosa
freq,,,,,50
mean,5.843333,3.057333,3.758,1.199333,
std,0.828066,0.435866,1.765298,0.762238,
min,4.3,2.0,1.0,0.1,
25%,5.1,2.8,1.6,0.3,
50%,5.8,3.0,4.35,1.3,
75%,6.4,3.3,5.1,1.8,


In [222]:
df.corr()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
sepal_length,1.0,-0.11757,0.871754,0.817941
sepal_width,-0.11757,1.0,-0.42844,-0.366126
petal_length,0.871754,-0.42844,1.0,0.962865
petal_width,0.817941,-0.366126,0.962865,1.0


In [225]:
df.corr()[["sepal_length"]]

Unnamed: 0,sepal_length
sepal_length,1.0
sepal_width,-0.11757
petal_length,0.871754
petal_width,0.817941


In [227]:
df["sepal_length"].corr(df["petal_width"])

0.8179411262715757

In [228]:
df.species.value_counts

<bound method IndexOpsMixin.value_counts of 0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 150, dtype: object>

In [229]:
df.species.value_counts(dropna=False)

setosa        50
virginica     50
versicolor    50
Name: species, dtype: int64

In [232]:
df["species"].value_counts(dropna=False,  normalize=True )

setosa        0.333333
virginica     0.333333
versicolor    0.333333
Name: species, dtype: float64

In [233]:
df.species.unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [234]:
df.species.nunique()

3

In [235]:
df.loc[df["species"]=="setosa","sepal_length"]

0     5.1
1     4.9
2     4.7
3     4.6
4     5.0
5     5.4
6     4.6
7     5.0
8     4.4
9     4.9
10    5.4
11    4.8
12    4.8
13    4.3
14    5.8
15    5.7
16    5.4
17    5.1
18    5.7
19    5.1
20    5.4
21    5.1
22    4.6
23    5.1
24    4.8
25    5.0
26    5.0
27    5.2
28    5.2
29    4.7
30    4.8
31    5.4
32    5.2
33    5.5
34    4.9
35    5.0
36    5.5
37    4.9
38    4.4
39    5.1
40    5.0
41    4.5
42    4.4
43    5.0
44    5.1
45    4.8
46    5.1
47    4.6
48    5.3
49    5.0
Name: sepal_length, dtype: float64

In [236]:
df[(df.sepal_length > 4 ) & (df.sepal_length < 5 )]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
6,4.6,3.4,1.4,0.3,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa
11,4.8,3.4,1.6,0.2,setosa
12,4.8,3.0,1.4,0.1,setosa
13,4.3,3.0,1.1,0.1,setosa
22,4.6,3.6,1.0,0.2,setosa


In [238]:
df[(df.species == "virginica") & (df.sepal_length > 4)  & (df.sepal_length < 5)]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
106,4.9,2.5,4.5,1.7,virginica


In [239]:
df.sort_values(by="sepal_length",ascending=True)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
13,4.3,3.0,1.1,0.1,setosa
42,4.4,3.2,1.3,0.2,setosa
38,4.4,3.0,1.3,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
41,4.5,2.3,1.3,0.3,setosa
...,...,...,...,...,...
122,7.7,2.8,6.7,2.0,virginica
118,7.7,2.6,6.9,2.3,virginica
117,7.7,3.8,6.7,2.2,virginica
135,7.7,3.0,6.1,2.3,virginica
