# Pandas
---

Pandas is a data analysis tool, built on top of numpy.

In [1]:
import pandas as pd

In [2]:
names = ["Nepal", "India", "Bhutan"]
codes = [977, 81, 123]

In [3]:
list(zip(names, codes))

[('Nepal', 977), ('India', 81), ('Bhutan', 123)]

**zip:** *zip combines the elements found at the same position in each of the given lists into tuples*

In [4]:
zip([1, 2, 3], [11, 12, 13], [21, 22, 23])

<zip at 0x7fcf38425f88>

In [5]:
list(zip([1, 2, 3], [11, 12, 13], [21, 22, 23]))

[(1, 11, 21), (2, 12, 22), (3, 13, 23)]

*We create a dataset with names and codes*

In [6]:
dataset = list(zip(names, codes))

In [7]:
dataset

[('Nepal', 977), ('India', 81), ('Bhutan', 123)]

**Now we create a dataframe**

A dataframe is tabular data organised in rows and columns (similar to an Excel spreadsheet).

In [8]:
df = pd.DataFrame(data=dataset, columns=["Name", "Code"])

In [9]:
df

Unnamed: 0,Name,Code
0,Nepal,977
1,India,81
2,Bhutan,123


*Here, each row represents a single observation, while each column represents one parameter (attribute) of those observations*

*In the dataframe above, Name and Code are the parameters we use to describe a country, and the values in each row (e.g. Nepal and 977) together form one unique record*

*__0__, __1__, ... are called the index in pandas; it is generated automatically when a dataframe is created without an explicit index*

*Let's create a large dataset*

In [10]:
import numpy as np

In [11]:
names = ['Bob', 'Jessica', 'Mary', 'John', 'Mel']

In [12]:
np.random.randint(low=0, high=len(names))

1

In [13]:
random_names = [names[np.random.randint(low=0, high=len(names))] 
                for i in range(1000)]

In [14]:
random_names[:10]

['Jessica',
 'Mary',
 'Bob',
 'Mary',
 'Mary',
 'Bob',
 'Bob',
 'Bob',
 'Jessica',
 'Mel']

In [15]:
# Draw all 1000 ages (integers in [1, 100)) with one vectorized call instead of
# 1000 separate Python-level calls; .tolist() keeps `ages` a plain list of ints.
ages = np.random.randint(low=1, high=100, size=1000).tolist()

In [16]:
ages[:10]

[59, 68, 2, 84, 85, 49, 19, 67, 39, 29]

In [17]:
df = pd.DataFrame(list(zip(random_names, ages)), 
                  columns=["Name", "Age"])

In [18]:
df

Unnamed: 0,Name,Age
0,Jessica,59
1,Mary,68
2,Bob,2
3,Mary,84
4,Mary,85
5,Bob,49
6,Bob,19
7,Bob,67
8,Jessica,39
9,Mel,29


**Overview of data**

*show first 5 rows*

In [19]:
df.head()

Unnamed: 0,Name,Age
0,Jessica,59
1,Mary,68
2,Bob,2
3,Mary,84
4,Mary,85


*or 10*

In [20]:
df.head(10)

Unnamed: 0,Name,Age
0,Jessica,59
1,Mary,68
2,Bob,2
3,Mary,84
4,Mary,85
5,Bob,49
6,Bob,19
7,Bob,67
8,Jessica,39
9,Mel,29


*show last 5 rows*

In [21]:
df.tail()

Unnamed: 0,Name,Age
995,Mary,51
996,Jessica,1
997,Jessica,48
998,Bob,21
999,Bob,40


*or 10*

In [22]:
df.tail(10)

Unnamed: 0,Name,Age
990,Mel,77
991,Mary,28
992,Jessica,75
993,Bob,45
994,Jessica,34
995,Mary,51
996,Jessica,1
997,Jessica,48
998,Bob,21
999,Bob,40


### Selecting rows and columns

**select single column from dataframe**

In [23]:
df["Name"]

0      Jessica
1         Mary
2          Bob
3         Mary
4         Mary
5          Bob
6          Bob
7          Bob
8      Jessica
9          Mel
10     Jessica
11         Mel
12        Mary
13        Mary
14     Jessica
15        Mary
16     Jessica
17        John
18         Mel
19     Jessica
20         Mel
21         Mel
22        Mary
23     Jessica
24         Mel
25        John
26     Jessica
27         Mel
28         Mel
29     Jessica
        ...   
970       Mary
971       Mary
972        Bob
973        Bob
974        Mel
975       Mary
976    Jessica
977       Mary
978        Bob
979       Mary
980    Jessica
981       Mary
982        Bob
983    Jessica
984       Mary
985       John
986       John
987       Mary
988        Bob
989       Mary
990        Mel
991       Mary
992    Jessica
993        Bob
994    Jessica
995       Mary
996    Jessica
997    Jessica
998        Bob
999        Bob
Name: Name, dtype: object

*Note: selecting a single column like this returns a one-dimensional object, which pandas calls a Series*

**Select multiple columns from dataframe**

In [24]:
df[["Name", "Age"]].head()

Unnamed: 0,Name,Age
0,Jessica,59
1,Mary,68
2,Bob,2
3,Mary,84
4,Mary,85


*We select multiple columns by passing a list of column names, `["Name", "Age"]`*

**Select row from dataframe**

*Select from index*

In [25]:
# NOTE: `.ix` was deprecated in pandas 0.20 and removed in pandas 1.0, so this
# cell only runs on older pandas versions. It is kept here to compare its
# behaviour with `.loc` (label based) and `.iloc` (position based), which are
# the accessors to use in new code.
df.ix[0]

Name    Jessica
Age          59
Name: 0, dtype: object

> The index can hold integers, as above, or strings, depending on the dataframe

*Select by integer position*

In [26]:
df.iloc[0]

Name    Jessica
Age          59
Name: 0, dtype: object

> `iloc` selects by integer position (0 is the first row), regardless of the index labels

*Select by index label*

In [27]:
df.loc[0]

Name    Jessica
Age          59
Name: 0, dtype: object

*Slicing based selection*

In [28]:
df.loc[0:6]

Unnamed: 0,Name,Age
0,Jessica,59
1,Mary,68
2,Bob,2
3,Mary,84
4,Mary,85
5,Bob,49
6,Bob,19


In [29]:
df.iloc[0:6]

Unnamed: 0,Name,Age
0,Jessica,59
1,Mary,68
2,Bob,2
3,Mary,84
4,Mary,85
5,Bob,49


**Behavior of ix, loc and iloc**

In [30]:
# let's create a new dataframe with selected rows
ndf = df.loc[10:20]

In [31]:
ndf

Unnamed: 0,Name,Age
10,Jessica,76
11,Mel,55
12,Mary,20
13,Mary,84
14,Jessica,37
15,Mary,50
16,Jessica,34
17,John,38
18,Mel,62
19,Jessica,88


In [32]:
ndf.ix[0]

KeyError: 0

In [33]:
ndf.loc[0]

KeyError: 'the label [0] is not in the [index]'

In [34]:
ndf.iloc[0]

Name    Jessica
Age          76
Name: 10, dtype: object

In [35]:
ndf.loc[10]

Name    Jessica
Age          76
Name: 10, dtype: object

In [37]:
mdf = pd.DataFrame([('Apple', 42, 56), ('Orange', 42, 56), 
                    ('Pineapple', 42, 56), ('Lime', 42, 56)], 
                   columns=["Name", "Price", "Qty"])

In [41]:
mdf = mdf.set_index(["Name"])

In [42]:
mdf

Unnamed: 0_level_0,Price,Qty
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Apple,42,56
Orange,42,56
Pineapple,42,56
Lime,42,56


In [43]:
mdf.ix[0]

Price    42
Qty      56
Name: Apple, dtype: int64

In [44]:
mdf.iloc[0]

Price    42
Qty      56
Name: Apple, dtype: int64

In [45]:
mdf.loc[0]

TypeError: cannot do label indexing on <class 'pandas.indexes.base.Index'> with these indexers [0] of <class 'int'>

In [46]:
mdf.ix["Apple"]

Price    42
Qty      56
Name: Apple, dtype: int64

In [47]:
mdf.loc["Apple"]

Price    42
Qty      56
Name: Apple, dtype: int64

In [48]:
mdf.iloc["Apple"]

TypeError: cannot do positional indexing on <class 'pandas.indexes.base.Index'> with these indexers [Apple] of <class 'str'>

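*In cells 32–34 above (on `ndf`), __ix__ and __loc__ try to access the row whose index label is __0__, which is not present in our new dataframe, while __iloc__ fetches the 1st row (position 0) of the new dataframe regardless of its label. With the string-indexed `mdf`, __loc__ accepts only labels, __iloc__ accepts only integer positions, and __ix__ accepts either*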

**Slicing by rows and columns**

In [49]:
# select the rows labelled 0 through 6 of column "Age"
# (a label-based slice with .loc includes the end label, so row 6 is returned)
df.loc[0:6, "Age"]

0    59
1    68
2     2
3    84
4    85
5    49
6    19
Name: Age, dtype: int64

In [50]:
# select the rows labelled 0 through 6 and every column from "Name" through "Age"
# (with .loc both the row slice and the column slice include their end label)
df.loc[0:6, "Name": "Age"]

Unnamed: 0,Name,Age
0,Jessica,59
1,Mary,68
2,Bob,2
3,Mary,84
4,Mary,85
5,Bob,49
6,Bob,19


In [51]:
# a column slice must follow the dataframe's own column order: "Age" comes
# after "Name", so slicing from "Age" to "Name" selects no columns at all
# and only the row index is shown
df.loc[0:6, "Age": "Name"]

0
1
2
3
4
5
6


**Changing columns of dataframe**

In [52]:
df.columns = ["Name of people", "Age of people"]

In [53]:
df.head()

Unnamed: 0,Name of people,Age of people
0,Jessica,59
1,Mary,68
2,Bob,2
3,Mary,84
4,Mary,85


**Filtering basics**

In [54]:
# get all people whose age is less than 10
df[df["Age of people"] < 10]

Unnamed: 0,Name of people,Age of people
2,Bob,2
33,Jessica,8
57,Mel,7
64,John,6
88,Jessica,2
121,Mel,1
127,Jessica,5
146,John,5
185,Mary,3
202,Jessica,1


In [55]:
# get all people whose age is less than 10 or age is greater than 90
df[(df["Age of people"] < 10) | (df["Age of people"] > 90)]

Unnamed: 0,Name of people,Age of people
2,Bob,2
30,John,91
33,Jessica,8
57,Mel,7
59,John,98
64,John,6
74,Jessica,93
78,Mary,91
79,Bob,91
83,Jessica,93


In [56]:
# get the mean of every numeric column
# numeric_only=True skips the string column "Name of people"; since pandas 2.0
# the default is numeric_only=False, which raises an error on text columns
df.mean(numeric_only=True)

Age of people    50.017
dtype: float64

In [57]:
# get max
df.max()

Name of people    Mel
Age of people      99
dtype: object

In [58]:
# get min
df.min()

Name of people    Bob
Age of people       1
dtype: object

In [59]:
# get the median of every numeric column
# numeric_only=True skips the string column "Name of people"; since pandas 2.0
# the default is numeric_only=False, which raises an error on text columns
df.median(numeric_only=True)

Age of people    50.0
dtype: float64

In [60]:
# get the standard deviation of every numeric column
# numeric_only=True skips the string column "Name of people"; since pandas 2.0
# the default is numeric_only=False, which raises an error on text columns
df.std(numeric_only=True)

Age of people    28.470885
dtype: float64

In [61]:
df.head()

Unnamed: 0,Name of people,Age of people
0,Jessica,59
1,Mary,68
2,Bob,2
3,Mary,84
4,Mary,85


In [62]:
df.groupby("Name of people").mean()

Unnamed: 0_level_0,Age of people
Name of people,Unnamed: 1_level_1
Bob,51.486772
Jessica,51.665049
John,50.763285
Mary,47.915423
Mel,48.243655


In [63]:
dir(df)

['T',
 '_AXIS_ALIASES',
 '_AXIS_IALIASES',
 '_AXIS_LEN',
 '_AXIS_NAMES',
 '_AXIS_NUMBERS',
 '_AXIS_ORDERS',
 '_AXIS_REVERSED',
 '_AXIS_SLICEMAP',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_wrap__',
 '__bool__',
 '__bytes__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__div__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__imul__',
 '__init__',
 '__invert__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__le__',
 '__len__',
 '__lt__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdiv__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rfloordiv__',
 '__rmod__',
 '__rmul__',
 '__ror__',
 '__round__',
 '__rpow__',
 '__rsub__',
 '__rtruediv__',
 '__rxor__',
 '__setattr__