## Introduction to Pandas

###### checking the version of pandas

In [1]:
import pandas
pandas.__version__

'1.0.5'

## Introducing Pandas Objects


In [2]:
import numpy as np
import pandas as pd

### Series
A Series is a single vector of data (like a NumPy array) with an index that labels each element in the vector.5

In [3]:
data = pd.Series([456, 536, 7856, 225])
data

0     456
1     536
2    7856
3     225
dtype: int64

In [4]:
data.values

array([ 456,  536, 7856,  225], dtype=int64)

In [5]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
fruits = pd.Series([586, 145, 2536, 856], 
    index=['apple', 'mango', 'orange', 'banana'])

fruits

apple      586
mango      145
orange    2536
banana     856
dtype: int64

In [7]:
fruits['mango']

145

In [8]:
fruits[2]

2536

In [9]:
fruits.name = 'price'
fruits.index.name = 'fruits'
fruits

fruits
apple      586
mango      145
orange    2536
banana     856
Name: price, dtype: int64

In [10]:
np.log(fruits)

fruits
apple     6.373320
mango     4.976734
orange    7.838343
banana    6.752270
Name: price, dtype: float64

In [11]:
fruits[fruits>1000]

fruits
orange    2536
Name: price, dtype: int64

In [12]:
fruits_dict = {'apple': 586, 'mango': 145, 'orange': 2536, 'banana': 856}
print(fruits_dict)
pd.Series(fruits_dict)


{'apple': 586, 'mango': 145, 'orange': 2536, 'banana': 856}


apple      586
mango      145
orange    2536
banana     856
dtype: int64

## DataFrame: bi-dimensional Series with two (or more) indices

In [13]:
data = {"animals": ["cat", "dog", "tiger", "cat", "lion"],
        "place": ['forest', 'house', 'cage', 'park', 'forest'],
        "rating": [10, 50, 10, 80, 90],"price":[78,87,90,99,110]}
print(data)
data = pd.DataFrame(data)
data

{'animals': ['cat', 'dog', 'tiger', 'cat', 'lion'], 'place': ['forest', 'house', 'cage', 'park', 'forest'], 'rating': [10, 50, 10, 80, 90], 'price': [78, 87, 90, 99, 110]}


Unnamed: 0,animals,place,rating,price
0,cat,forest,10,78
1,dog,house,50,87
2,tiger,cage,10,90
3,cat,park,80,99
4,lion,forest,90,110


In [14]:
df = pd.DataFrame(data, columns=["place", "animals" ,"rating","price"])
df

Unnamed: 0,place,animals,rating,price
0,forest,cat,10,78
1,house,dog,50,87
2,cage,tiger,10,90
3,park,cat,80,99
4,forest,lion,90,110


In [15]:
df['entertainment'] = df.price / df.rating
df

Unnamed: 0,place,animals,rating,price,entertainment
0,forest,cat,10,78,7.8
1,house,dog,50,87,1.74
2,cage,tiger,10,90,9.0
3,park,cat,80,99,1.2375
4,forest,lion,90,110,1.222222


In [16]:
df['Serie_no'] = pd.Series(range(5), index=[0,1,2,3,4])
df

Unnamed: 0,place,animals,rating,price,entertainment,Serie_no
0,forest,cat,10,78,7.8,0
1,house,dog,50,87,1.74,1
2,cage,tiger,10,90,9.0,2
3,park,cat,80,99,1.2375,3
4,forest,lion,90,110,1.222222,4


In [17]:
df.to_dict()

{'place': {0: 'forest', 1: 'house', 2: 'cage', 3: 'park', 4: 'forest'},
 'animals': {0: 'cat', 1: 'dog', 2: 'tiger', 3: 'cat', 4: 'lion'},
 'rating': {0: 10, 1: 50, 2: 10, 3: 80, 4: 90},
 'price': {0: 78, 1: 87, 2: 90, 3: 99, 4: 110},
 'entertainment': {0: 7.8, 1: 1.74, 2: 9.0, 3: 1.2375, 4: 1.2222222222222223},
 'Serie_no': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4}}

In [18]:
pd.DataFrame(df.to_dict())

Unnamed: 0,place,animals,rating,price,entertainment,Serie_no
0,forest,cat,10,78,7.8,0
1,house,dog,50,87,1.74,1
2,cage,tiger,10,90,9.0,2
3,park,cat,80,99,1.2375,3
4,forest,lion,90,110,1.222222,4


### DataFrame as specialized dictionary
Similarly, we can also think of a DataFrame as a specialization of dictionary. Where a dictionary maps a key to a value, a DataFrame maps a column name to a Series of column data. For example, asking for the 'area' attribute returns the Series object containing the areas we saw earlier:



##### From a list of dicts
Any list of dictionaries can be made into a DataFrame

In [19]:
data = [{'x': i, 'y':20* i}for i in range(8)]
print(data)
pd.DataFrame(data)

[{'x': 0, 'y': 0}, {'x': 1, 'y': 20}, {'x': 2, 'y': 40}, {'x': 3, 'y': 60}, {'x': 4, 'y': 80}, {'x': 5, 'y': 100}, {'x': 6, 'y': 120}, {'x': 7, 'y': 140}]


Unnamed: 0,x,y
0,0,0
1,1,20
2,2,40
3,3,60
4,4,80
5,5,100
6,6,120
7,7,140


In [20]:
pd.DataFrame([{'apple': 50, 'cabege': 25}, {'cabege': 35, 'orange': 60}])

Unnamed: 0,apple,cabege,orange
0,50.0,25,
1,,35,60.0


### From a two-dimensional NumPy array

In [21]:
pd.DataFrame(np.random.randint(4, 52),
             columns=['one', 'two'],
             index=['a', 'b', 'c'])



Unnamed: 0,one,two
a,43,43
b,43,43
c,43,43


## The Pandas Index Object

In [22]:
ax = pd.Index(['aa', 'bb', 'cc', 'dd', 'ee', 'ff'])
ax

Index(['aa', 'bb', 'cc', 'dd', 'ee', 'ff'], dtype='object')

### Index as immutable array

In [23]:
ax[2]

'cc'

In [24]:
ax[:]

Index(['aa', 'bb', 'cc', 'dd', 'ee', 'ff'], dtype='object')

In [25]:
ax[2:3]

Index(['cc'], dtype='object')

In [26]:
ax[::]

Index(['aa', 'bb', 'cc', 'dd', 'ee', 'ff'], dtype='object')

In [27]:
print(ax.size,ax.ndim,ax.shape,ax.dtype)

6 1 (6,) object


In [28]:
ax[2]='hh'

TypeError: Index does not support mutable operations

## Operating on Data in Pandas

### Ufuncs: Index Preservation
Because Pandas is designed to work with NumPy, any NumPy ufunc will work on Pandas Series and DataFrame objects. Let's start by defining a simple Series and DataFrame on which to demonstrate this:

In [29]:
line = np.random.RandomState(26)
row = pd.Series(line.randint(0, 25, 4))
row

0    21
1     6
2    16
3     1
dtype: int32

In [30]:
don = pd.DataFrame(line.randint(0, 25, (8, 6)),
                  columns=[1,2,3,4,5,6])
don

Unnamed: 0,1,2,3,4,5,6
0,6,19,13,0,4,2
1,13,17,23,19,21,20
2,14,24,21,15,10,14
3,23,6,15,24,18,15
4,21,12,15,16,12,18
5,12,19,19,19,3,17
6,12,17,20,16,6,8
7,0,14,0,23,13,6


In [31]:
np.exp(row)

0    1.318816e+09
1    4.034288e+02
2    8.886111e+06
3    2.718282e+00
dtype: float64

In [32]:
np.cos(don * np.pi / 5)

Unnamed: 0,1,2,3,4,5,6
0,-0.809017,0.809017,-0.309017,1.0,-0.809017,0.309017
1,-0.309017,-0.309017,-0.309017,0.809017,0.809017,1.0
2,-0.809017,-0.809017,0.809017,-1.0,1.0,-0.809017
3,-0.309017,-0.809017,-1.0,-0.809017,0.309017,-1.0
4,0.809017,0.309017,-1.0,-0.809017,0.309017,0.309017
5,0.309017,0.809017,0.809017,0.809017,-0.309017,-0.309017
6,0.309017,-0.309017,1.0,-0.809017,-0.809017,0.309017
7,1.0,-0.809017,1.0,-0.309017,-0.309017,-0.809017


In [33]:
np.sin(don*np.pi/5)

Unnamed: 0,1,2,3,4,5,6
0,-0.587785,-0.587785,0.9510565,0.0,0.5877853,0.9510565
1,0.951057,-0.951057,0.9510565,-0.5877853,0.5877853,-4.898587e-16
2,0.587785,0.587785,0.5877853,3.67394e-16,-2.449294e-16,0.5877853
3,0.951057,-0.587785,3.67394e-16,0.5877853,-0.9510565,3.67394e-16
4,0.587785,0.951057,3.67394e-16,-0.5877853,0.9510565,-0.9510565
5,0.951057,-0.587785,-0.5877853,-0.5877853,0.9510565,-0.9510565
6,0.951057,-0.951057,-4.898587e-16,-0.5877853,-0.5877853,-0.9510565
7,0.0,0.587785,0.0,0.9510565,0.9510565,-0.5877853


## Universal Functions: Index Alignment
For binary operations on two Series or DataFrame objects, Pandas will align indices in the process of performing the operation. This is very convenient when working with incomplete data, as we'll see in some of the examples that follow.

### Index alignment in Series

In [34]:
 IPL_Team= pd.Series({'Mumbai Indians': 14, 'Delhi Capitals': 13,
                  'Royal Challengers Bangalore': 14, 'Kings XI Punjab': 12}, name='IPL_Team')
points = pd.Series({'Mumbai Indians': 14, 'Delhi Capitals': 13,
                  'Royal Challengers Bangalore': 14, 'Kings XI Punjab': 12}, name='points')
print(IPL_Team)
points


Mumbai Indians                 14
Delhi Capitals                 13
Royal Challengers Bangalore    14
Kings XI Punjab                12
Name: IPL_Team, dtype: int64


Mumbai Indians                 14
Delhi Capitals                 13
Royal Challengers Bangalore    14
Kings XI Punjab                12
Name: points, dtype: int64

In [35]:
points / IPL_Team

Mumbai Indians                 1.0
Delhi Capitals                 1.0
Royal Challengers Bangalore    1.0
Kings XI Punjab                1.0
dtype: float64

In [36]:
IPL_Team.index | points.index

Index(['Mumbai Indians', 'Delhi Capitals', 'Royal Challengers Bangalore',
       'Kings XI Punjab'],
      dtype='object')

In [37]:
name1 = pd.Series(['aa', 'bb', 'cc', 'dd'], index=[0, 1, 2, 3])
name2 = pd.Series(['ee', 'ff', 'gg', 'hh'], index=[1, 2, 3, 4])
print(name1)
print(name2)
name2
name1 + name2 

0    aa
1    bb
2    cc
3    dd
dtype: object
1    ee
2    ff
3    gg
4    hh
dtype: object


0     NaN
1    bbee
2    ccff
3    ddgg
4     NaN
dtype: object

In [38]:
name1.add(name2, fill_value='kk')

0    aakk
1    bbee
2    ccff
3    ddgg
4    kkhh
dtype: object

## Ufuncs: Operations Between DataFrame and Series

## Data wrangling

Getting the data in the shape that we want is the single most time consuming task in the life of the Data Scientist. Sometimes it can be the most frustrating.

## Merge operations
By merging we mean combining different data sets by linking rows with one or more keys

In [39]:
df

Unnamed: 0,place,animals,rating,price,entertainment,Serie_no
0,forest,cat,10,78,7.8,0
1,house,dog,50,87,1.74,1
2,cage,tiger,10,90,9.0,2
3,park,cat,80,99,1.2375,3
4,forest,lion,90,110,1.222222,4


In [40]:
df2 = pd.DataFrame({"animals": ["cat", "dog", "tiger", "lion"], "strngth": [10, 50, 10, 90]})
df2

Unnamed: 0,animals,strngth
0,cat,10
1,dog,50
2,tiger,10
3,lion,90


In [41]:
df.merge(df2)

Unnamed: 0,place,animals,rating,price,entertainment,Serie_no,strngth
0,forest,cat,10,78,7.8,0,10
1,park,cat,80,99,1.2375,3,10
2,house,dog,50,87,1.74,1,50
3,cage,tiger,10,90,9.0,2,10
4,forest,lion,90,110,1.222222,4,90


In [42]:
df3 = pd.DataFrame({"animals": ["cat", "tiger"], "strength": ["10", "50"]})
df3
df.merge(df3, right_on='animals', left_on='animals')

Unnamed: 0,place,animals,rating,price,entertainment,Serie_no,strength
0,forest,cat,10,78,7.8,0,10
1,park,cat,80,99,1.2375,3,10
2,cage,tiger,10,90,9.0,2,50


In [43]:
df4 = pd.DataFrame({"animals": ["cat", "tiger"], "Population": ["10", "50"]})
df.merge(df4, how='outer')

Unnamed: 0,place,animals,rating,price,entertainment,Serie_no,Population
0,forest,cat,10,78,7.8,0,10.0
1,park,cat,80,99,1.2375,3,10.0
2,house,dog,50,87,1.74,1,
3,cage,tiger,10,90,9.0,2,50.0
4,forest,lion,90,110,1.222222,4,


In [44]:
df5 = pd.DataFrame({"animals": ["cat", "dog", "lion"], "strength": ["10", "80", "50"]})
print(df)
df.merge(df5, how='outer')

    place animals  rating  price  entertainment  Serie_no
0  forest     cat      10     78       7.800000         0
1   house     dog      50     87       1.740000         1
2    cage   tiger      10     90       9.000000         2
3    park     cat      80     99       1.237500         3
4  forest    lion      90    110       1.222222         4


Unnamed: 0,place,animals,rating,price,entertainment,Serie_no,strength
0,forest,cat,10,78,7.8,0,10.0
1,park,cat,80,99,1.2375,3,10.0
2,house,dog,50,87,1.74,1,80.0
3,cage,tiger,10,90,9.0,2,
4,forest,lion,90,110,1.222222,4,50.0


## Combining data with overlap

In [45]:
serie_x = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, 5.2],
                     index=[ 'e', 'm', 'd', 'l', 'b', 'a'])
serie_y = pd.Series(np.arange(len(serie_x), dtype=np.float64),
                 index=['e', 'm', 'd', 'l', 'b', 'a'])

In [46]:
serie_x

e    NaN
m    2.5
d    NaN
l    3.5
b    4.5
a    5.2
dtype: float64

In [47]:
serie_y

e    0.0
m    1.0
d    2.0
l    3.0
b    4.0
a    5.0
dtype: float64

In [48]:
pd.Series(np.where(pd.isnull(serie_x), serie_y, serie_x), index=serie_x.index)

e    0.0
m    2.5
d    2.0
l    3.5
b    4.5
a    5.2
dtype: float64

In [49]:
serie_x.combine_first(serie_y)

e    0.0
m    2.5
d    2.0
l    3.5
b    4.5
a    5.2
dtype: float64