# Creating a series

Series was a data structure used for single dimentional data

In [48]:
import pandas as pd
import numpy as np
from pandas import Series

In [49]:
s = Series([1,2,3,4,5])
s

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [50]:
# to get values from the series
print(s.values)

# to get the indexes
print(s.index)

[1 2 3 4 5]
RangeIndex(start=0, stop=5, step=1)


In [51]:
# create your own index
s = Series([100,200,300], index=[1,2,3])
s

1    100
2    200
3    300
dtype: int64

In [52]:
# create a series from a Python dictionary
dic = {'a':2, 'b':4, 'c': 5}
s = Series(dic)
s

a    2
b    4
c    5
dtype: int64

# Creat a DataFrame

* When we need to work with multi-dimensional data we need to use another data structure called the data frame
* The definition of data frame is: Tabular spreadsheet like data structure which contains ordered collection of columns which can be of different data types

In [53]:
from pandas import DataFrame

In [54]:
people = {'name':['Jim','Rob','Tom'],'age':[22,44,55],'location':['US','UK','AUS']}
df = pd.DataFrame(people)
df

Unnamed: 0,name,age,location
0,Jim,22,US
1,Rob,44,UK
2,Tom,55,AUS


In [55]:
df.location

0     US
1     UK
2    AUS
Name: location, dtype: object

In [56]:
# Get details of a person
df.loc[0]

name        Jim
age          22
location     US
Name: 0, dtype: object

In [57]:
# add another column 
df = pd.DataFrame(people, columns=['name','location','age','gmail'])
df

Unnamed: 0,name,location,age,gmail
0,Jim,US,22,
1,Rob,UK,44,
2,Tom,AUS,55,


In [58]:
gmail = Series(['jim@gmail.com', 'Rob@gmail.com', 'Tom@gamil.com'])
df.gmail = gmail
df

Unnamed: 0,name,location,age,gmail
0,Jim,US,22,jim@gmail.com
1,Rob,UK,44,Rob@gmail.com
2,Tom,AUS,55,Tom@gamil.com


# ReIndexing
In real world projects we will be reading data from different sources and store them into a dataframe. Many a times the by default index value we get wont be as appropriate -> re-index

In [59]:
s = pd.Series([10,20,30,40,50], index = ['a','b','c','d','e'])
s

a    10
b    20
c    30
d    40
e    50
dtype: int64

In [60]:
# re-index
s = s.reindex(['c','d','e','a','b'])
s

c    30
d    40
e    50
a    10
b    20
dtype: int64

## Operations with fill values

In [61]:
#create two series with different sizes
s1 = pd.Series([1,2,3,4,5], index = ['a','b','c','d','e'])
s2 = pd.Series([1,2,3,4,5,6,7,8], index =['a','b','c','d','e','f','g','h'])

In [62]:
s1 + s2

a     2.0
b     4.0
c     6.0
d     8.0
e    10.0
f     NaN
g     NaN
h     NaN
dtype: float64

In [63]:
d1 = pd.DataFrame(np.arange(12).reshape(3,4),columns=['a','b','c','d'])
d2 = pd.DataFrame(np.arange(16).reshape(4,4),columns=['a','b','c','d'])
d1

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [64]:
d1 + d2

Unnamed: 0,a,b,c,d
0,0.0,2.0,4.0,6.0
1,8.0,10.0,12.0,14.0
2,16.0,18.0,20.0,22.0
3,,,,


In [65]:
# to avoid getting such undefined values, instead of using normal addition we use
# arithmatic method with fill values

s1.add(s2, fill_value=0)
print(s1)

d1.add(d2, fill_value=0)

a    1
b    2
c    3
d    4
e    5
dtype: int64


Unnamed: 0,a,b,c,d
0,0.0,2.0,4.0,6.0
1,8.0,10.0,12.0,14.0
2,16.0,18.0,20.0,22.0
3,12.0,13.0,14.0,15.0


## Adding Series & Dataframe

In [66]:
s = pd.Series([1,2,3,4])

d = pd.DataFrame(np.arange(8).reshape(2,4))

s

0    1
1    2
2    3
3    4
dtype: int64

In [67]:
d - s

Unnamed: 0,0,1,2,3
0,-1,-1,-1,-1
1,3,3,3,3


In [68]:
d+s

Unnamed: 0,0,1,2,3
0,1,3,5,7
1,5,7,9,11


Meaning even if the series is only one-dimensional and data frame is two dimensional, it will broadcast the series onto the dataframe when the subtraction or addition operation takes place.

## Function Application & Mapping

In [69]:
d = pd.DataFrame(np.arange(16).reshape(4,4))

f = lambda x: x + 10
a = d.apply(f)
a

Unnamed: 0,0,1,2,3
0,10,11,12,13
1,14,15,16,17
2,18,19,20,21
3,22,23,24,25


In [70]:
d.apply(lambda x: x*x)

Unnamed: 0,0,1,2,3
0,0,1,4,9
1,16,25,36,49
2,64,81,100,121
3,144,169,196,225


## Sorting & Ranking
Sorting and ranking means arranging the indexes in ascending or descending order. Sometimes you might get a dataset when index values are not properly sorted

In [71]:
s = pd.Series([1,2,3,4], index=['d','c','b','a'])
s

d    1
c    2
b    3
a    4
dtype: int64

In [73]:
s.sort_index()

a    4
b    3
c    2
d    1
dtype: int64

In [74]:
df = pd.DataFrame(np.arange(16).reshape(4,4), index=[3,1,2,4], columns=['d','c','b','a'])
df

Unnamed: 0,d,c,b,a
3,0,1,2,3
1,4,5,6,7
2,8,9,10,11
4,12,13,14,15


In [75]:
df.sort_index()

Unnamed: 0,d,c,b,a
1,4,5,6,7
2,8,9,10,11
3,0,1,2,3
4,12,13,14,15


In [76]:
df.sort_index(axis=1)

Unnamed: 0,a,b,c,d
3,3,2,1,0
1,7,6,5,4
2,11,10,9,8
4,15,14,13,12


In [77]:
df = pd.DataFrame({'name':['phone','laptop','tablet'],'prices':[200,300,400]})
df

Unnamed: 0,name,prices
0,phone,200
1,laptop,300
2,tablet,400


In [78]:
df.sort_values(by='prices')

Unnamed: 0,name,prices
0,phone,200
1,laptop,300
2,tablet,400


## Finding Dupliacte index values
In some case when you read data from a dataset the dataset might contain some duplicate index values

In [79]:
# working with series
s = pd.Series([1,2,3,4,5],index=['a','b','c','d','a'])
s

a    1
b    2
c    3
d    4
a    5
dtype: int64

In [80]:
s.index.is_unique

False

In [82]:
# working with dataframe
df = pd.DataFrame(np.arange(6).reshape(2,3), columns=[1,2,3], index=['a','a'])
df

Unnamed: 0,1,2,3
a,0,1,2
a,3,4,5


In [83]:
df.index.is_unique

False