In [1]:
# import pandas, numpy library
import pandas as pd
import numpy as np

Notes:
> pandas is designed for working with tabular data or heterogeneous data.

> numpy is best suited for homogeneous numerical array data

## introduction to pandas data structures

In [2]:
# there are 2 structures mainly used in most of the tasks

# 1. Series
# 2. DataFrame


### series

In [3]:
# a Series is a one-dimensional, array-like sequence of values (all of the same type)
# paired with an array of data labels, called its index
# let's build a simple Series from a plain list

obj = pd.Series(data=[1, 2, 3, 4])
obj

0    1
1    2
2    3
3    4
dtype: int64

In [4]:
# as you can see in the code above, we didn't specify an index, so pandas uses the default one, which starts at 0 and ends at N-1
# we can access the index and the array of values separately

In [5]:
# the index attribute holds the labels of the Series
obj.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
# the array attribute exposes the values of the Series
obj.array

<PandasArray>
[1, 2, 3, 4]
Length: 4, dtype: int64

In [7]:
# check which class wraps the underlying values
type(obj.array)

pandas.core.arrays.numpy_.PandasArray

In [8]:
# create a Series with our own string labels as the index
obj2 = pd.Series(data=[4, 7, -5, 3], index=list('dbac'))
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [9]:
# the index now holds the string labels we passed in
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [10]:
# create a Series with our own integer labels as the index
# (the labels do not have to be sorted or consecutive)
obj3 = pd.Series(
    index=[3, 2, 6, 9],
    data=[4, 7, -5, 1],
)
obj3

3    4
2    7
6   -5
9    1
dtype: int64

In [11]:
# custom integer labels give an integer-typed index
obj3.index

Int64Index([3, 2, 6, 9], dtype='int64')

In [12]:
# the values are stored in the order given, independent of the index labels
obj3.array

<PandasArray>
[4, 7, -5, 1]
Length: 4, dtype: int64

In [13]:
# we can use index labels to access individual values
obj2['a']

-5

In [14]:
# assigning through a label updates that value in place
obj2['b']=8
obj2

d    4
b    8
a   -5
c    3
dtype: int64

In [15]:
# a list of labels selects several values, in the order requested
obj2[['a','b','c']]

a   -5
b    8
c    3
dtype: int64

In [16]:
# filtering with a boolean condition keeps only the matching entries, together with their labels
obj2[obj2 > 0]

d    4
b    8
c    3
dtype: int64

In [17]:
# arithmetic with a scalar is applied element-wise; the index is preserved
obj2 * 2

d     8
b    16
a   -10
c     6
dtype: int64

In [18]:
# numpy functions also work element-wise and keep the index (the result here is float64)
np.exp(obj2)

d      54.598150
b    2980.957987
a       0.006738
c      20.085537
dtype: float64

In [19]:
# another way to think about a Series is as a fixed-length, ordered dictionary
# that maps index labels to data values
obj2

d    4
b    8
a   -5
c    3
dtype: int64

In [20]:
# `in` checks whether a label is present in the index
'b' in obj2

True

In [21]:
# a label that is not in the index gives False
'e' in obj2

False

In [22]:
# create a Series from a dictionary: the keys become the index labels
# and the values become the data
sdata = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)
obj4 = pd.Series(sdata)
obj4

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [23]:
# a Series can be converted back into a dictionary by using the to_dict() method
dict_sdata = obj4.to_dict()
dict_sdata

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [24]:
# passing our own index controls which labels appear in the result and in what order;
# the values are taken from the dictionary by label
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj5 = pd.Series(data=sdata, index=states)
obj5  # 'California' is not a key of sdata, so it gets NaN (Not a Number), which marks a missing value

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [25]:
# pandas has two functions, isna and notna, to detect missing values in a Series or DataFrame
pd.isna(obj5)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [26]:
# notna is the opposite check: True where a value is present
pd.notna(obj5)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [27]:
# a Series also has these checks as instance methods
obj5.isna()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [28]:
# a useful feature of Series is that it aligns by index label in arithmetic operations
obj4

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [29]:
# obj5 has a 'California' label (NaN) and no 'Utah' label, unlike obj4
obj5

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [30]:
obj4+obj5 # values are matched by index label automatically; labels missing from either series give NaN

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [31]:
# both the Series itself and its index have a name attribute that we can set
obj5.index.name = 'states'
obj5.name = 'population'
obj5

states
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [32]:
# a Series index can also be modified after the Series has been created
obj

0    1
1    2
2    3
3    4
dtype: int64

In [33]:
# assigning a new list of labels to .index replaces the index in place
obj.index=['Bob','Steve','Jeff','Ryan']
obj

Bob      1
Steve    2
Jeff     3
Ryan     4
dtype: int64

## dataframe

In [34]:
# a dataframe is a rectangular table of data formed by rows and columns
# its columns can hold different types of data (boolean, int, string, etc.)
# there are many ways to create a dataframe; one of the most common is from a dictionary of equal-length lists or numpy arrays

In [35]:
# column name -> list of values; every list has the same length (6)
data = {
    'state': ['Ohio'] * 3 + ['Nevada'] * 3,
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}

# each dictionary key becomes a column; the rows get the default 0..N-1 index
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [36]:
# for a large dataframe we can use the head() method to show only the first five rows
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [37]:
# similarly, the tail() method shows the last five rows
frame.tail()

Unnamed: 0,state,year,pop
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [38]:
# if we specify a sequence of columns, the dataframe's columns will be arranged in that order
pd.DataFrame(data,columns=['year','pop','state'])

Unnamed: 0,year,pop,state
0,2000,1.5,Ohio
1,2001,1.7,Ohio
2,2002,3.6,Ohio
3,2001,2.4,Nevada
4,2002,2.9,Nevada
5,2003,3.2,Nevada


In [39]:
# if we pass a column name that is not a key in the data ('Gender' here),
# that column is still created and filled with NaN (missing values)
frame2 = pd.DataFrame(data,columns=['state','year','Gender'])
frame2

Unnamed: 0,state,year,Gender
0,Ohio,2000,
1,Ohio,2001,
2,Ohio,2002,
3,Nevada,2001,
4,Nevada,2002,
5,Nevada,2003,


In [40]:
# the column labels are themselves stored in an Index
frame2.columns

Index(['state', 'year', 'Gender'], dtype='object')

In [41]:
# we can retrieve a specific column of a dataframe as a series, either with
# dictionary-like notation (frame2['year']) or with dot attribute access (frame2.year)
print(frame)
print('dataframe type => ',type(frame))
print()
print(frame2['year'])
print('dataframe column type => ',type(frame2['year']))

    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
5  Nevada  2003  3.2
dataframe type =>  <class 'pandas.core.frame.DataFrame'>

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64
dataframe column type =>  <class 'pandas.core.series.Series'>


In [42]:
# the same column, retrieved with dot attribute access
frame2.year

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [43]:
# we can also retrieve specific rows by using the special indexing attributes
# 1. loc
# 2. iloc

In [44]:
# current contents of frame2, before selecting rows from it
frame2

Unnamed: 0,state,year,Gender
0,Ohio,2000,
1,Ohio,2001,
2,Ohio,2002,
3,Nevada,2001,
4,Nevada,2002,
5,Nevada,2003,


In [45]:
# loc selects a row by its index label; the row comes back as a Series
frame2.loc[1]

state     Ohio
year      2001
Gender     NaN
Name: 1, dtype: object

In [46]:
# iloc selects a row by its integer position; with the default 0..N-1 index
# this is the same row as loc[1]
frame2.iloc[1]

state     Ohio
year      2001
Gender     NaN
Name: 1, dtype: object

In [47]:
# positions start at 0, so iloc[2] is the third row
frame2.iloc[2]

state     Ohio
year      2002
Gender     NaN
Name: 2, dtype: object

In [48]:
# columns can be modified by assignment. for example, a debt column can be assigned a scalar value or an array of values
# (frame2 has no debt column yet, so this assignment creates it; the scalar is repeated for every row)

frame2['debt']=12.5
frame2

Unnamed: 0,state,year,Gender,debt
0,Ohio,2000,,12.5
1,Ohio,2001,,12.5
2,Ohio,2002,,12.5
3,Nevada,2001,,12.5
4,Nevada,2002,,12.5
5,Nevada,2003,,12.5


In [49]:
# assigning an array replaces the whole column; np.arange(6) gives one value (0..5) per row
frame2['debt'] = np.arange(6)
frame2

Unnamed: 0,state,year,Gender,debt
0,Ohio,2000,,0
1,Ohio,2001,,1
2,Ohio,2002,,2
3,Nevada,2001,,3
4,Nevada,2002,,4
5,Nevada,2003,,5


In [50]:
# when we assign a list or an array to a column, its length must be the same as the length of the dataframe.
# if we assign a series instead, its values are matched to the dataframe's rows by index label,
# and rows whose label is not in the series (0, 1 and 3 here) get missing values
val = pd.Series([-1.2,-1.5,-1.7],index=[2,4,5])
frame2['crdt'] = val
frame2

Unnamed: 0,state,year,Gender,debt,crdt
0,Ohio,2000,,0,
1,Ohio,2001,,1,
2,Ohio,2002,,2,-1.2
3,Nevada,2001,,3,
4,Nevada,2002,,4,-1.5
5,Nevada,2003,,5,-1.7


In [50]:
# assigning to a column that doesn't exist will create a new column

In [None]:
# the del keyword can be used to delete columns from a dataframe, just like with a dictionary. as an example, i will first add a boolean column
# whose condition is that the state column is equal to 'Ohio'

In [51]:
# flag the rows whose state is 'Ohio' by comparing the 'state' column element-wise.
# note: comparing the plain Python list `states` with 'Ohio' evaluates to a single False,
# which would be broadcast to every row instead of giving a per-row boolean.
# the column key 'estern' is kept unchanged in case later cells refer to it.
frame['estern'] = frame['state'] == 'Ohio'