# Pandas

## https://pandas.pydata.org/

# Pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool, built on top of the Python programming language.

- Series [1D]
- DataFrame [2D]
- Panel [3D, df in df] (removed in pandas 0.25; use a DataFrame with a MultiIndex instead)

In [1]:
import pandas as pd

In [2]:
# check version
pd.__version__

'1.5.3'

## pd.Series(data,index,dtype)

In [3]:
# empty Series
pd.Series()

  pd.Series()


Series([], dtype: float64)

In [4]:
pd.Series?

In [5]:
pd.Series(data=None,index=None,dtype=None)

  pd.Series(data=None,index=None,dtype=None)


Series([], dtype: float64)

In [6]:
# In a future pandas version the default dtype for an empty Series will be 'object' instead of 'float64'; pass dtype explicitly to silence the warning
pd.Series(dtype=object)

Series([], dtype: object)

# Features of a Series
- It is mutable in nature
- Array-like: supports indexing and slicing by default
- 1 Dimensional data structure
- we can manipulate Series
- duplicate data and index is allowed

In [7]:
# Deal with data
k = [10,23.44,'python',56000]
s = pd.Series(k)
s

0        10
1     23.44
2    python
3     56000
dtype: object

In [8]:
# int--> float-->complex--> str/object
pd.Series([1,'2',3.])

0      1
1      2
2    3.0
dtype: object

In [9]:
# series with str values
pd.Series(['A','B','C'])

0    A
1    B
2    C
dtype: object

In [13]:
pd.Series([1,2,3])

0    1
1    2
2    3
dtype: int64

In [11]:
pd.Series([1,2,3,4.]) #upcasting

0    1.0
1    2.0
2    3.0
3    4.0
dtype: float64

In [12]:
pd.Series([1,2,3,4,5j])

0    1.0+0.0j
1    2.0+0.0j
2    3.0+0.0j
3    4.0+0.0j
4    0.0+5.0j
dtype: complex128

In [14]:
a = pd.Series([1,2,3])
a.__sizeof__()

152

In [15]:
b = pd.Series(['1','2','3'])
b.__sizeof__()

302

In [16]:
a.dtype,b.dtype

(dtype('int64'), dtype('O'))

### Check the properties of Series

In [17]:
s

0        10
1     23.44
2    python
3     56000
dtype: object

In [18]:
#check dimension
s.ndim

1

In [19]:
#check total no. of elements(rows)
s.size

4

In [20]:
len(s)

4

In [22]:
s.count()

4

In [23]:
import numpy as np
t = pd.Series([np.nan,1,2,np.nan])
t
#NaN: Not a Number

0    NaN
1    1.0
2    2.0
3    NaN
dtype: float64

In [24]:
t.count()
# Return number of non-NA/null observations in the Series.

2

In [25]:
t.size # size counts every value, including NaN

4

In [26]:
# check number of rows and columns
# a Series is 1D, so it has no columns
s.shape

(4,)

In [27]:
# check data type
s.dtype

dtype('O')

In [28]:
# check memory usage in bytes
s.memory_usage()

160

In [29]:
s

0        10
1     23.44
2    python
3     56000
dtype: object

In [33]:
# fetch data only
s.values

array([10, 23.44, 'python', 56000], dtype=object)

In [31]:
# fetch index
s.index

RangeIndex(start=0, stop=4, step=1)

### Apply some statistical or Mathematical operations

In [34]:
a = pd.Series([120,1,9000,22,18])
a

0     120
1       1
2    9000
3      22
4      18
dtype: int64

In [35]:
# find out max value
a.max()

9000

In [36]:
max(a)

9000

In [37]:
# find out min
a.min()

1

In [38]:
# find out mean/average
a.mean()

1832.2

In [39]:
a

0     120
1       1
2    9000
3      22
4      18
dtype: int64

In [40]:
# median
#[4,10,2,1,7]--> sort records --> [1,2,4,7,10]--> middle value-->4
a.median()

22.0

In [None]:
#Q. mean vs median

In [None]:
a #1 18 22 120 9000

In [None]:
# Assignment: create a series for 40,2,4,670,7,90,1,20
# check all the properties for above Series

In [41]:
# mode: most frequent value (if every value occurs equally often, all of them are returned)
a.mode()

0       1
1      18
2      22
3     120
4    9000
dtype: int64

In [42]:
b = pd.Series([100,230,450,100,100])
b

0    100
1    230
2    450
3    100
4    100
dtype: int64

In [43]:
# 100 occurs most often
b.mode()

0    100
dtype: int64

In [44]:
n = pd.Series(['Mangesh','Roshni','Roshni','Suvarna'])
n

0    Mangesh
1     Roshni
2     Roshni
3    Suvarna
dtype: object

In [45]:
n.mode()

0    Roshni
dtype: object

In [None]:
# Assignment: check mode() on heterogeneous data

### Series Structure

In [46]:
print(dir(pd.Series))

['T', '_AXIS_LEN', '_AXIS_NAMES', '_AXIS_NUMBERS', '_AXIS_ORDERS', '_AXIS_TO_AXIS_NUMBER', '_HANDLED_TYPES', '__abs__', '__add__', '__and__', '__annotations__', '__array__', '__array_priority__', '__array_ufunc__', '__array_wrap__', '__bool__', '__class__', '__contains__', '__copy__', '__deepcopy__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__divmod__', '__doc__', '__eq__', '__finalize__', '__float__', '__floordiv__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__iadd__', '__iand__', '__ifloordiv__', '__imod__', '__imul__', '__init__', '__init_subclass__', '__int__', '__invert__', '__ior__', '__ipow__', '__isub__', '__iter__', '__itruediv__', '__ixor__', '__le__', '__len__', '__long__', '__lt__', '__matmul__', '__mod__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__nonzero__', '__or__', '__pos__', '__pow__', '__radd__', '__rand__', '__rdivmod__', '__reduce__', '__reduce_ex__', '__repr__', '

## Difference between methods and properties

In [47]:
# properties are accessed without calling them: no () needed
a.values

array([ 120,    1, 9000,   22,   18])

In [48]:
# methods must be called: use ()
a.max

<bound method NDFrame._add_numeric_operations.<locals>.max of 0     120
1       1
2    9000
3      22
4      18
dtype: int64>

In [49]:
# if the output shows a bound method, you have to call it with ()
a.max()

9000

In [50]:
type(a)

pandas.core.series.Series

In [51]:
# check the frequency count of each unique category
n

0    Mangesh
1     Roshni
2     Roshni
3    Suvarna
dtype: object

In [52]:
# when we want categorywise count then use value_counts()
n.value_counts()

Roshni     2
Mangesh    1
Suvarna    1
dtype: int64

In [53]:
# check unique records
n.unique()

array(['Mangesh', 'Roshni', 'Suvarna'], dtype=object)

In [54]:
# count unique records
n.nunique()

3

### append()


In [55]:
a,s

(0     120
 1       1
 2    9000
 3      22
 4      18
 dtype: int64,
 0        10
 1     23.44
 2    python
 3     56000
 dtype: object)

In [56]:
# Concatenate two or more Series.
# NOTE: Series.append is deprecated since pandas 1.4 and removed in 2.0; prefer pd.concat([a, s])
a.append(s)

  a.append(s)


0       120
1         1
2      9000
3        22
4        18
0        10
1     23.44
2    python
3     56000
dtype: object

In [57]:
# the duplicate index labels above can be avoided with ignore_index=True
a.append(s,ignore_index=True)

  a.append(s,ignore_index=True)


0       120
1         1
2      9000
3        22
4        18
5        10
6     23.44
7    python
8     56000
dtype: object

In [58]:
a.append([s,a,s])

  a.append([s,a,s])


0       120
1         1
2      9000
3        22
4        18
0        10
1     23.44
2    python
3     56000
0       120
1         1
2      9000
3        22
4        18
0        10
1     23.44
2    python
3     56000
dtype: object

## apply()

In [59]:
#s.apply(function)
a

0     120
1       1
2    9000
3      22
4      18
dtype: int64

In [60]:
# add 1000 to each number in the Series
a.apply(lambda num:num+1000)

0     1120
1     1001
2    10000
3     1022
4     1018
dtype: int64

In [61]:
a+1000

0     1120
1     1001
2    10000
3     1022
4     1018
dtype: int64

In [62]:
def square(num):
    """Return ``num`` raised to the power of two."""
    return pow(num, 2)

a.apply(square)

0       14400
1           1
2    81000000
3         484
4         324
dtype: int64

In [None]:
a # unchanged: apply() returns a new Series and does not modify a

In [63]:
n

0    Mangesh
1     Roshni
2     Roshni
3    Suvarna
dtype: object

In [64]:
# convert names to upper case
n.apply(lambda name:name.upper())

0    MANGESH
1     ROSHNI
2     ROSHNI
3    SUVARNA
dtype: object

In [None]:
# convert name into its len
n.apply(lambda name:len(name))

In [None]:
n.apply(len)

In [None]:
log = pd.Series(['1234-bar234-234234','4545-nmg43345-535545'])
log

In [None]:
log.apply(lambda num:num.split('-'))

In [None]:
log.apply(lambda num:num.split('-')[-1])

In [None]:
v = pd.Series(['Narayan Rane','Ajit Pawar'])
v

In [None]:
v.apply(lambda nm:nm.split())

In [None]:
#fetch first  name
v.apply(lambda nm:nm.split()[0])

In [None]:
pin = pd.Series(['Ap.Satara Karad 542451','Pune Katraj 234567'])
pin

In [None]:
'Pune Katraj 234567'.find('pune') #-1 means missing/absent

In [None]:
#fetch pincode
#pin.apply(lambda p: p.find('Pune'))
pin.apply(lambda data: data.split()[-1])

In [None]:
# using normal function
def find(p):
    """Return the last six characters of ``p``.

    Used in this notebook to pull the trailing pincode out of address
    strings such as 'Pune Katraj 234567'. A value shorter than six
    characters is returned whole.
    """
    pincode_width = 6
    return p[-pincode_width:]
pin.apply(find)

In [65]:
p = pd.Series(['Mr. Anil','Mr. Uddhav','Mrs. Rashmi','Mr. Aditya'])
p

0       Mr. Anil
1     Mr. Uddhav
2    Mrs. Rashmi
3     Mr. Aditya
dtype: object

In [67]:
p.apply(lambda data:data.split('.')[0])

0     Mr
1     Mr
2    Mrs
3     Mr
dtype: object

## astype(): for typecasting

In [68]:
import pandas as pd
#pd.Series(data,index,dtype)
a = pd.Series([20,3400,506,1,23,30])
a

0      20
1    3400
2     506
3       1
4      23
5      30
dtype: int64

In [69]:
#change data type of Series 'a'
a.astype('float')

0      20.0
1    3400.0
2     506.0
3       1.0
4      23.0
5      30.0
dtype: float64

In [70]:
r = pd.Series(['10','20','30'])
r

0    10
1    20
2    30
dtype: object

In [72]:
r.mean()

34010.0

In [73]:
# the result above is wrong: on string data the values were concatenated ('102030') and then divided by 3
# hence the Series must be converted to a numeric dtype first
# astype('int') converts it from object to int
r.astype('int').sum()

60

In [74]:
r.astype(int) # returns a new Series; r itself is not changed

0    10
1    20
2    30
dtype: int64

In [75]:
r

0    10
1    20
2    30
dtype: object

In [82]:
data = ['10','30','40']

pd.Series(data,dtype=float)

0    10.0
1    30.0
2    40.0
dtype: float64

In [83]:
data = [2.,4.,5.]
p = pd.Series(data)
p

0    2.0
1    4.0
2    5.0
dtype: float64

In [85]:
p.astype('int')

0    2
1    4
2    5
dtype: int64

### describe()

In [None]:
a = pd.Series([20,3400,506,1,23,30])
a

In [None]:
# Generate descriptive statistics.
# returns statistical information
a.describe()

In [None]:
# describe() reports numeric statistics by default; for object (string) data it reports count, unique, top and freq
nm = pd.Series(['amit','amol','akshay','amol','amol'])
nm

In [None]:
set(nm)

In [None]:
nm.describe()

## diff()

In [None]:
a

In [None]:
506 - 3400

In [None]:
a.diff()
# calculates the difference between consecutive values (current - previous); the first result is NaN

In [None]:
d = pd.Series([7,7,7,5,5,5,5,5,5])
d

In [None]:
d.diff()

# Working with the index

In [None]:
# At the time of creation
pd.Series([10,20,30,40,50])
# the default index starts at 0 and stops at n-1

In [None]:
# pd.Series(data,index,dtype)
pd.Series([10,20,30,40,50],['A','B','C','D','E'])

In [None]:
#change sequence
pd.Series(['A','B','C','D','E'],[10,20,30,40,50])

In [None]:
#change sequence
pd.Series(index=['A','B','C','D','E'],data=[10,20,30,40,50])

In [None]:
dt,index = [10,20,30,40,50],['A','B','C','D','E']
pd.Series(dt,index)

In [None]:
pd.Series(dt,range(101,106))

In [None]:
import numpy as np
y = pd.Series(dt,np.arange(21,26))
y

# if we want to change an index of already created Series

In [None]:
y

In [None]:
y.reset_index()

In [None]:
y.reset_index(drop=True)
# drop=True discards the old index instead of adding it as a column, so a Series is returned

In [None]:
y