In [1]:
# All imports go in this first cell, even though Python allows an import statement anywhere in the code
import numpy as np
import pandas as pd 

## Series

In [50]:
# Normal series and series with custom index.
# pandas and numpy come from the import cell at the top of the notebook,
# so they are not imported again here.
s1 = pd.Series([1, 2, 3])                               # default integer index
s2 = pd.Series([1, 2, 3, np.nan], index=list("abcd"))   # explicit labels a..d; last value is missing
print(s1)
print()
print(s2)

0    1
1    2
2    3
dtype: int64

a    1.0
b    2.0
c    3.0
d    NaN
dtype: float64


In [9]:
# astype: produce a converted Series with the requested dtype
converted = s1.astype('float64')
print(converted)

0    1.0
1    2.0
2    3.0
dtype: float64


In [17]:
# Basic Series attributes: shape, size, dtype, index, values,
# followed by the Python types of the value array and of the Series itself.
print(
    s1.shape,
    s1.size,
    s1.dtype,
    s1.index,
    s1.values,
    type(s1.values),
    type(s1),
    sep="\n",
)

# blank separator line, then the index of the second series
print()
print(s2.index)

(3,)
3
int64
RangeIndex(start=0, stop=3, step=1)
[1 2 3]
<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>

Index(['a', 'b', 'c', 'd'], dtype='object')


In [51]:
# Aggregations reduce a Series to a single value: min, max, sum.
# Each group of three results is followed by one blank line.
print(s1.min(), s1.max(), s1.sum(), sep="\n", end="\n\n")
print(s2.min(), s2.max(), s2.sum(), sep="\n", end="\n\n")

# unique: the distinct values of a Series
r = pd.Series([1, 11, 2, 2, 3])
print(r.unique())

1
3
6

1.0
3.0
6.0

[ 1 11  2  3]


In [6]:
# Arithmetic between Series returns a new Series and aligns the operands on
# their index labels (not on position): a label present in only one operand
# yields NaN. Here only label 2 exists in both series.

s1 = pd.Series([10, 20, 30])
s2 = pd.Series([1, 2, 3, 7], index=[2, 3, 4, 5])

# Named `total` rather than `sum` so the builtin sum() is not shadowed
# for the rest of the kernel session.
total = s1 + s2
print('Sum of series: ', total)
print()
diff = s1 - s2
print('Difference is: ', diff)
print()
mul = s1 * s2
print('Product is : ', mul)
print()
div = s1 / s2
print('Division gives: ', div)
print()

Sum of series:  0     NaN
1     NaN
2    31.0
3     NaN
4     NaN
5     NaN
dtype: float64

Difference is:  0     NaN
1     NaN
2    29.0
3     NaN
4     NaN
5     NaN
dtype: float64

Product is :  0     NaN
1     NaN
2    30.0
3     NaN
4     NaN
5     NaN
dtype: float64

Division gives:  0     NaN
1     NaN
2    30.0
3     NaN
4     NaN
5     NaN
dtype: float64



In [46]:
# indexing and slicing with the [] operator

s1 = pd.Series([10, 20, 30, 40])
s2 = pd.Series([1, 2, 3, 7], index=[2, 3, 4, 5])

# a slice produces a Series
print(s1[1:3])
print('Type of slicing: ', type(s1[1:2]))

# a single integer key on s2 is matched against its labels (2..5) and gives a scalar
print(s2[5])
print('Type of indexing: ', type(s2[5]), end="\n\n")

# slicing s2 with integer bounds
print('S2 slicing: ', s2[3:4], end="\n\n")

# a boolean list keeps the elements whose flag is True
print('Boolean indexing gives: ', s2[[True, True, False, True]])


1    20
2    30
dtype: int64
Type of slicing:  <class 'pandas.core.series.Series'>
7
Type of indexing:  <class 'numpy.int64'>

S2 slicing:  5    7
dtype: int64

Boolean indexing gives:  2    1
3    2
5    7
dtype: int64


In [42]:
# Slice with a step: start at position 2, take every second element.
# The stop value (10) lies beyond the end of the series; the slice simply ends at the last element.
q = pd.Series([10, 20, 30, 40, 50])
every_second = q[2:10:2]
print(q, every_second, sep="\n\n")

0    10
1    20
2    30
3    40
4    50
dtype: int64

2    30
4    50
dtype: int64


In [22]:
# Result type of label-based access: a single label gives a scalar,
# a label slice gives a Series.
# The series is built here because `s2` was rebound to an integer-indexed
# Series in an earlier cell, so `s2['c']` would fail when running top to bottom.
labelled = pd.Series([1, 2, 3, np.nan], index=list("abcd"))
print(type(labelled['c']))
print(type(labelled['c' :]))

<class 'numpy.float64'>
<class 'pandas.core.series.Series'>


In [55]:
# frequency : value_counts tallies how often each distinct value occurs

s1 = pd.Series(list('abcdddbc'))
r1 = s1.value_counts()

# the series itself, then its frequency table, each followed by a blank line
print(s1, s1.value_counts(), sep="\n\n", end="\n\n")

# same table ordered by label (ascending is the default for sort_index)
print(r1.sort_index())

0    a
1    b
2    c
3    d
4    d
5    d
6    b
7    c
dtype: object

d    3
b    2
c    2
a    1
dtype: int64

a    1
b    2
c    2
d    3
dtype: int64


## DataFrame

In [2]:
# Two ways to build a DataFrame: a dict of columns (df1) or a list of row tuples (df2)
df1 = pd.DataFrame(dict(
    Name=["Gaurav", "Abhishek1", "Krishna", "Abhishek2", "Harshita",
          "Joey", "Shweta", "na-1", "na-2", np.nan],
    Age=[0, 1, 2, 2, 1, 2, 3, 1, np.nan, 2],
    Gender=[0, 0, 0, 0, 1, 1, 1, np.nan, 0, 0],
))

# each tuple is one (Name, Number) row; the index labels are supplied explicitly
rows = [("Gaurav", 1), ("Abhiskek", 2), ("Krishna", 3), ("Abhishek2", 4)]
df2 = pd.DataFrame(rows, columns=["Name", "Number"], index=[0, 11, 2, 3])

print(df1, df2, sep="\n\n")

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0

         Name  Number
0      Gaurav       1
11   Abhiskek       2
2     Krishna       3
3   Abhishek2       4


In [70]:
# DataFrame attributes: shape, size, dtypes, index, columns, type, and the transpose (T)

print(
    df1.shape,
    df1.size,
    df1.dtypes,
    df1.index,
    df2.index,
    df1.columns,
    df2.columns,
    type(df1),
    df1.T,
    sep="\n",
)

(10, 3)
30
Name       object
Age       float64
Gender    float64
dtype: object
RangeIndex(start=0, stop=10, step=1)
Int64Index([0, 11, 2, 3], dtype='int64')
Index(['Name', 'Age', 'Gender'], dtype='object')
Index(['Name', 'Number'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
             0          1        2          3         4     5       6     7  \
Name    Gaurav  Abhishek1  Krishna  Abhishek2  Harshita  Joey  Shweta  na-1   
Age          0          1        2          2         1     2       3     1   
Gender       0          0        0          0         1     1       1   NaN   

           8    9  
Name    na-2  NaN  
Age      NaN    2  
Gender     0    0  


In [107]:
# Rename index and columns on df2.
# Inplace vs normal operation
#
# Normal operation: rename returns a new frame and leaves df2 untouched.
# Mapping keys must match the existing labels exactly (case-sensitive); with
# the default errors='ignore', keys that match nothing are skipped silently,
# which is why the keys here are "Number" (not "number").
t = df2.rename(columns={"Number": "Num"}, index={0: 4, 11: 12})
print(t)
print()

# In-place operation: df2 itself is modified and the call returns None,
# so `r` is None and the change is only visible on df2.
r = df2.rename(lambda x: x.upper(), axis=1, inplace=True)
print(r)
print(df2)

         Name  Num
4      Gaurav    1
12   Abhiskek    2
2     Krishna    3
3   Abhishek2    4

None
         NAME  NUMBER
0      Gaurav       1
11   Abhiskek       2
2     Krishna       3
3   Abhishek2       4


In [125]:
# Selecting a single column by its label; each selection is a Series.
# The three results are separated by blank lines.
print(df1.columns, df1["Gender"], df2["NAME"], sep="\n\n")

Index(['Name', 'Age', 'Gender'], dtype='object')

0    0.0
1    0.0
2    0.0
3    0.0
4    1.0
5    1.0
6    1.0
7    NaN
8    0.0
9    0.0
Name: Gender, dtype: float64

0        Gaurav
11     Abhiskek
2       Krishna
3     Abhishek2
Name: NAME, dtype: object


In [37]:
# A single column taken from a DataFrame is a Series

name_column = df1["Name"]
print(type(name_column))

<class 'pandas.core.series.Series'>


In [126]:
# head(n) / tail(n): first / last n rows; n defaults to 5 when no argument is given
print(df1.head(2), df1.tail(), sep="\n")

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
     Name  Age  Gender
5    Joey  2.0     1.0
6  Shweta  3.0     1.0
7    na-1  1.0     NaN
8    na-2  NaN     0.0
9     NaN  2.0     0.0


In [15]:
# describe: summary statistics for the whole frame and for its top / bottom rows.
# include='all' covers every column; include='object' restricts to object columns.
print('Dataframe: \n', df1, end="\n\n")
print('Top 5 rows: \n', df1.head().describe(include='all'), end="\n\n")
print('Bottom 5 rows: \n', df1.tail().describe(include='all'), end="\n\n")
print('All rows: \n', df1.describe(include='all'), end="\n\n")
print('All object type elements: \n', df1.describe(include='object'))

# count: number of non-NaN values per column
print(df1.count())

Dataframe: 
         Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0

Top 5 rows: 
              Name      Age    Gender
count           5  5.00000  5.000000
unique          5      NaN       NaN
top     Abhishek2      NaN       NaN
freq            1      NaN       NaN
mean          NaN  1.20000  0.200000
std           NaN  0.83666  0.447214
min           NaN  0.00000  0.000000
25%           NaN  1.00000  0.000000
50%           NaN  1.00000  0.000000
75%           NaN  2.00000  0.000000
max           NaN  2.00000  1.000000

Bottom 5 rows: 
         Name       Age   Gender
count      4  4.000000  4.00000
unique     4       NaN      NaN
top     Joey       NaN      NaN
freq       1       NaN      NaN
mean     NaN  2.000000  0.50000
std      NaN  0.816497  0.57735


In [80]:
# indexing and slicing
# loc selects by label, iloc by integer position; both take [rows, cols]
print(
    df1['Gender'],                          # one column
    df1.loc[0:2],                           # label slice of rows
    df1.loc[0:4:2],                         # label slice with a step
    df1.loc[[0, 2, 5]],                     # explicit list of row labels
    df1.loc[[2, 3, 7], ['Name', 'Age']],    # row labels and column labels
    df1.loc[::3, ['Name', 'Age']],          # every third row, two columns by label
    df1.iloc[::3, [0, 2]],                  # every third row, columns by position
    df1.iloc[0:2],                          # positional row slice
    df1.iloc[:4, [0, 1]],                   # first four rows, first two columns
    df1.iloc[[1, 4, 5], [1, 2]],            # row positions and column positions
    sep="\n\n",
)

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0

        Name  Age
0     Gaurav  0.0
1  Abhishek1  1.0
2    Krishna  2.0
3  Abhishek2  2.0

   Age  Gender
1  1.0     0.0
4  1.0     1.0
5  2.0     1.0


In [31]:
# aggregate operations (with and without axis : row-1, col-0)
# min, max, count, ...
#
# df1 mixes a string column (Name, which also contains NaN) with numeric
# columns. min/max/sum are restricted to the numeric columns explicitly with
# numeric_only=True instead of relying on pandas to drop Name silently;
# pandas 2.0 changed that default, so the explicit flag keeps these calls
# working there. count() works on every dtype and is left unrestricted.

print('Dataframe:\n',df1)
print()
print('Min :\n',df1.min(numeric_only=True))  # min of each numeric column
print()
print('max:\n',df1.max(numeric_only=True))
print()
print('Sum:\n',df1.sum(numeric_only=True))
print()
print('Min from rows: \n',df1.min(axis=1, numeric_only=True))  # min of the numeric values in each row
print()
print('Max from rows: \n',df1.max(axis=1, numeric_only=True))
print()
print('Sum from rows: \n',df1.sum(axis=1, numeric_only=True))
print()
print('Count from rows: \n',df1.count(axis=1))     # non-NaN values per row (Name included)
print()
print('Count from columns: \n',df1.count(axis=0))  # non-NaN values per column
print()
print(df1.Gender.min())  # aggregate of a single column gives a scalar

Dataframe:
         Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0

Min :
 Age       0.0
Gender    0.0
dtype: float64

max:
 Age       3.0
Gender    1.0
dtype: float64

Sum:
 Age       14.0
Gender     3.0
dtype: float64

Min from rows: 
 0    0.0
1    0.0
2    0.0
3    0.0
4    1.0
5    1.0
6    1.0
7    1.0
8    0.0
9    0.0
dtype: float64

Max from rows: 
 0    0.0
1    1.0
2    2.0
3    2.0
4    1.0
5    2.0
6    3.0
7    1.0
8    0.0
9    2.0
dtype: float64

Sum from rows: 
 0    0.0
1    1.0
2    2.0
3    2.0
4    2.0
5    3.0
6    4.0
7    1.0
8    0.0
9    2.0
dtype: float64

Count from rows: 
 0    3
1    3
2    3
3    3
4    3
5    3
6    3
7    2
8    2
9    2
dtype: int64

Count from columns: 
 Name      9
Age       9
Gender    9
dtype: int64

0.0


In [36]:
# arithmetic operations between columns of two frames align on index labels:
# a label missing from either operand yields NaN in the result.
# df2's columns were upper-cased in place by the rename cell above, so the
# column is "NUMBER" here (df2.Number no longer exists at this point).
print(df1["Age"].head() + df2["NUMBER"])
print(df1["Age"] - df2["NUMBER"])

0     1.0
1     NaN
2     5.0
3     6.0
4     NaN
11    NaN
dtype: float64
0    -1.0
1     NaN
2    -1.0
3    -2.0
4     NaN
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
11    NaN
dtype: float64


In [48]:
# Na Values: isna, bfill / ffill, dropna
# Every call below returns a new frame; df1 is deliberately left unmodified so
# that later cells still see the original missing values.
# bfill()/ffill() replace the deprecated fillna(method='bfill'/'ffill') spelling.
print(df1.isna())      # True where a value is missing
print(df1)
print()
print(df1.bfill())     # fill each gap with the next valid value below it
print()
print(df1.ffill())     # fill each gap with the last valid value above it
print()
print(df1.dropna())    # drop every row that contains at least one NaN

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0


In [53]:
# any / all: truthiness checks along an axis (axis=1 per row, axis=0 per column, the default)
print(
    df1.any(axis=1),
    df1.all(axis=1),
    df1.all(),
    df1.any(axis=0),
    sep="\n\n",
)

0    True
1    True
2    True
3    True
4    True
5    True
6    True
7    True
8    True
9    True
dtype: bool

0    False
1    False
2    False
3    False
4     True
5     True
6     True
7     True
8    False
9    False
dtype: bool

Name       True
Age       False
Gender    False
dtype: bool

Name      True
Age       True
Gender    True
dtype: bool


In [60]:
# ordering data with sort_values: rows by Age (ascending, the default), then by Name (descending)
by_age = df1.sort_values('Age')
by_name_desc = df1.sort_values(by='Name', ascending=False)
print(by_age, by_name_desc, sep="\n\n")

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
4   Harshita  1.0     1.0
7       na-1  1.0     NaN
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
5       Joey  2.0     1.0
9        NaN  2.0     0.0
6     Shweta  3.0     1.0
8       na-2  NaN     0.0

        Name  Age  Gender
8       na-2  NaN     0.0
7       na-1  1.0     NaN
6     Shweta  3.0     1.0
2    Krishna  2.0     0.0
5       Joey  2.0     1.0
4   Harshita  1.0     1.0
0     Gaurav  0.0     0.0
3  Abhishek2  2.0     0.0
1  Abhishek1  1.0     0.0
9        NaN  2.0     0.0


In [101]:
# row / column wise operation: apply
# A seeded generator keeps the demo frame identical on every run, so the
# printed tables are reproducible after a kernel restart.
rng = np.random.default_rng(42)
tmp = pd.DataFrame(rng.integers(10, 20, size=(5, 4)), columns=['A', 'B', 'C', 'D'])
print(tmp)
print()
print(tmp.apply(lambda x: x * 2))           # on a DataFrame: the function receives each column
print()
print(tmp.A.apply(lambda x: x * 2))         # on a Series: the function receives each element
print(tmp.loc[4].apply(lambda x: x * 2))    # the row labelled 4, taken as a Series



    A   B   C   D
0  10  16  17  18
1  18  12  12  11
2  14  12  18  15
3  13  12  15  13
4  18  17  10  15

    A   B   C   D
0  20  32  34  36
1  36  24  24  22
2  28  24  36  30
3  26  24  30  26
4  36  34  20  30

0    20
1    36
2    28
3    26
4    36
Name: A, dtype: int64
A    36
B    34
C    20
D    30
Name: 4, dtype: int64


In [121]:
# replace: substitute values in a DataFrame (exact value match or regular expression).
# Note: this is DataFrame.replace, not the .str accessor.
# A seeded generator keeps the numeric demo frame identical on every run.
rng = np.random.default_rng(42)
tmp = pd.DataFrame(rng.integers(10, 20, size=(5, 4)), columns=['A', 'B', 'C', 'D'])
print(tmp)
print()
print(tmp.replace(to_replace=11, value=np.nan))   # every cell equal to 11 becomes NaN
print()
df = pd.DataFrame({'A': ['bat', 'foo', 'bait'],
                   'B': ['abc', 'bar', 'xyz']})
print(df)
# regex=True: any string starting with "b" is replaced as a whole by 'newword'
print(df.replace(to_replace=r'^b.*', value='newword', regex=True))

      A    B
0   bat  abc
1   foo  bar
2  bait  xyz
         A        B
0  newword      abc
1      foo  newword
2  newword      xyz


In [2]:
# load data: read a CSV file into a DataFrame
# (the path is relative to the notebook's working directory; requires the
# import cell at the top to have been run, otherwise `pd` is undefined)
load = pd.read_csv('titanic_filtered.csv')
# Show a short preview instead of printing the whole frame.
print(load.head(10))

NameError: name 'pd' is not defined

In [7]:
# groups: groupby
# Grouping by the column name itself (a scalar rather than a one-element list)
# keeps the group keys as plain Age values.
# To group on several columns pass a list, e.g. df1.groupby(['Age', 'Gender']).
grp = df1.groupby('Age')
print(grp.groups)   # mapping: group key -> index labels of the rows in that group
# numeric_only=True: Name holds strings and cannot be averaged, so only the
# numeric columns are aggregated.
print(grp.mean(numeric_only=True))

{0.0: Int64Index([0], dtype='int64'), 1.0: Int64Index([1, 4, 7], dtype='int64'), 2.0: Int64Index([2, 3, 5, 9], dtype='int64'), 3.0: Int64Index([6], dtype='int64')}
     Gender
Age        
0.0    0.00
1.0    0.50
2.0    0.25
3.0    1.00


In [10]:
# groups and aggregates (uses the `grp` GroupBy object created in the previous cell)
# mean and sum are restricted to numeric columns: the Name column holds
# strings (and a NaN) and is excluded explicitly rather than implicitly.
print(grp.mean(numeric_only=True))
print(grp.sum(numeric_only=True))
print(grp.count())   # count works on every dtype: non-NaN values per group and column


     Gender
Age        
0.0    0.00
1.0    0.50
2.0    0.25
3.0    1.00
     Gender
Age        
0.0     0.0
1.0     1.0
2.0     1.0
3.0     1.0
     Name  Gender
Age              
0.0     1       1
1.0     3       2
2.0     3       4
3.0     1       1
{0.0: Int64Index([0], dtype='int64'), 1.0: Int64Index([1, 4, 7], dtype='int64'), 2.0: Int64Index([2, 3, 5, 9], dtype='int64'), 3.0: Int64Index([6], dtype='int64')}
