In [1]:
# all imports go at the top, even though python is a very open language
import pandas as pd
import numpy as np

In [2]:
print(dir(pd))

['BooleanDtype', 'Categorical', 'CategoricalDtype', 'CategoricalIndex', 'DataFrame', 'DateOffset', 'DatetimeIndex', 'DatetimeTZDtype', 'ExcelFile', 'ExcelWriter', 'Float64Index', 'Grouper', 'HDFStore', 'Index', 'IndexSlice', 'Int16Dtype', 'Int32Dtype', 'Int64Dtype', 'Int64Index', 'Int8Dtype', 'Interval', 'IntervalDtype', 'IntervalIndex', 'MultiIndex', 'NA', 'NaT', 'NamedAgg', 'Period', 'PeriodDtype', 'PeriodIndex', 'RangeIndex', 'Series', 'SparseDtype', 'StringDtype', 'Timedelta', 'TimedeltaIndex', 'Timestamp', 'UInt16Dtype', 'UInt32Dtype', 'UInt64Dtype', 'UInt64Index', 'UInt8Dtype', '__builtins__', '__cached__', '__doc__', '__docformat__', '__file__', '__getattr__', '__git_version__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', '_config', '_hashtable', '_is_numpy_dev', '_lib', '_libs', '_np_version_under1p14', '_np_version_under1p15', '_np_version_under1p16', '_np_version_under1p17', '_np_version_under1p18', '_testing', '_tslib', '_typing', '_versio

In [3]:
print(dir(pd.DataFrame))

['T', '_AXIS_ALIASES', '_AXIS_IALIASES', '_AXIS_LEN', '_AXIS_NAMES', '_AXIS_NUMBERS', '_AXIS_ORDERS', '_AXIS_REVERSED', '__abs__', '__add__', '__and__', '__annotations__', '__array__', '__array_priority__', '__array_wrap__', '__bool__', '__class__', '__contains__', '__copy__', '__deepcopy__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__div__', '__doc__', '__eq__', '__finalize__', '__floordiv__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__iadd__', '__iand__', '__ifloordiv__', '__imod__', '__imul__', '__init__', '__init_subclass__', '__invert__', '__ior__', '__ipow__', '__isub__', '__iter__', '__itruediv__', '__ixor__', '__le__', '__len__', '__lt__', '__matmul__', '__mod__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__nonzero__', '__or__', '__pos__', '__pow__', '__radd__', '__rand__', '__rdiv__', '__reduce__', '__reduce_ex__', '__repr__', '__rfloordiv__', '__rmatmul__', '__rmod__', '__rmul

## Series

In [4]:
# Normal series and series with custom index
s1 = pd.Series([1,2,3])
s2 = pd.Series([1,2,3, np.nan], index=list("abcd"))
print(s1)
print()
print(s2)

0    1
1    2
2    3
dtype: int64

a    1.0
b    2.0
c    3.0
d    NaN
dtype: float64


In [5]:
# changing dtype: astype
r=s2.astype("str")
print(r)
print(r.values)

# make sure no nan data is there before astype
print()
r = s2.fillna(0)
print(r)
r  = r.astype(np.int8)
print(r)

a    1.0
b    2.0
c    3.0
d    nan
dtype: object
['1.0' '2.0' '3.0' 'nan']

a    1.0
b    2.0
c    3.0
d    0.0
dtype: float64
a    1
b    2
c    3
d    0
dtype: int8


In [6]:
# shape, size, dtype, index, values
print(s1.shape)
print(s1.size)
print(s1.dtype)
print(s1.index)
print(s1.values)

print()
print(s2.shape)
print(s2.size)
print(s2.dtype)
print(s2.index)
print(s2.values)

(3,)
3
int64
RangeIndex(start=0, stop=3, step=1)
[1 2 3]

(4,)
4
float64
Index(['a', 'b', 'c', 'd'], dtype='object')
[ 1.  2.  3. nan]


In [7]:
# aggregate operations: min, max, unique, sum etc.

print(s1.min())
print(s2.min())
print(s1.unique())

1
1.0
[1 2 3]


In [8]:
# arithmetic operations(new copy) and broadcast like numpy

In [9]:
# indexing and slicing
print(s1)

print()
print(s1[0])

print()
print(s1[::2])

print()
idx = s1 % 2 == 0
print(idx)
print(s1[idx])

print()
print(s1[[1,2]])

0    1
1    2
2    3
dtype: int64

1

0    1
2    3
dtype: int64

0    False
1     True
2    False
dtype: bool
1    2
dtype: int64

1    2
2    3
dtype: int64


In [10]:
# result of operations as index
sc = s1.copy()
print(sc)
print()

sc[0] = 100
print(sc)

sc[1:] = 200
print(sc)

0    1
1    2
2    3
dtype: int64

0    100
1      2
2      3
dtype: int64
0    100
1    200
2    200
dtype: int64


In [11]:
# frequency : value_counts
sc.value_counts()

200    2
100    1
dtype: int64

## DataFrame

In [12]:
# dataframe from dict or list of rows
df1 = pd.DataFrame(
    {
        "Name":   ["Gaurav", "Abhishek1", "Krishna", "Abhishek2", "Harshita", "Joey", "Shweta", "na-1" , "na-2"  , np.nan],
        "Age":    [ 0      , 1          , 2        , 2          , 1         , 2     , 3       , 1      ,  np.nan , 2     ],
        "Gender": [ 0      , 0          , 0        , 0          , 1         , 1     , 1       , np.nan ,  0      , 0     ]
    }
)
df2 = pd.DataFrame([("Gaurav",1), ("Abhiskek",2), ("Krishna",3), ("Abhishek2",4)],
                    index=[0,11,2,3], 
                    columns=["Name", "Number"])
print(df1)
print()
print(df2)

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0

         Name  Number
0      Gaurav       1
11   Abhiskek       2
2     Krishna       3
3   Abhishek2       4


In [13]:
# shape, size, dtypes, index, columns, unique, T
print(df1.shape)
print(df1.size)
print()
print(df1.dtypes)
print(type(df1.dtypes))

print(df1.dtypes.unique())

(10, 3)
30

Name       object
Age       float64
Gender    float64
dtype: object
<class 'pandas.core.series.Series'>
[dtype('O') dtype('float64')]


In [14]:
# Rename index and columns on df2.
# Inplace vs normal operation
r = df1.rename({'Age':'A', 'Gender':1}, axis=1)
print(r)
print()
print(df1)



        Name    A    1
0     Gaurav  0.0  0.0
1  Abhishek1  1.0  0.0
2    Krishna  2.0  0.0
3  Abhishek2  2.0  0.0
4   Harshita  1.0  1.0
5       Joey  2.0  1.0
6     Shweta  3.0  1.0
7       na-1  1.0  NaN
8       na-2  NaN  0.0
9        NaN  2.0  0.0

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0


In [15]:
r = df1.rename(columns={'Age':'A'})
print(r)
print()
print(df1)

        Name    A  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0


In [16]:
r = df1.rename(columns={'Age':'A', "Gender": 999}, inplace=True) # don't need to store inside r
print(r)
print()
print(df1)

None

        Name    A  999
0     Gaurav  0.0  0.0
1  Abhishek1  1.0  0.0
2    Krishna  2.0  0.0
3  Abhishek2  2.0  0.0
4   Harshita  1.0  1.0
5       Joey  2.0  1.0
6     Shweta  3.0  1.0
7       na-1  1.0  NaN
8       na-2  NaN  0.0
9        NaN  2.0  0.0


In [17]:
pd.read_sql_query()

TypeError: read_sql_query() missing 2 required positional arguments: 'sql' and 'con'

In [18]:
# Indexing columns directly and via index.
print(df1)
print(df1.Name)


        Name    A  999
0     Gaurav  0.0  0.0
1  Abhishek1  1.0  0.0
2    Krishna  2.0  0.0
3  Abhishek2  2.0  0.0
4   Harshita  1.0  1.0
5       Joey  2.0  1.0
6     Shweta  3.0  1.0
7       na-1  1.0  NaN
8       na-2  NaN  0.0
9        NaN  2.0  0.0
0       Gaurav
1    Abhishek1
2      Krishna
3    Abhishek2
4     Harshita
5         Joey
6       Shweta
7         na-1
8         na-2
9          NaN
Name: Name, dtype: object


In [19]:
print(df1)
print()
print(df1["Name"])

        Name    A  999
0     Gaurav  0.0  0.0
1  Abhishek1  1.0  0.0
2    Krishna  2.0  0.0
3  Abhishek2  2.0  0.0
4   Harshita  1.0  1.0
5       Joey  2.0  1.0
6     Shweta  3.0  1.0
7       na-1  1.0  NaN
8       na-2  NaN  0.0
9        NaN  2.0  0.0

0       Gaurav
1    Abhishek1
2      Krishna
3    Abhishek2
4     Harshita
5         Joey
6       Shweta
7         na-1
8         na-2
9          NaN
Name: Name, dtype: object


In [31]:
a = np.arange(12).reshape(3,4)
print(a)

print(type(a[1][1]))
print(type(a[1]), a[1])
print(type(a[1:4]), a[1:4])

print()
print(type(a[1:2,1:1]), a[1:2,1:2])
print(type(a[1][1:2]), a[1][1:2])
print(type(a[1][1]), a[1][1])

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
<class 'numpy.int64'>
<class 'numpy.ndarray'> [4 5 6 7]
<class 'numpy.ndarray'> [[ 4  5  6  7]
 [ 8  9 10 11]]

<class 'numpy.ndarray'> [[5]]
<class 'numpy.ndarray'> [5]
<class 'numpy.int64'> 5


In [21]:
print(df1)
print()
print(df1[999])
# print(df1.999)

        Name    A  999
0     Gaurav  0.0  0.0
1  Abhishek1  1.0  0.0
2    Krishna  2.0  0.0
3  Abhishek2  2.0  0.0
4   Harshita  1.0  1.0
5       Joey  2.0  1.0
6     Shweta  3.0  1.0
7       na-1  1.0  NaN
8       na-2  NaN  0.0
9        NaN  2.0  0.0

0    0.0
1    0.0
2    0.0
3    0.0
4    1.0
5    1.0
6    1.0
7    NaN
8    0.0
9    0.0
Name: 999, dtype: float64


In [67]:
print(df1[3:9])

print()
print(df2[1:3]["Name"])

print()
r =  df2[1:3]["Name"]
#r[11] = "ABCD"
print(df2)
print(r)

print();print();print()
r = df2.loc[11:3, "Name"]
print(r)
r[2] = "PQR"
print(df2)


        Name    A  999
3  Abhishek2  2.0  0.0
4   Harshita  1.0  1.0
5       Joey  2.0  1.0
6     Shweta  3.0  1.0
7       na-1  1.0  NaN
8       na-2  NaN  0.0

11    ABCD
2      PQR
Name: Name, dtype: object

         Name  Number
0      Gaurav       1
11       ABCD       2
2         PQR       3
3   Abhishek2       4
11    ABCD
2      PQR
Name: Name, dtype: object



11         ABCD
2           PQR
3     Abhishek2
Name: Name, dtype: object
         Name  Number
0      Gaurav       1
11       ABCD       2
2         PQR       3
3   Abhishek2       4


In [73]:
print(df1)
#print(dir(df1.Name.str))
print()
print(df1.Name.str.len())

print()
df1["NameLen"] = df1.Name.str.len()
print(df1.head(2))

print()
df1.Name = df1.Name.str.upper()
print(df1.head(2))

        Name    A  999  NameLen
0     Gaurav  0.0  0.0      6.0
1  Abhishek1  1.0  0.0      9.0
2    Krishna  2.0  0.0      7.0
3  Abhishek2  2.0  0.0      9.0
4   Harshita  1.0  1.0      8.0
5       Joey  2.0  1.0      4.0
6     Shweta  3.0  1.0      6.0
7       na-1  1.0  NaN      4.0
8       na-2  NaN  0.0      4.0
9        NaN  2.0  0.0      NaN

0    6.0
1    9.0
2    7.0
3    9.0
4    8.0
5    4.0
6    6.0
7    4.0
8    4.0
9    NaN
Name: Name, dtype: float64

        Name    A  999  NameLen
0     Gaurav  0.0  0.0      6.0
1  Abhishek1  1.0  0.0      9.0

        Name    A  999  NameLen
0     GAURAV  0.0  0.0      6.0
1  ABHISHEK1  1.0  0.0      9.0


In [83]:
print(df1.head(2))

print()
df1.NameLen = 10
print(df1.head(2))

        Name    A  999  NameLen
0     GAURAV  0.0  0.0       10
1  ABHISHEK1  1.0  0.0       10

        Name    A  999  NameLen
0     GAURAV  0.0  0.0       10
1  ABHISHEK1  1.0  0.0       10


In [23]:
# Series in a DataFrame
print(type(df1.Name))

<class 'pandas.core.series.Series'>


In [33]:
# describe, top/bottom rows of data
# head, tail, describe

print(df1.head())

        Name    A  999
0     Gaurav  0.0  0.0
1  Abhishek1  1.0  0.0
2    Krishna  2.0  0.0
3  Abhishek2  2.0  0.0
4   Harshita  1.0  1.0


In [34]:
df1.tail()

Unnamed: 0,Name,A,999
5,Joey,2.0,1.0
6,Shweta,3.0,1.0
7,na-1,1.0,
8,na-2,,0.0
9,,2.0,0.0


In [35]:
df1.head(2)

Unnamed: 0,Name,A,999
0,Gaurav,0.0,0.0
1,Abhishek1,1.0,0.0


In [36]:
df1.tail(5).head(2)

Unnamed: 0,Name,A,999
5,Joey,2.0,1.0
6,Shweta,3.0,1.0


In [40]:
print(df1.shape)
print(df1.describe())

print()
print(df1.describe(include='all'))

(10, 3)
              A       999
count  9.000000  9.000000
mean   1.555556  0.333333
std    0.881917  0.500000
min    0.000000  0.000000
25%    1.000000  0.000000
50%    2.000000  0.000000
75%    2.000000  1.000000
max    3.000000  1.000000

        Name         A       999
count      9  9.000000  9.000000
unique     9       NaN       NaN
top     na-2       NaN       NaN
freq       1       NaN       NaN
mean     NaN  1.555556  0.333333
std      NaN  0.881917  0.500000
min      NaN  0.000000  0.000000
25%      NaN  1.000000  0.000000
50%      NaN  2.000000  0.000000
75%      NaN  2.000000  1.000000
max      NaN  3.000000  1.000000


In [87]:
print(dir(df1))

['A', 'Name', 'NameLen', 'T', '_AXIS_ALIASES', '_AXIS_IALIASES', '_AXIS_LEN', '_AXIS_NAMES', '_AXIS_NUMBERS', '_AXIS_ORDERS', '_AXIS_REVERSED', '__abs__', '__add__', '__and__', '__annotations__', '__array__', '__array_priority__', '__array_wrap__', '__bool__', '__class__', '__contains__', '__copy__', '__deepcopy__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__div__', '__doc__', '__eq__', '__finalize__', '__floordiv__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__iadd__', '__iand__', '__ifloordiv__', '__imod__', '__imul__', '__init__', '__init_subclass__', '__invert__', '__ior__', '__ipow__', '__isub__', '__iter__', '__itruediv__', '__ixor__', '__le__', '__len__', '__lt__', '__matmul__', '__mod__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__nonzero__', '__or__', '__pos__', '__pow__', '__radd__', '__rand__', '__rdiv__', '__reduce__', '__reduce_ex__', '__repr__', '__rfloordiv__', '__rmatmul

In [88]:
# indexing and slicing
# loc, iloc [rows, cols]
print(df2)

df2.reindex(columns=["Number", "Name"])

d(mapper/arg , index, columns, axis)

         Name  Number
0      Gaurav       1
11       ABCD       2
2         PQR       3
3   Abhishek2       4


Unnamed: 0,Number,Name
0,1,Gaurav
11,2,ABCD
2,3,PQR
3,4,Abhishek2


In [47]:
# loc : uses the indexes used in DF
print(df2.loc[0])

print()
print(df2.loc[11:3])

print()
print(df2.loc[11:3])

print()
print(df2.loc[11:3, "Name"])

Name      Gaurav
Number         1
Name: 0, dtype: object

         Name  Number
11   Abhiskek       2
2     Krishna       3
3   Abhishek2       4

         Name  Number
11   Abhiskek       2
2     Krishna       3
3   Abhishek2       4

11     Abhiskek
2       Krishna
3     Abhishek2
Name: Name, dtype: object


In [51]:

print()
print(df2.loc[11:3, "Name":])

print()
print(df2.loc[ [11,3,2] , "Name":])

print()
print(df2.loc[ : , "Name"])


         Name  Number
11   Abhiskek       2
2     Krishna       3
3   Abhishek2       4

         Name  Number
11   Abhiskek       2
3   Abhishek2       4
2     Krishna       3

0        Gaurav
11     Abhiskek
2       Krishna
3     Abhishek2
Name: Name, dtype: object


In [56]:
# iloc : uses numeric positional indexes
print(df2.iloc[ 1:])

print()
print(df2.iloc[ 1:, 1])

print()
print(df2.iloc[ 1:2, 0])

print()
print(df2)

         Name  Number
11   Abhiskek       2
2     Krishna       3
3   Abhishek2       4

11    2
2     3
3     4
Name: Number, dtype: int64

11    Abhiskek
Name: Name, dtype: object

         Name  Number
0      Gaurav       1
11   Abhiskek       2
2     Krishna       3
3   Abhishek2       4


In [94]:
print(df1.tail())
print()
print(df1.sum(axis=0))
print()
print(df1.sum(axis=1))

     Name    A  999  NameLen
5    JOEY  2.0  1.0       10
6  SHWETA  3.0  1.0       10
7    NA-1  1.0  NaN       10
8    NA-2  NaN  0.0       10
9     NaN  2.0  0.0       10

A           14.0
999          3.0
NameLen    100.0
dtype: float64

0    10.0
1    11.0
2    12.0
3    12.0
4    12.0
5    13.0
6    14.0
7    11.0
8    10.0
9    12.0
dtype: float64


In [None]:
# aggregate operations (with and without axis : row-1, col-0)
# min, max, count, ...
df1.count()

In [98]:
# arithmetic operations
df1 * 2

Unnamed: 0,Name,A,999,NameLen
0,GAURAVGAURAV,0.0,0.0,20
1,ABHISHEK1ABHISHEK1,2.0,0.0,20
2,KRISHNAKRISHNA,4.0,0.0,20
3,ABHISHEK2ABHISHEK2,4.0,0.0,20
4,HARSHITAHARSHITA,2.0,2.0,20
5,JOEYJOEY,4.0,2.0,20
6,SHWETASHWETA,6.0,2.0,20
7,NA-1NA-1,2.0,,20
8,NA-2NA-2,,0.0,20
9,,4.0,0.0,20


In [102]:
# Na Values: isna, fillna, dropna
r = df1.dropna(axis=1)
print(r)


   NameLen
0       10
1       10
2       10
3       10
4       10
5       10
6       10
7       10
8       10
9       10
        Name    A  999  NameLen
0     GAURAV  0.0  0.0       10
1  ABHISHEK1  1.0  0.0       10
2    KRISHNA  2.0  0.0       10
3  ABHISHEK2  2.0  0.0       10
4   HARSHITA  1.0  1.0       10
5       JOEY  2.0  1.0       10
6     SHWETA  3.0  1.0       10
7       NA-1  1.0  NaN       10
8       NA-2  NaN  0.0       10
9        NaN  2.0  0.0       10


In [105]:
df1.fillna({'Name':"ABCD", 'A':df1.A.mean()}, axis=0)

Unnamed: 0,Name,A,999,NameLen
0,GAURAV,0.0,0.0,10
1,ABHISHEK1,1.0,0.0,10
2,KRISHNA,2.0,0.0,10
3,ABHISHEK2,2.0,0.0,10
4,HARSHITA,1.0,1.0,10
5,JOEY,2.0,1.0,10
6,SHWETA,3.0,1.0,10
7,NA-1,1.0,,10
8,NA-2,1.555556,0.0,10
9,ABCD,2.0,0.0,10


In [113]:
df1.Name.fillna("AB", inplace=True)

In [114]:
print(df1)

        Name    A  999  NameLen
0     GAURAV  0.0  0.0       10
1  ABHISHEK1  1.0  0.0       10
2    KRISHNA  2.0  0.0       10
3  ABHISHEK2  2.0  0.0       10
4   HARSHITA  1.0  1.0       10
5       JOEY  2.0  1.0       10
6     SHWETA  3.0  1.0       10
7       NA-1  1.0  0.0       10
8       NA-2  2.0  0.0       10
9         AB  2.0  0.0       10


In [107]:
df1.fillna(method='ffill', axis=0)

Unnamed: 0,Name,A,999,NameLen
0,GAURAV,0.0,0.0,10
1,ABHISHEK1,1.0,0.0,10
2,KRISHNA,2.0,0.0,10
3,ABHISHEK2,2.0,0.0,10
4,HARSHITA,1.0,1.0,10
5,JOEY,2.0,1.0,10
6,SHWETA,3.0,1.0,10
7,NA-1,1.0,1.0,10
8,NA-2,1.0,0.0,10
9,NA-2,2.0,0.0,10


In [110]:
df1.fillna(method='bfill', axis=0, inplace=True)

In [111]:
print(df1)

        Name    A  999  NameLen
0     GAURAV  0.0  0.0       10
1  ABHISHEK1  1.0  0.0       10
2    KRISHNA  2.0  0.0       10
3  ABHISHEK2  2.0  0.0       10
4   HARSHITA  1.0  1.0       10
5       JOEY  2.0  1.0       10
6     SHWETA  3.0  1.0       10
7       NA-1  1.0  0.0       10
8       NA-2  2.0  0.0       10
9        NaN  2.0  0.0       10


In [115]:
print(df1.isna())

    Name      A    999  NameLen
0  False  False  False    False
1  False  False  False    False
2  False  False  False    False
3  False  False  False    False
4  False  False  False    False
5  False  False  False    False
6  False  False  False    False
7  False  False  False    False
8  False  False  False    False
9  False  False  False    False


In [122]:
df3 = pd.DataFrame(
    {
        "Name":   ["Gaurav", "Abhishek1", "Krishna", "Abhishek2", "Harshita", "Joey", "Shweta", "na-1" , "na-2"  , np.nan],
        "Age":    [ 0      , 1          , 2        , 2          , 1         , 2     , 3       , 1      ,  np.nan , 2     ],
        "Gender": [ 0      , 0          , 0        , 0          , 1         , 1     , 1       , np.nan ,  0      , 0     ]
    }
)
df4 = pd.DataFrame([("Gaurav",1), ("Abhiskek",2), ("Krishna",3), ("Abhishek2",4)],
                    index=[0,11,2,3], 
                    columns=["Name", "Number"])

In [121]:
# any, all
print(df3)

print()
r = df3.isna()
print(r.any())

print()
print(r.all())

print()
print(df3.count())

print()
print(r.sum()) # df3.isna().sum()

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0

Name      True
Age       True
Gender    True
dtype: bool

Name      False
Age       False
Gender    False
dtype: bool

Name      9
Age       9
Gender    9
dtype: int64

Name      1
Age       1
Gender    1
dtype: int64


In [127]:
print(df4)
r = df4.Number > 0
print()
print(r)
print()
print(r.any())
print(r.all())

print()
r = df4.Number %2 == 0
print()
print(r)
print()
print(r.any())
print(r.all())

         Name  Number
0      Gaurav       1
11   Abhiskek       2
2     Krishna       3
3   Abhishek2       4

0     True
11    True
2     True
3     True
Name: Number, dtype: bool

True
True


0     False
11     True
2     False
3      True
Name: Number, dtype: bool

True
False


In [130]:
# ordering data, sort_values
df1.sort_values(by=['A'])

Unnamed: 0,Name,A,999,NameLen
0,GAURAV,0.0,0.0,10
1,ABHISHEK1,1.0,0.0,10
4,HARSHITA,1.0,1.0,10
7,NA-1,1.0,0.0,10
2,KRISHNA,2.0,0.0,10
3,ABHISHEK2,2.0,0.0,10
5,JOEY,2.0,1.0,10
8,NA-2,2.0,0.0,10
9,AB,2.0,0.0,10
6,SHWETA,3.0,1.0,10


In [131]:

df1.sort_values(by=['A', "Name"])

Unnamed: 0,Name,A,999,NameLen
0,GAURAV,0.0,0.0,10
1,ABHISHEK1,1.0,0.0,10
4,HARSHITA,1.0,1.0,10
7,NA-1,1.0,0.0,10
9,AB,2.0,0.0,10
3,ABHISHEK2,2.0,0.0,10
5,JOEY,2.0,1.0,10
2,KRISHNA,2.0,0.0,10
8,NA-2,2.0,0.0,10
6,SHWETA,3.0,1.0,10


In [135]:
print(df4)
print(df4.sort_index(ascending=False))

         Name  Number
0      Gaurav       1
11   Abhiskek       2
2     Krishna       3
3   Abhishek2       4
         Name  Number
11   Abhiskek       2
3   Abhishek2       4
2     Krishna       3
0      Gaurav       1


In [142]:
# row / column wise operation: apply
tmp = pd.DataFrame(np.random.randint(10,20,20).reshape(5,4))
print(tmp)
print()
print(tmp.apply(np.mean))

import math
print(tmp.applymap(math.sqrt))

    0   1   2   3
0  17  12  19  18
1  12  16  11  15
2  14  18  18  11
3  11  14  14  17
4  18  16  18  10

0    14.4
1    15.2
2    16.0
3    14.2
dtype: float64
          0         1         2         3
0  4.123106  3.464102  4.358899  4.242641
1  3.464102  4.000000  3.316625  3.872983
2  3.741657  4.242641  4.242641  3.316625
3  3.316625  3.741657  3.741657  4.123106
4  4.242641  4.000000  4.242641  3.162278


In [148]:
s = "abcdae"

s.replace("a", "A")

'AbcdAe'

In [144]:
# str submodule (replace etc)
r = df1.Name.str.replace('A', 'a')
print(df1)
print(r)

        Name    A  999  NameLen
0     GAURAV  0.0  0.0       10
1  ABHISHEK1  1.0  0.0       10
2    KRISHNA  2.0  0.0       10
3  ABHISHEK2  2.0  0.0       10
4   HARSHITA  1.0  1.0       10
5       JOEY  2.0  1.0       10
6     SHWETA  3.0  1.0       10
7       NA-1  1.0  0.0       10
8       NA-2  2.0  0.0       10
9         AB  2.0  0.0       10
0       GaURaV
1    aBHISHEK1
2      KRISHNa
3    aBHISHEK2
4     HaRSHITa
5         JOEY
6       SHWETa
7         Na-1
8         Na-2
9           aB
Name: Name, dtype: object


In [145]:
df1.Name.replace("JOEY", "YYYY")

0       GAURAV
1    ABHISHEK1
2      KRISHNA
3    ABHISHEK2
4     HARSHITA
5         YYYY
6       SHWETA
7         NA-1
8         NA-2
9           AB
Name: Name, dtype: object

In [149]:
df1.Name.replace("A", "a")

0       GAURAV
1    ABHISHEK1
2      KRISHNA
3    ABHISHEK2
4     HARSHITA
5         JOEY
6       SHWETA
7         NA-1
8         NA-2
9           AB
Name: Name, dtype: object

In [146]:
df1.A.replace(1.0, 3333.333)

0       0.000
1    3333.333
2       2.000
3       2.000
4    3333.333
5       2.000
6       3.000
7    3333.333
8       2.000
9       2.000
Name: A, dtype: float64

In [150]:
# load and save data
print()
print(dir(pd))


['BooleanDtype', 'Categorical', 'CategoricalDtype', 'CategoricalIndex', 'DataFrame', 'DateOffset', 'DatetimeIndex', 'DatetimeTZDtype', 'ExcelFile', 'ExcelWriter', 'Float64Index', 'Grouper', 'HDFStore', 'Index', 'IndexSlice', 'Int16Dtype', 'Int32Dtype', 'Int64Dtype', 'Int64Index', 'Int8Dtype', 'Interval', 'IntervalDtype', 'IntervalIndex', 'MultiIndex', 'NA', 'NaT', 'NamedAgg', 'Period', 'PeriodDtype', 'PeriodIndex', 'RangeIndex', 'Series', 'SparseDtype', 'StringDtype', 'Timedelta', 'TimedeltaIndex', 'Timestamp', 'UInt16Dtype', 'UInt32Dtype', 'UInt64Dtype', 'UInt64Index', 'UInt8Dtype', '__builtins__', '__cached__', '__doc__', '__docformat__', '__file__', '__getattr__', '__git_version__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', '_config', '_hashtable', '_is_numpy_dev', '_lib', '_libs', '_np_version_under1p14', '_np_version_under1p15', '_np_version_under1p16', '_np_version_under1p17', '_np_version_under1p18', '_testing', '_tslib', '_typing', '_versi

In [151]:
tmp.to_csv('temp.csv')

In [161]:
r = pd.read_csv('temp.csv', index_col=0, skiprows=3)#,header=list("ABCD"))
print(r)

   14  18  18.1  11
2                  
3  11  14    14  17
4  18  16    18  10


In [None]:
# groups: groupby

In [None]:
# groups and aggregates