In [1]:
# all imports go at the top, even though python is a very open language
import numpy as np
import pandas as pd

In [2]:
print(dir(pd))

['BooleanDtype', 'Categorical', 'CategoricalDtype', 'CategoricalIndex', 'DataFrame', 'DateOffset', 'DatetimeIndex', 'DatetimeTZDtype', 'ExcelFile', 'ExcelWriter', 'Float64Index', 'Grouper', 'HDFStore', 'Index', 'IndexSlice', 'Int16Dtype', 'Int32Dtype', 'Int64Dtype', 'Int64Index', 'Int8Dtype', 'Interval', 'IntervalDtype', 'IntervalIndex', 'MultiIndex', 'NA', 'NaT', 'NamedAgg', 'Period', 'PeriodDtype', 'PeriodIndex', 'RangeIndex', 'Series', 'SparseDtype', 'StringDtype', 'Timedelta', 'TimedeltaIndex', 'Timestamp', 'UInt16Dtype', 'UInt32Dtype', 'UInt64Dtype', 'UInt64Index', 'UInt8Dtype', '__builtins__', '__cached__', '__doc__', '__docformat__', '__file__', '__getattr__', '__git_version__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', '_config', '_hashtable', '_is_numpy_dev', '_lib', '_libs', '_np_version_under1p14', '_np_version_under1p15', '_np_version_under1p16', '_np_version_under1p17', '_np_version_under1p18', '_testing', '_tslib', '_typing', '_versio

## Series

In [3]:
# Normal series and series with custom index
# 1-D numpy array
s1 = pd.Series([1,2,3], index=range(3))
s2 = pd.Series([1,2,3, np.nan], index=tuple("abcd"))
print(s1)
print()
print(s2)

0    1
1    2
2    3
dtype: int64

a    1.0
b    2.0
c    3.0
d    NaN
dtype: float64


In [4]:
print(type(s1))
print(s1.index)
print(s1.values, type(s1.values))

<class 'pandas.core.series.Series'>
RangeIndex(start=0, stop=3, step=1)
[1 2 3] <class 'numpy.ndarray'>


In [5]:
# changing dtype: astype
print(s1.dtype)
r = s1.astype("unicode")
print(r, r.dtype)
print(r.values.dtype)

r = s1.astype("float64")
print(r)

print()
print(s1)

print()
r = pd.Series(s1, dtype="<U2")
print(r)

int64
0    1
1    2
2    3
dtype: object object
object
0    1.0
1    2.0
2    3.0
dtype: float64

0    1
1    2
2    3
dtype: int64

0    1
1    2
2    3
dtype: object


In [6]:
# shape, size, dtype, index, values
print(s1.shape)
print(s1.size)
print(s1.dtype)

(3,)
3
int64


In [7]:
# aggregate operations: min, max, unique, sum etc.
print(s1.unique())
print(s2.min())
print(s2.max())

[1 2 3]
1.0
3.0


In [8]:
# arithmetic operations(new copy) and broadcast like numpy
print(s1 * 3) # with scalar
print(s1)


0    3
1    6
2    9
dtype: int64
0    1
1    2
2    3
dtype: int64


In [9]:
print(s1 + s2)

0   NaN
1   NaN
2   NaN
a   NaN
b   NaN
c   NaN
d   NaN
dtype: float64


In [10]:
s3 = pd.Series([10,20,30], index = (1,2,4))
print(s1)
print(s3)
print()
print(s1 + s3)

0    1
1    2
2    3
dtype: int64
1    10
2    20
4    30
dtype: int64

0     NaN
1    12.0
2    23.0
4     NaN
dtype: float64


In [11]:
# indexing and slicing
print(s1)
print()
print(s1[1:])

print()
print(s1[:2])

print()
print(s1[2])

0    1
1    2
2    3
dtype: int64

1    2
2    3
dtype: int64

0    1
1    2
dtype: int64

3


In [12]:
s1[2] = 10
print(s1)

s1[1:] = 20
print(s1)

0     1
1     2
2    10
dtype: int64
0     1
1    20
2    20
dtype: int64


In [13]:
print(s1[:-1])

0     1
1    20
dtype: int64


In [14]:
print(s2[1]) # internal index
print(s2['a']) # user assigned index

2.0
1.0


In [15]:
# use the user assigned index
print(s2.loc['a'])
print(s2.loc['a':'c'])
#print(s2.loc[1])

1.0
a    1.0
b    2.0
c    3.0
dtype: float64


In [16]:
# use the user assigned index
print(s2.iloc[1:3])
#print(s2.iloc['a']

print()
print(s2.iloc[::-1])

print()
print(s2.iloc[-1])

print()
print(s2.iloc[-1:-3:-1])

b    2.0
c    3.0
dtype: float64

d    NaN
c    3.0
b    2.0
a    1.0
dtype: float64

nan

d    NaN
c    3.0
dtype: float64


In [17]:
# result of operations as index
print(s1)

print()
print(s1[ [2,1] ])
idx = [2,-1] # list of indexes to pick
print(s1.iloc[ idx ])

print()
r = s1 %2 == 0
print(s1[r])

0     1
1    20
2    20
dtype: int64

2    20
1    20
dtype: int64
2    20
2    20
dtype: int64

1    20
2    20
dtype: int64


In [18]:
# frequency : value_counts
r = s1.value_counts()
print(r)

print()
s4 = pd.Series([10,20,10,10,30,20])
s4.value_counts()

20    2
1     1
dtype: int64



10    3
20    2
30    1
dtype: int64

## DataFrame

In [19]:
# dataframe from dict or list of rows
df1 = pd.DataFrame(
    {
        "Name":   ["Gaurav", "Abhishek1", "Krishna", "Abhishek2", "Harshita", "Joey", "Shweta", "na-1" , "na-2"  , np.nan],
        "Age":    [ 0      , 1          , 2        , 2          , 1         , 2     , 3       , 1      ,  np.nan , 2     ],
        "Gender": [ 0      , 0          , 0        , 0          , 1         , 1     , 1       , np.nan ,  0      , 0     ]
    }
)
df2 = pd.DataFrame([("Gaurav",1), ("Abhiskek",2), ("Krishna",3), ("Abhishek2",4)],
                    index=[0,11,2,3], 
                    columns=["Name", "Number"])
print(df1)
print()
print(df2)

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0

         Name  Number
0      Gaurav       1
11   Abhiskek       2
2     Krishna       3
3   Abhishek2       4


In [20]:
# shape, size, dtype, index, columns, unique, T
print(df1.shape)
print(df1.size)

print()
print(df1.dtypes, type(df1.dtypes))

print()
print(df1.index)

print()
print(df1.columns)

print()
print(df1.T)

(10, 3)
30

Name       object
Age       float64
Gender    float64
dtype: object <class 'pandas.core.series.Series'>

RangeIndex(start=0, stop=10, step=1)

Index(['Name', 'Age', 'Gender'], dtype='object')

             0          1        2          3         4     5       6     7  \
Name    Gaurav  Abhishek1  Krishna  Abhishek2  Harshita  Joey  Shweta  na-1   
Age          0          1        2          2         1     2       3     1   
Gender       0          0        0          0         1     1       1   NaN   

           8    9  
Name    na-2  NaN  
Age      NaN    2  
Gender     0    0  


In [21]:
print(df1.describe(include='all'))

        Name       Age    Gender
count      9  9.000000  9.000000
unique     9       NaN       NaN
top     na-1       NaN       NaN
freq       1       NaN       NaN
mean     NaN  1.555556  0.333333
std      NaN  0.881917  0.500000
min      NaN  0.000000  0.000000
25%      NaN  1.000000  0.000000
50%      NaN  2.000000  0.000000
75%      NaN  2.000000  1.000000
max      NaN  3.000000  1.000000


In [22]:
['a','b','b', 'a', 'c', 'a']

['a', 'b', 'b', 'a', 'c', 'a']

In [23]:
print(df1.mode(axis=0))

        Name  Age  Gender
0  Abhishek1  2.0     0.0
1  Abhishek2  NaN     NaN
2     Gaurav  NaN     NaN
3   Harshita  NaN     NaN
4       Joey  NaN     NaN
5    Krishna  NaN     NaN
6     Shweta  NaN     NaN
7       na-1  NaN     NaN
8       na-2  NaN     NaN


In [24]:
print(s4)
print(s4.mode())

print()
print(s1)
print(s1.mode())

0    10
1    20
2    10
3    10
4    30
5    20
dtype: int64
0    10
dtype: int64

0     1
1    20
2    20
dtype: int64
0    20
dtype: int64


In [25]:
print(df1.head())  #give me the first five rows

print()
print(df1.head(2)) 

print()
print(df1.head(1)) 

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0

     Name  Age  Gender
0  Gaurav  0.0     0.0


In [26]:
print(df1.tail())  #give me the first five rows

print()
print(df1.tail(2)) 

print()
print(df1.tail(1)) 

     Name  Age  Gender
5    Joey  2.0     1.0
6  Shweta  3.0     1.0
7    na-1  1.0     NaN
8    na-2  NaN     0.0
9     NaN  2.0     0.0

   Name  Age  Gender
8  na-2  NaN     0.0
9   NaN  2.0     0.0

  Name  Age  Gender
9  NaN  2.0     0.0


In [27]:
# Rename index and columns on df2.
# Inplace vs normal operation

r = df1.rename(mapper={"Name": "N"}, axis=1, inplace=True) # column names
print(r)
print()
print(df1.head())

None

           N  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0


In [28]:
r = df1.rename(columns={"N": "Name"}, inplace=True) # column names
print(r)
print()
print(df1.head())

None

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0


In [29]:
r = df1.rename(mapper=lambda x: x*10, axis=0, inplace=True) # row  index
print(r)
print()
print(df1.head())

None

         Name  Age  Gender
0      Gaurav  0.0     0.0
10  Abhishek1  1.0     0.0
20    Krishna  2.0     0.0
30  Abhishek2  2.0     0.0
40   Harshita  1.0     1.0


In [30]:
# Indexing columns directly and via index. 
print(df1["Name"].head(2))

print()
print(df1.Name.head())

0        Gaurav
10    Abhishek1
Name: Name, dtype: object

0        Gaurav
10    Abhishek1
20      Krishna
30    Abhishek2
40     Harshita
Name: Name, dtype: object


In [31]:
df1.replace("Gaurav", "Gaurav-1", inplace=True)
print(df1)

         Name  Age  Gender
0    Gaurav-1  0.0     0.0
10  Abhishek1  1.0     0.0
20    Krishna  2.0     0.0
30  Abhishek2  2.0     0.0
40   Harshita  1.0     1.0
50       Joey  2.0     1.0
60     Shweta  3.0     1.0
70       na-1  1.0     NaN
80       na-2  NaN     0.0
90        NaN  2.0     0.0


In [32]:
df1.Age.replace({1.0:10, 2.0:200}, inplace=True)
print(df1)

         Name    Age  Gender
0    Gaurav-1    0.0     0.0
10  Abhishek1   10.0     0.0
20    Krishna  200.0     0.0
30  Abhishek2  200.0     0.0
40   Harshita   10.0     1.0
50       Joey  200.0     1.0
60     Shweta    3.0     1.0
70       na-1   10.0     NaN
80       na-2    NaN     0.0
90        NaN  200.0     0.0


In [33]:
# Series in a DataFrame
print(type(df1.Name))

<class 'pandas.core.series.Series'>


In [34]:
# describe, top/bottom rows of data
df1.head
df1.tail

<bound method NDFrame.tail of          Name    Age  Gender
0    Gaurav-1    0.0     0.0
10  Abhishek1   10.0     0.0
20    Krishna  200.0     0.0
30  Abhishek2  200.0     0.0
40   Harshita   10.0     1.0
50       Joey  200.0     1.0
60     Shweta    3.0     1.0
70       na-1   10.0     NaN
80       na-2    NaN     0.0
90        NaN  200.0     0.0>

In [35]:
# indexing and slicing
# loc, iloc [rows, cols]

In [36]:
print(df2)

         Name  Number
0      Gaurav       1
11   Abhiskek       2
2     Krishna       3
3   Abhishek2       4


In [37]:
print(df2.iloc[0])

print()
print(df2.iloc[ 0, :]) # row-0 col-all

print()
print(df2.iloc[0,1]) # row-0 col-1

print()
print(df2.iloc[ [1,2] ]) # row-selection

print()
print(df2.iloc[ [1,2], [0,1] ]) #  row and  columnn selection

Name      Gaurav
Number         1
Name: 0, dtype: object

Name      Gaurav
Number         1
Name: 0, dtype: object

1

        Name  Number
11  Abhiskek       2
2    Krishna       3

        Name  Number
11  Abhiskek       2
2    Krishna       3


In [38]:
r1 = df2  ==  2
r2 = df2  == "Krishna"
print(r1)
print(r2)

print()
r3  = r1 | r2
print(r3)

df2[r3]

     Name  Number
0   False   False
11  False    True
2   False   False
3   False   False
     Name  Number
0   False   False
11  False   False
2    True   False
3   False   False

     Name  Number
0   False   False
11  False    True
2    True   False
3   False   False


  res_values = method(rvalues)


Unnamed: 0,Name,Number
0,,
11,,2.0
2,Krishna,
3,,


In [39]:

print()
print(df2.loc[ [11,2] ]) # row-selection

print()
print(df2.loc[11])


print()
print(df2.loc[:, "Name"])

print()
print(df2.loc[11:, "Number"])

print()
print(df2.loc[11, "Name"])


        Name  Number
11  Abhiskek       2
2    Krishna       3

Name      Abhiskek
Number           2
Name: 11, dtype: object

0        Gaurav
11     Abhiskek
2       Krishna
3     Abhishek2
Name: Name, dtype: object

11    2
2     3
3     4
Name: Number, dtype: int64

Abhiskek


In [40]:
print(s4)
s4.name = 'S4'
print(s4)

0    10
1    20
2    10
3    10
4    30
5    20
dtype: int64
0    10
1    20
2    10
3    10
4    30
5    20
Name: S4, dtype: int64


In [41]:
# aggregate operations (with and without axis : row-1, col-0)
# min, max, count, ...
print(df1.shape)
print()
print(df1.count()) # count non-na values

print()
print(df1.count(axis=1)) # count non-na values

(10, 3)

Name      9
Age       9
Gender    9
dtype: int64

0     3
10    3
20    3
30    3
40    3
50    3
60    3
70    2
80    2
90    2
dtype: int64


In [42]:
# arithmetic operations
print(df1.head())
df1.Age = df1.Age + 1

print(df1.head())

         Name    Age  Gender
0    Gaurav-1    0.0     0.0
10  Abhishek1   10.0     0.0
20    Krishna  200.0     0.0
30  Abhishek2  200.0     0.0
40   Harshita   10.0     1.0
         Name    Age  Gender
0    Gaurav-1    1.0     0.0
10  Abhishek1   11.0     0.0
20    Krishna  201.0     0.0
30  Abhishek2  201.0     0.0
40   Harshita   11.0     1.0


In [43]:
# Na Values: isna, fillna, dropna

In [44]:
# any, all
r = df1.isna()
print(r)

     Name    Age  Gender
0   False  False   False
10  False  False   False
20  False  False   False
30  False  False   False
40  False  False   False
50  False  False   False
60  False  False   False
70  False  False    True
80  False   True   False
90   True  False   False


In [45]:
print(r.any())
print()
print(r.any(axis=1))

Name      True
Age       True
Gender    True
dtype: bool

0     False
10    False
20    False
30    False
40    False
50    False
60    False
70     True
80     True
90     True
dtype: bool


In [46]:
print(df1.all(skipna=False))
print(df1)

Name      NaN
Age       201
Gender      0
dtype: object
         Name    Age  Gender
0    Gaurav-1    1.0     0.0
10  Abhishek1   11.0     0.0
20    Krishna  201.0     0.0
30  Abhishek2  201.0     0.0
40   Harshita   11.0     1.0
50       Joey  201.0     1.0
60     Shweta    4.0     1.0
70       na-1   11.0     NaN
80       na-2    NaN     0.0
90        NaN  201.0     0.0


In [47]:
print(df1.all(skipna=True))
print(df1)

Name       True
Age        True
Gender    False
dtype: bool
         Name    Age  Gender
0    Gaurav-1    1.0     0.0
10  Abhishek1   11.0     0.0
20    Krishna  201.0     0.0
30  Abhishek2  201.0     0.0
40   Harshita   11.0     1.0
50       Joey  201.0     1.0
60     Shweta    4.0     1.0
70       na-1   11.0     NaN
80       na-2    NaN     0.0
90        NaN  201.0     0.0


In [48]:
print(1 and 0)
print(1 and 10)

print()
print(1 or 0)
print('' or 10 or 0)
print('' or 0.0)

0
10

1
10
0.0


In [49]:
# ordering data, sort_values
df2.sort_index()

Unnamed: 0,Name,Number
0,Gaurav,1
2,Krishna,3
3,Abhishek2,4
11,Abhiskek,2


In [50]:
df2.sort_values(by='Name')

Unnamed: 0,Name,Number
3,Abhishek2,4
11,Abhiskek,2
0,Gaurav,1
2,Krishna,3


In [51]:
df1.sort_values(by='Name')

Unnamed: 0,Name,Age,Gender
10,Abhishek1,11.0,0.0
30,Abhishek2,201.0,0.0
0,Gaurav-1,1.0,0.0
40,Harshita,11.0,1.0
50,Joey,201.0,1.0
20,Krishna,201.0,0.0
60,Shweta,4.0,1.0
70,na-1,11.0,
80,na-2,,0.0
90,,201.0,0.0


In [52]:
df1.sort_values(by=["Age"])

Unnamed: 0,Name,Age,Gender
0,Gaurav-1,1.0,0.0
60,Shweta,4.0,1.0
10,Abhishek1,11.0,0.0
40,Harshita,11.0,1.0
70,na-1,11.0,
20,Krishna,201.0,0.0
30,Abhishek2,201.0,0.0
50,Joey,201.0,1.0
90,,201.0,0.0
80,na-2,,0.0


In [53]:
df1.sort_values(by=["Age", "Name"])

Unnamed: 0,Name,Age,Gender
0,Gaurav-1,1.0,0.0
60,Shweta,4.0,1.0
10,Abhishek1,11.0,0.0
40,Harshita,11.0,1.0
70,na-1,11.0,
30,Abhishek2,201.0,0.0
50,Joey,201.0,1.0
20,Krishna,201.0,0.0
90,,201.0,0.0
80,na-2,,0.0


In [54]:
df1.sort_values(by=["Age", "Name"], ignore_index=True)

Unnamed: 0,Name,Age,Gender
0,Gaurav-1,1.0,0.0
1,Shweta,4.0,1.0
2,Abhishek1,11.0,0.0
3,Harshita,11.0,1.0
4,na-1,11.0,
5,Abhishek2,201.0,0.0
6,Joey,201.0,1.0
7,Krishna,201.0,0.0
8,,201.0,0.0
9,na-2,,0.0


In [55]:
print(df1)

         Name    Age  Gender
0    Gaurav-1    1.0     0.0
10  Abhishek1   11.0     0.0
20    Krishna  201.0     0.0
30  Abhishek2  201.0     0.0
40   Harshita   11.0     1.0
50       Joey  201.0     1.0
60     Shweta    4.0     1.0
70       na-1   11.0     NaN
80       na-2    NaN     0.0
90        NaN  201.0     0.0


In [56]:
# row / column wise operation: apply
tmp = pd.DataFrame(np.random.randint(10,20,20).reshape(5,4))

In [57]:
print(tmp)

    0   1   2   3
0  14  13  13  19
1  15  10  17  11
2  19  10  10  10
3  14  17  17  11
4  16  13  19  17


In [58]:
print(tmp.apply(max)) # column

0    19
1    17
2    19
3    19
dtype: int64


In [59]:
print(tmp.apply(lambda x: np.sqrt(x)))

          0         1         2         3
0  3.741657  3.605551  3.605551  4.358899
1  3.872983  3.162278  4.123106  3.316625
2  4.358899  3.162278  3.162278  3.162278
3  3.741657  4.123106  4.123106  3.316625
4  4.000000  3.605551  4.358899  4.123106


In [60]:
print(tmp.apply(np.sqrt)) # column

          0         1         2         3
0  3.741657  3.605551  3.605551  4.358899
1  3.872983  3.162278  4.123106  3.316625
2  4.358899  3.162278  3.162278  3.162278
3  3.741657  4.123106  4.123106  3.316625
4  4.000000  3.605551  4.358899  4.123106


In [61]:
tmp.iloc[0]

0    14
1    13
2    13
3    19
Name: 0, dtype: int64

In [62]:
tmp.applymap(lambda x: x**2)

Unnamed: 0,0,1,2,3
0,196,169,169,361
1,225,100,289,121
2,361,100,100,100
3,196,289,289,121
4,256,169,361,289


In [63]:
# str submodule (replace etc)
df1.Name.str

<pandas.core.strings.StringMethods at 0x7f045004e1c0>

In [64]:
print(df1.head())

# select Name, Age from Table where Name like  "A%" and Age < 100;

idx = (df1.Name.str.startswith("A")) & (df1.Age < 100)
print(idx)
df1[idx].loc[ :,["Name", "Age"] ]

         Name    Age  Gender
0    Gaurav-1    1.0     0.0
10  Abhishek1   11.0     0.0
20    Krishna  201.0     0.0
30  Abhishek2  201.0     0.0
40   Harshita   11.0     1.0
0     False
10     True
20    False
30    False
40    False
50    False
60    False
70    False
80    False
90    False
dtype: bool


Unnamed: 0,Name,Age
10,Abhishek1,11.0


In [65]:
# load and save data

In [66]:
print(df1)

         Name    Age  Gender
0    Gaurav-1    1.0     0.0
10  Abhishek1   11.0     0.0
20    Krishna  201.0     0.0
30  Abhishek2  201.0     0.0
40   Harshita   11.0     1.0
50       Joey  201.0     1.0
60     Shweta    4.0     1.0
70       na-1   11.0     NaN
80       na-2    NaN     0.0
90        NaN  201.0     0.0


In [67]:
# groups: groupby
grp = df1.groupby(by='Age')
print(grp)

print(grp.groups)


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f0450056760>
{1.0: Int64Index([0], dtype='int64'), 4.0: Int64Index([60], dtype='int64'), 11.0: Int64Index([10, 40, 70], dtype='int64'), 201.0: Int64Index([20, 30, 50, 90], dtype='int64')}


In [68]:
# count non-na values in Gender column, group wise
grp.Gender.count()

Age
1.0      1
4.0      1
11.0     2
201.0    4
Name: Gender, dtype: int64

In [69]:
# count non-na values for each col, group wise
grp.count()

Unnamed: 0_level_0,Name,Gender
Age,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,1,1
4.0,1,1
11.0,3,2
201.0,3,4


In [70]:
# count no of elements/size in each group
grp.size()

Age
1.0      1
4.0      1
11.0     3
201.0    4
dtype: int64

In [71]:
# groups and aggregates

In [72]:
print(df1)

         Name    Age  Gender
0    Gaurav-1    1.0     0.0
10  Abhishek1   11.0     0.0
20    Krishna  201.0     0.0
30  Abhishek2  201.0     0.0
40   Harshita   11.0     1.0
50       Joey  201.0     1.0
60     Shweta    4.0     1.0
70       na-1   11.0     NaN
80       na-2    NaN     0.0
90        NaN  201.0     0.0


In [74]:
r  = df1.dropna()
print(r)

         Name    Age  Gender
0    Gaurav-1    1.0     0.0
10  Abhishek1   11.0     0.0
20    Krishna  201.0     0.0
30  Abhishek2  201.0     0.0
40   Harshita   11.0     1.0
50       Joey  201.0     1.0
60     Shweta    4.0     1.0


In [79]:
r  = df1.dropna(axis=1, thresh=10)
print(r)

Empty DataFrame
Columns: []
Index: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]


In [80]:
r  = df1.dropna(axis=1, thresh=8)
print(r)

         Name    Age  Gender
0    Gaurav-1    1.0     0.0
10  Abhishek1   11.0     0.0
20    Krishna  201.0     0.0
30  Abhishek2  201.0     0.0
40   Harshita   11.0     1.0
50       Joey  201.0     1.0
60     Shweta    4.0     1.0
70       na-1   11.0     NaN
80       na-2    NaN     0.0
90        NaN  201.0     0.0


In [83]:
df1.fillna({"Name":"Temp", "Age":df1.Age.mean()})

Unnamed: 0,Name,Age,Gender
0,Gaurav-1,1.0,0.0
10,Abhishek1,11.0,0.0
20,Krishna,201.0,0.0
30,Abhishek2,201.0,0.0
40,Harshita,11.0,1.0
50,Joey,201.0,1.0
60,Shweta,4.0,1.0
70,na-1,11.0,
80,na-2,93.555556,0.0
90,Temp,201.0,0.0


In [None]:
[1 1 2 NAN NAN NAN 2 3 ]

In [85]:
print(dir(df1))

['Age', 'Gender', 'Name', 'T', '_AXIS_ALIASES', '_AXIS_IALIASES', '_AXIS_LEN', '_AXIS_NAMES', '_AXIS_NUMBERS', '_AXIS_ORDERS', '_AXIS_REVERSED', '__abs__', '__add__', '__and__', '__annotations__', '__array__', '__array_priority__', '__array_wrap__', '__bool__', '__class__', '__contains__', '__copy__', '__deepcopy__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__div__', '__doc__', '__eq__', '__finalize__', '__floordiv__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__iadd__', '__iand__', '__ifloordiv__', '__imod__', '__imul__', '__init__', '__init_subclass__', '__invert__', '__ior__', '__ipow__', '__isub__', '__iter__', '__itruediv__', '__ixor__', '__le__', '__len__', '__lt__', '__matmul__', '__mod__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__nonzero__', '__or__', '__pos__', '__pow__', '__radd__', '__rand__', '__rdiv__', '__reduce__', '__reduce_ex__', '__repr__', '__rfloordiv__', '__rmatmu

In [84]:
df1.to_csv("Temp.csv")

In [86]:
print(dir(pd))

['BooleanDtype', 'Categorical', 'CategoricalDtype', 'CategoricalIndex', 'DataFrame', 'DateOffset', 'DatetimeIndex', 'DatetimeTZDtype', 'ExcelFile', 'ExcelWriter', 'Float64Index', 'Grouper', 'HDFStore', 'Index', 'IndexSlice', 'Int16Dtype', 'Int32Dtype', 'Int64Dtype', 'Int64Index', 'Int8Dtype', 'Interval', 'IntervalDtype', 'IntervalIndex', 'MultiIndex', 'NA', 'NaT', 'NamedAgg', 'Period', 'PeriodDtype', 'PeriodIndex', 'RangeIndex', 'Series', 'SparseDtype', 'StringDtype', 'Timedelta', 'TimedeltaIndex', 'Timestamp', 'UInt16Dtype', 'UInt32Dtype', 'UInt64Dtype', 'UInt64Index', 'UInt8Dtype', '__builtins__', '__cached__', '__doc__', '__docformat__', '__file__', '__getattr__', '__git_version__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', '_config', '_hashtable', '_is_numpy_dev', '_lib', '_libs', '_np_version_under1p14', '_np_version_under1p15', '_np_version_under1p16', '_np_version_under1p17', '_np_version_under1p18', '_testing', '_tslib', '_typing', '_versio

In [92]:
df = pd.read_csv("Temp.csv", index_col="Name")
print(df)

           Unnamed: 0    Age  Gender
Name                                
Gaurav-1            0    1.0     0.0
Abhishek1          10   11.0     0.0
Krishna            20  201.0     0.0
Abhishek2          30  201.0     0.0
Harshita           40   11.0     1.0
Joey               50  201.0     1.0
Shweta             60    4.0     1.0
na-1               70   11.0     NaN
na-2               80    NaN     0.0
NaN                90  201.0     0.0


In [95]:
df = pd.read_csv("Temp.csv", index_col=[0,1])
print(df)
print(df.index)

                Age  Gender
   Name                    
0  Gaurav-1     1.0     0.0
10 Abhishek1   11.0     0.0
20 Krishna    201.0     0.0
30 Abhishek2  201.0     0.0
40 Harshita    11.0     1.0
50 Joey       201.0     1.0
60 Shweta       4.0     1.0
70 na-1        11.0     NaN
80 na-2         NaN     0.0
90 NaN        201.0     0.0
MultiIndex([( 0,  'Gaurav-1'),
            (10, 'Abhishek1'),
            (20,   'Krishna'),
            (30, 'Abhishek2'),
            (40,  'Harshita'),
            (50,      'Joey'),
            (60,    'Shweta'),
            (70,      'na-1'),
            (80,      'na-2'),
            (90,         nan)],
           names=[None, 'Name'])


In [89]:
df = pd.read_csv("Temp.csv")
print(df)

   Unnamed: 0       Name    Age  Gender
0           0   Gaurav-1    1.0     0.0
1          10  Abhishek1   11.0     0.0
2          20    Krishna  201.0     0.0
3          30  Abhishek2  201.0     0.0
4          40   Harshita   11.0     1.0
5          50       Joey  201.0     1.0
6          60     Shweta    4.0     1.0
7          70       na-1   11.0     NaN
8          80       na-2    NaN     0.0
9          90        NaN  201.0     0.0
