In [1]:
import numpy as np
import pandas as pd

In [4]:
# Toy frame in which every column has a different amount of missing data:
# A has one NaN, B has three, C has none, and D is NaN throughout.
d = dict(
    A=[1, 2, 3, np.nan],
    B=[10] + [np.nan] * 3,
    C=[100, 200, 300, 400],
    D=[np.nan] * 4,
)

df = pd.DataFrame(data=d)

In [5]:
# Display the whole frame so the positions of the NaNs are visible.
df

Unnamed: 0,A,B,C,D
0,1.0,10.0,100,
1,2.0,,200,
2,3.0,,300,
3,,,400,


In [6]:
# Boolean mask with the same shape as df: True wherever a value is missing.
df.isnull()

Unnamed: 0,A,B,C,D
0,False,False,False,True
1,False,True,False,True
2,False,True,False,True
3,True,True,False,True


In [7]:
# Summing the mask counts the missing values per column (each True adds 1).
df.isnull().sum()

A    1
B    3
C    0
D    4
dtype: int64

In [8]:
# With no arguments, dropna removes every row that contains at least one NaN.
# Column D is NaN in every row, so no row survives and the result is empty.
# A new frame is returned; df itself is not modified.
df.dropna()

Unnamed: 0,A,B,C,D


In [10]:
# axis=1 drops columns instead of rows: every column holding at least one NaN
# is removed, which leaves only C.
df.dropna(axis=1)

Unnamed: 0,C
0,100
1,200
2,300
3,400


In [11]:
# how='all' drops a column only when every value in it is NaN, so only D goes.
df.dropna(axis=1, how='all')

Unnamed: 0,A,B,C
0,1.0,10.0,100
1,2.0,,200
2,3.0,,300
3,,,400


In [12]:
# thresh=2 keeps a column only if it has at least two non-missing values:
# A (3 values) and C (4 values) stay, B (1 value) and D (0 values) are dropped.
df.dropna(axis=1, thresh=2)

Unnamed: 0,A,C
0,1.0,100
1,2.0,200
2,3.0,300
3,,400


In [13]:
# Replace every NaN with a constant placeholder. The result is a new frame and
# df is left unchanged. Note that a string placeholder leaves the previously
# numeric columns holding a mix of numbers and text.
df.fillna("FILL")

Unnamed: 0,A,B,C,D
0,1.0,10.0,100,FILL
1,2.0,FILL,200,FILL
2,3.0,FILL,300,FILL
3,FILL,FILL,400,FILL


In [14]:
# Fill each column's NaNs with that column's own mean (A -> 2.0, B -> 10.0).
# D has no values to average, so its mean is NaN and the column stays empty.
df.fillna(df.mean())

Unnamed: 0,A,B,C,D
0,1.0,10.0,100,
1,2.0,10.0,200,
2,3.0,10.0,300,
3,2.0,10.0,400,


# Grouping

In [15]:
# Six sales records, one row per employee, each tagged with a company.
d = dict(
    Company=['FB', 'GOOGLE', 'MICROSOFT', 'GOOGLE', 'FB', 'FB'],
    Employee=list('ABCDEF'),
    Sales=list(range(100, 700, 100)),
)

df = pd.DataFrame(data=d)

In [16]:
# Display the newly built sales frame.
df

Unnamed: 0,Company,Employee,Sales
0,FB,A,100
1,GOOGLE,B,200
2,MICROSOFT,C,300
3,GOOGLE,D,400
4,FB,E,500
5,FB,F,600


In [17]:
# Largest single value in the Sales column.
df['Sales'].max()

600

In [18]:
# Smallest single value in the Sales column.
df['Sales'].min()

100

In [19]:
# Arithmetic mean of the Sales column, over all companies.
df['Sales'].mean()

350.0

In [20]:
# Summary statistics for the frame. Only the numeric column (Sales) is
# summarised; the string columns Company and Employee are left out.
df.describe()

Unnamed: 0,Sales
count,6.0
mean,350.0
std,187.082869
min,100.0
25%,225.0
50%,350.0
75%,475.0
max,600.0


In [22]:
# Group the rows by Company; the aggregations in the following cells are
# computed once per company on this grouped object.
grouped_df = df.groupby('Company')

In [23]:
# Average Sales within each company. numeric_only=True is spelled out because
# the frame also has a string column (Employee): older pandas dropped such
# columns silently, but pandas >= 2.0 raises a TypeError when asked to average
# them.
grouped_df.mean(numeric_only=True)

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
FB,400.0
GOOGLE,300.0
MICROSOFT,300.0


In [24]:
# Per-company maximum of every remaining column. Employee is included because
# strings can be ordered (the maximum is the alphabetically last name).
# Each column is reduced on its own, so the Employee shown is not, in general,
# guaranteed to be the person who made the top sale.
grouped_df.max()

Unnamed: 0_level_0,Employee,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,F,600
GOOGLE,D,400
MICROSOFT,C,300


In [25]:
# Full set of summary statistics for Sales, computed separately per company.
# MICROSOFT has a single row, so its standard deviation comes out as NaN.
grouped_df.describe()

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Company,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
FB,3.0,400.0,264.575131,100.0,300.0,500.0,550.0,600.0
GOOGLE,2.0,300.0,141.421356,200.0,250.0,300.0,350.0,400.0
MICROSOFT,1.0,300.0,,300.0,300.0,300.0,300.0,300.0


In [26]:
# Distinct company names, returned as an array.
df['Company'].unique()

array(['FB', 'GOOGLE', 'MICROSOFT'], dtype=object)

In [27]:
# Number of distinct company names.
df['Company'].nunique()

3

# Custom Functions

In [28]:
# Show the frame again as the starting point for the transformations below.
df

Unnamed: 0,Company,Employee,Sales
0,FB,A,100
1,GOOGLE,B,200
2,MICROSOFT,C,300
3,GOOGLE,D,400
4,FB,E,500
5,FB,F,600


In [29]:
def give_bonus(sales, bonus=100):
    """Return a sales figure increased by a flat bonus.

    Parameters
    ----------
    sales : number
        The original sales amount.
    bonus : number, default 100
        Flat amount added to ``sales``. The default reproduces the previously
        hard-coded increment, so ``Series.apply(give_bonus)`` behaves as before.

    Returns
    -------
    number
        ``sales + bonus``.
    """
    return sales + bonus

In [30]:
# Call give_bonus once per Sales value. The result is a new Series; the Sales
# column in df is left unchanged.
df['Sales'].apply(give_bonus)

0    200
1    300
2    400
3    500
4    600
5    700
Name: Sales, dtype: int64

In [32]:
# Same +100 adjustment, written as an inline lambda, and this time stored on
# the frame as a new 'New Sales' column.
df['New Sales'] = df['Sales'].apply(lambda amount: amount + 100)

In [33]:
# Confirm that 'New Sales' was added next to the untouched 'Sales' column.
df

Unnamed: 0,Company,Employee,Sales,New Sales
0,FB,A,100,200
1,GOOGLE,B,200,300
2,MICROSOFT,C,300,400
3,GOOGLE,D,400,500
4,FB,E,500,600
5,FB,F,600,700
