In [2]:
import numpy as np
import pandas as pd

In [7]:
from numpy import nan as NA
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
 [NA, NA, NA], [NA, 6.5, 3.]])
print("Does the data contain missing values\n" , data.isnull())
print("\nAfter drop na :")
data.dropna()

Does the data contain missing values
        0      1      2
0  False  False  False
1  False   True   True
2   True   True   True
3   True  False  False

After drop na :


Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [9]:
# Filling using dict
data.fillna({0: 1, 1: 0, 2: 2})

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.0,2.0
2,1.0,0.0,2.0
3,1.0,6.5,3.0


In [11]:
# Filling data using Mean
data.fillna(data.mean() , inplace=True)
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,6.5,3.0
2,1.0,6.5,3.0
3,1.0,6.5,3.0


In [12]:
# Removing Duplicates
data1 = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
 'k2': [1, 1, 2, 3, 3, 4, 4]})
data1.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [13]:
data1.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [18]:
# Data Transformation
data3 = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
'Pastrami', 'corned beef', 'Bacon',
 'pastrami', 'honey ham', 'nova lox'],
'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
meat_to_animal = {
'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'
}

data3['animal'] = data3['food'].map(lambda x:meat_to_animal[x.lower()])
data3

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [22]:
# Replacing Values
data4 = pd.Series([1., -999., 2., -999., -1000., 3.])
print("Original Data : \n", data4)
print("After Replacing")
data4.replace({-999:0 , 3. : 5})

Original Data : 
 0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64
After Replacing


0       1.0
1       0.0
2       2.0
3       0.0
4   -1000.0
5       5.0
dtype: float64

In [3]:
# Detecting & Removing Outliers
data5 = pd.DataFrame(np.random.randn(1000, 4))
data5.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.005119,0.019044,0.009344,0.016673
std,1.003242,1.022847,1.008042,0.997766
min,-3.034324,-2.965835,-3.373579,-3.459404
25%,-0.65743,-0.685731,-0.649806,-0.70497
50%,0.010572,0.06889,-0.010193,0.018455
75%,0.695364,0.676368,0.626046,0.720594
max,3.347514,3.937702,3.042167,2.980927


In [6]:
data5.head()


Unnamed: 0,0,1,2,3
0,0.767847,1.079909,1.117896,-2.395282
1,0.118231,-0.236407,-1.04822,-1.08469
2,-0.597724,0.244232,-0.133445,-0.908851
3,0.564217,-1.589463,-1.418664,-0.216336
4,1.130037,-1.027857,0.047416,0.157828


In [13]:
data5[(np.abs(data5) > 3).any(1)] # Selects all rows that exceed -3 or 3
data5[(np.abs(data5) > 3).any(1)]  = np.sign(data5)*3 # if positive value then np.sign produces +1 and vice versa
print("----- After -------")
data5.describe()

----- After -------


Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.009844,0.026894,0.00279,0.001615
std,1.045191,1.059529,1.037704,1.048889
min,-3.0,-3.0,-3.0,-3.0
25%,-0.661674,-0.688994,-0.653053,-0.715914
50%,0.010572,0.072695,-0.010193,0.018455
75%,0.706046,0.691511,0.626046,0.726083
max,3.0,3.0,3.0,3.0
