# Handling Missing Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Small object Series with one missing entry (np.nan) to demonstrate NA detection.
string_data = pd.Series(
    [
        "aardvark",
        "artichoke",
        np.nan,
        "avocado",
    ]
)

In [3]:
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [4]:
string_data.isnull()  # isnull() returns a boolean mask that is True wherever a value is missing

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
# The built-in Python None value is also treated as NA in object arrays.
string_data[0] = None


In [6]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

# Filtering Out Missing Data

In [7]:
from numpy import nan as NA

In [8]:
# Numeric Series with two gaps; the NaN entries make the dtype float64.
data = pd.Series(
    [1, np.nan, 3.5, np.nan, 7],
)

In [9]:
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [10]:
data.dropna()  # returns a new Series without the NaN entries; data itself is not modified

0    1.0
2    3.5
4    7.0
dtype: float64

In [11]:
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [12]:
# Boolean-mask selection of the non-missing entries (same result as data.dropna()).
data.loc[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [13]:
data.notnull()

0     True
1    False
2     True
3    False
4     True
dtype: bool

In [14]:
# DataFrame example: 4 rows x 3 columns with NaNs scattered so that one row is
# complete, one row is entirely missing, and two rows are partially missing.
data1 = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.0],
    ]
)

In [15]:
data1

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [17]:
data1.dropna(how= "all")  # how="all" drops only the rows in which every value is missing

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [18]:
data1.dropna(axis= 1)  # axis=1 drops columns containing any NaN; every column has one here, so only the index is left

0
1
2
3


In [19]:
# Add a new column (label 3) that is entirely NaN; note this modifies data1 in place.
data1[3] = NA

In [20]:
data1

Unnamed: 0,0,1,2,3
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [21]:
data1.dropna(how= "all",axis= 1)  # axis=1: drop the columns that are entirely NA
# For rows, use data1.dropna(how="all", axis=0) instead.
# Passing how="all" only drops rows that are all NA by default; for columns you must specify axis=1.


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [22]:
# 7x3 frame of standard-normal draws.
# NOTE(review): no random seed is set, so the values differ on every run.
df = pd.DataFrame(np.random.standard_normal((7, 3)))

In [23]:
df

Unnamed: 0,0,1,2
0,-0.623166,0.173243,0.300212
1,-0.961052,-0.17012,1.937631
2,-0.871846,-1.554725,1.534997
3,0.175018,-0.784519,-1.390859
4,-0.456819,-0.398264,0.482611
5,0.771615,0.899544,0.674679
6,1.157625,0.165284,1.294486


In [24]:
# Set the first four rows of the column at position 1 to NaN (modifies df in place).
df.iloc[:4,1] = NA

In [25]:
# Set the first two rows of the column at position 2 to NaN (modifies df in place).
df.iloc[:2,2] = NA

In [26]:
df

Unnamed: 0,0,1,2
0,-0.623166,,
1,-0.961052,,
2,-0.871846,,1.534997
3,0.175018,,-1.390859
4,-0.456819,-0.398264,0.482611
5,0.771615,0.899544,0.674679
6,1.157625,0.165284,1.294486


In [27]:
df.dropna(thresh= 3)  # thresh=3 keeps only the rows with at least 3 non-NA values

Unnamed: 0,0,1,2
4,-0.456819,-0.398264,0.482611
5,0.771615,0.899544,0.674679
6,1.157625,0.165284,1.294486


In [28]:
# With axis=1 the threshold applies to columns: keep only columns with at least 4 non-NA values.
df.dropna(thresh= 4,axis = 1)

Unnamed: 0,0,2
0,-0.623166,
1,-0.961052,
2,-0.871846,1.534997
3,0.175018,-1.390859
4,-0.456819,0.482611
5,0.771615,0.674679
6,1.157625,1.294486


In [29]:
df.dropna()

Unnamed: 0,0,1,2
4,-0.456819,-0.398264,0.482611
5,0.771615,0.899544,0.674679
6,1.157625,0.165284,1.294486


In [30]:
# Only columns 1 and 2 are inspected for NA when deciding which rows to drop.
df.dropna(subset =[1,2])

Unnamed: 0,0,1,2
4,-0.456819,-0.398264,0.482611
5,0.771615,0.899544,0.674679
6,1.157625,0.165284,1.294486


In [31]:
 # Example frame with string and datetime columns; np.nan and pd.NaT mark the missing entries.
 df1 = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
                    "toy": [np.nan, 'Batmobile', 'Bullwhip'],
                   "born": [pd.NaT, pd.Timestamp("1940-04-25"),
                            pd.NaT]})


In [32]:
df1

Unnamed: 0,name,toy,born
0,Alfred,,NaT
1,Batman,Batmobile,1940-04-25
2,Catwoman,Bullwhip,NaT


In [33]:
df1.dropna()

Unnamed: 0,name,toy,born
1,Batman,Batmobile,1940-04-25


In [34]:
# Drop the rows that have a missing value in either the "toy" or the "born" column.
df1.dropna(subset=["toy","born"])

Unnamed: 0,name,toy,born
1,Batman,Batmobile,1940-04-25


# Filling In Missing Data

In [35]:
df.fillna(0)  # replace NaN values with 0; returns a new frame, df itself is unchanged

Unnamed: 0,0,1,2
0,-0.623166,0.0,0.0
1,-0.961052,0.0,0.0
2,-0.871846,0.0,1.534997
3,0.175018,0.0,-1.390859
4,-0.456819,-0.398264,0.482611
5,0.771615,0.899544,0.674679
6,1.157625,0.165284,1.294486


In [37]:
df

Unnamed: 0,0,1,2
0,-0.623166,,
1,-0.961052,,
2,-0.871846,,1.534997
3,0.175018,,-1.390859
4,-0.456819,-0.398264,0.482611
5,0.771615,0.899544,0.674679
6,1.157625,0.165284,1.294486


In [36]:
# Forward fill: propagate the last valid value in each column down into the
# NaNs that follow it. NaNs at the top of a column have nothing above them to
# copy from, so they stay NaN.
# DataFrame.ffill() replaces fillna(method="ffill"); the `method` argument of
# fillna is deprecated since pandas 2.1.
df.ffill()

Unnamed: 0,0,1,2
0,-0.623166,,
1,-0.961052,,
2,-0.871846,,1.534997
3,0.175018,,-1.390859
4,-0.456819,-0.398264,0.482611
5,0.771615,0.899544,0.674679
6,1.157625,0.165284,1.294486


In [38]:
# Backward fill: each NaN takes the next valid value below it in the same column.
# DataFrame.bfill() replaces fillna(method="bfill"); the `method` argument of
# fillna is deprecated since pandas 2.1.
df.bfill()

Unnamed: 0,0,1,2
0,-0.623166,-0.398264,1.534997
1,-0.961052,-0.398264,1.534997
2,-0.871846,-0.398264,1.534997
3,0.175018,-0.398264,-1.390859
4,-0.456819,-0.398264,0.482611
5,0.771615,0.899544,0.674679
6,1.157625,0.165284,1.294486


In [39]:
# Per-column fill values: keys are column labels, values are the replacements.
values = {1:4,2:3}
df.fillna(value=values)  # NaNs in column 1 become 4 and NaNs in column 2 become 3

Unnamed: 0,0,1,2
0,-0.623166,4.0,3.0
1,-0.961052,4.0,3.0
2,-0.871846,4.0,1.534997
3,0.175018,4.0,-1.390859
4,-0.456819,-0.398264,0.482611
5,0.771615,0.899544,0.674679
6,1.157625,0.165284,1.294486


In [40]:
df

Unnamed: 0,0,1,2
0,-0.623166,,
1,-0.961052,,
2,-0.871846,,1.534997
3,0.175018,,-1.390859
4,-0.456819,-0.398264,0.482611
5,0.771615,0.899544,0.674679
6,1.157625,0.165284,1.294486


In [41]:
 df.fillna(value=values, limit=2)  # limit=2: fill at most the first two NaNs in each column


Unnamed: 0,0,1,2
0,-0.623166,4.0,3.0
1,-0.961052,4.0,3.0
2,-0.871846,,1.534997
3,0.175018,,-1.390859
4,-0.456819,-0.398264,0.482611
5,0.771615,0.899544,0.674679
6,1.157625,0.165284,1.294486


# 7.2: Data Transformation

# Removing Duplicates

In [42]:
# Two-column frame whose last three rows ("two", 4) are exact duplicates of each other.
data2 = pd.DataFrame(
    {
        "k1": ["one", "two", "one", "two", "one", "two", "two", "two"],
        "k2": [1, 1, 2, 3, 3, 4, 4, 4],
    }
)


In [43]:
data2

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4
7,two,4


In [44]:
# Returns a boolean Series denoting duplicate rows, optionally only considering
# certain columns. keep="first" is the default; with keep="last" the last
# occurrence of each duplicated row is left unmarked and the earlier ones are flagged.
data2.duplicated(keep= "last")

0    False
1    False
2    False
3    False
4    False
5     True
6     True
7    False
dtype: bool

In [45]:
# keep=False flags every row that belongs to a duplicated group, including its first and last occurrence.
data2.duplicated(keep=False)

0    False
1    False
2    False
3    False
4    False
5     True
6     True
7     True
dtype: bool

In [46]:
data2.drop_duplicates()  # drops every repeated row, keeping only the first occurrence of each

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [47]:
# Add a "val" column holding the integers 0..7, one per row (modifies data2 in place).
data2["val"] = np.arange(8)

In [48]:
data2


Unnamed: 0,k1,k2,val
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6
7,two,4,7


Suppose we had an additional column of values and wanted to
filter duplicates based only on the 'k1' column.

In [49]:
# De-duplicate on the "k1" column only; the first row seen for each k1 value is kept.
data2.drop_duplicates(subset=["k1"])

Unnamed: 0,k1,k2,val
0,one,1,0
1,two,1,1


In [50]:
# De-duplicate on the "k2" column only; the first row seen for each k2 value is kept.
data2.drop_duplicates(subset=["k2"])

Unnamed: 0,k1,k2,val
0,one,1,0
2,one,2,2
3,two,3,3
5,two,4,5


# Transforming Data Using a Function or Mapping

# Replacing Values:

In [51]:
# Sample float Series; the cells below replace its -999 and -1000 entries.
data3 = pd.Series(
    [1.0, -999.0, 2.0, -999.0, -1000.0, 3.0],
)

In [52]:
data3

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [53]:
# Replace every -999 with NaN; replace() returns a new Series and leaves data3 unchanged.
data3.replace(-999,np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [54]:
data3

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [None]:
# A list of values to replace with a single replacement: both -999 and -1000 become NaN.
data3.replace([-999,-1000],np.nan)

In [None]:
# Two lists are matched position by position: -999 becomes NaN and -1000 becomes 4.
data3.replace([-999,-1000],[np.nan,4])

In [None]:
# Replace the entries equal to 1 or 2 with the next value that is not being
# replaced (backward fill). Series.replace(..., method="bfill") is deprecated
# since pandas 2.1, so blank the matches out with mask() and then bfill().
# NOTE: bfill() would also fill NaNs already present in the Series; data3 has none.
data3.mask(data3.isin([1, 2])).bfill()

In [None]:
# A dict maps old values to new ones: here both 0 and 1 are replaced by 10.
data3.replace({0:10,1:10})

In [55]:
# qcut needs the data to bin (x) and the number of quantiles (q); calling it with
# no arguments raises TypeError and stops a Restart & Run All.
# Here data3 is cut into 4 quantile-based bins.
pd.qcut(data3, q=4)

In [56]:
# Build a 5x4 frame holding 0..19, then draw a random ordering of its row positions.
# NOTE(review): no seed is set, so the permutation changes from run to run.
df = pd.DataFrame(np.arange(20).reshape(5, 4))
sampler = np.random.permutation(len(df))
sampler


array([0, 2, 1, 3, 4])

In [57]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [58]:
# Reorder the rows by position, following the order given in sampler.
df.take(sampler)

Unnamed: 0,0,1,2,3
0,0,1,2,3
2,8,9,10,11
1,4,5,6,7
3,12,13,14,15
4,16,17,18,19


In [63]:
# Draw 4 rows at random.
# NOTE(review): no random_state is passed, so the selected rows differ on each run.
df.sample(n=4)

Unnamed: 0,0,1,2,3
4,16,17,18,19
2,8,9,10,11
0,0,1,2,3
3,12,13,14,15
