In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

# Dealing with Missing Data

In [None]:
# Baseline: a complete integer array with no gaps.
x = np.array([1, 2, 3, 4, 5])

In [None]:
x.sum()

15

In [None]:
print(x.dtype)

int64


In [None]:
# Ad-hoc placeholder: marking the gap with a string such as '--' makes NumPy
# store every element as text, so numeric operations are no longer possible.
x = np.array([1, 2, 3, "--", 5])

In [None]:
print(x.dtype)

<U21


In [None]:
# Python's own marker for "no value" is None; NumPy can only hold it by
# falling back to a generic object array.
x = np.array([1, 2, 3, None, 5])

In [None]:
# Summing an object array that contains None fails because int + None is
# undefined. Catch the error and show its message so the demonstration stays
# visible without stopping a "Restart & Run All".
try:
    x.sum()
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [None]:
# np.nan ("not a number") says a value belongs here but is unknown.
# It is a float, so the array keeps a numeric dtype.
x = np.array([1, 2, 3, np.nan, 5])

In [None]:
# No error this time, but NaN propagates through the addition, so the total
# comes back as nan instead of a usable number.
np.sum(x)

nan

In [None]:
# Workaround: a hand-written boolean index that is False at the missing slot,
# so indexing with it keeps only the valid entries.
x_b = np.array([True, True, True, False, True], dtype=bool)

In [None]:
x[x_b].sum()

11.0

In [None]:
x[x_b].mean()

2.75

In [None]:
# Rather than maintaining a separate boolean index by hand, use NumPy's
# masked-array sub-module: the mask travels with the data, and entries whose
# mask is True are left out of reductions such as sum() and mean().
m_x = np.ma.masked_array(x, mask=[False, False, False, True, False])

In [None]:
m_x.sum()

11.0

In [None]:
m_x.mean()

2.75

# Dealing with Missing Values

In [None]:
# First load with pandas' default settings, to see which gaps are detected
# automatically.
df = pd.read_csv("rooms.csv")

In [None]:
df.head()

Unnamed: 0,Room_Number,Num_Students,Department,Occupied
0,101.0,1.0,Mechanical,Y
1,102.0,,Empty,N
2,103.0,3.0,Electrical,Y
3,104.0,2.0,Mechanical,Y
4,105.0,,Chemical,N


In [None]:
df.dtypes

Room_Number     float64
Num_Students     object
Department       object
Occupied         object
dtype: object

In [None]:
# Baseline timing: sum of 10,000 values stored with a native integer dtype
# (compare with the object-dtype run in the next cell).
%timeit np.arange(10000, dtype="int").sum()

11.4 µs ± 3.06 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [None]:
# Same sum with dtype=object. The recorded run took about 708 µs against
# about 11 µs for the integer array, which is why an object-typed column such
# as Num_Students is worth cleaning into a numeric dtype.
%timeit np.arange(10000, dtype="object").sum()

708 µs ± 177 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [None]:
# Boolean Series: True marks a row whose Room_Number is missing.
df["Room_Number"].isnull()

0    False
1    False
2    False
3    False
4    False
5     True
6    False
7    False
8    False
9    False
Name: Room_Number, dtype: bool

In [None]:
# True counts as 1 when summed, so this is the number of missing room numbers
# (1 for this file).
df["Room_Number"].isnull().sum()

1

In [None]:
df.isnull()

Unnamed: 0,Room_Number,Num_Students,Department,Occupied
0,False,False,False,False
1,False,True,False,False
2,False,False,False,False
3,False,False,False,False
4,False,True,False,False
5,True,False,False,False
6,False,False,False,False
7,False,True,False,False
8,False,False,False,True
9,False,False,False,False


In [None]:
# Number of missing values in each column of the dataset.
df.isnull().sum(axis=0)

Room_Number     1
Num_Students    3
Department      0
Occupied        1
dtype: int64

In [None]:
# Spellings of "not available" that should be read as missing values.
missing_values = [
    "NA",
    "n/a",
    "na",
]

In [None]:
# Reload the file, telling the parser which extra strings mean "missing".
df = pd.read_csv("rooms.csv", na_values=missing_values)

In [None]:
df.isnull()

Unnamed: 0,Room_Number,Num_Students,Department,Occupied
0,False,False,False,False
1,False,True,False,False
2,False,False,False,False
3,False,False,False,False
4,False,True,False,False
5,True,False,False,False
6,False,False,False,False
7,False,True,False,False
8,False,True,False,True
9,False,False,False,False


In [None]:
df.Num_Students.sum()

12.0

In [None]:
df.Num_Students.mean()

2.0

In [None]:
# Extend the marker list with the file's own placeholders: "Empty" and "--".
missing_values = [
    "NA",
    "n/a",
    "na",
    "Empty",
    "--",
]

In [None]:
# Reload so the newly listed placeholders are parsed as NaN as well.
df = pd.read_csv("rooms.csv", na_values=missing_values)

In [None]:
df.isnull()

Unnamed: 0,Room_Number,Num_Students,Department,Occupied
0,False,False,False,False
1,False,True,True,False
2,False,False,False,False
3,False,False,False,False
4,False,True,False,False
5,True,False,False,False
6,False,False,False,True
7,False,True,False,False
8,False,True,False,True
9,False,False,False,False


In [None]:
# unique() returns the distinct values found in the column (NaN included),
# which makes leftover placeholder strings easy to spot.
df["Department"].unique()

array(['Mechanical', nan, 'Electrical', 'Chemical', 'Civil', 'CS'],
      dtype=object)

In [None]:
df.Occupied.unique()

array(['Y', 'N', nan], dtype=object)

In [None]:
df.isnull()

Unnamed: 0,Room_Number,Num_Students,Department,Occupied
0,False,False,False,False
1,False,True,True,False
2,False,False,False,False
3,False,False,False,False
4,False,True,False,False
5,True,False,False,False
6,False,False,False,True
7,False,True,False,False
8,False,True,False,True
9,False,False,False,False


In [None]:
df

Unnamed: 0,Room_Number,Num_Students,Department,Occupied
0,101.0,1.0,Mechanical,Y
1,102.0,,,N
2,103.0,3.0,Electrical,Y
3,104.0,2.0,Mechanical,Y
4,105.0,,Chemical,N
5,,1.0,Electrical,Y
6,107.0,3.0,Civil,
7,108.0,,CS,Y
8,109.0,,Mechanical,
9,110.0,2.0,CS,N


In [None]:
# Final marker list. "NaN" is already one of pandas' default NA strings, so
# listing it here is explicit but redundant.
missing_values = [
    "NA",
    "n/a",
    "na",
    "Empty",
    "--",
    "NaN",
]

In [None]:
# Reload with the final marker list.
df = pd.read_csv("rooms.csv", na_values=missing_values)

In [None]:
df.isnull()

Unnamed: 0,Room_Number,Num_Students,Department,Occupied
0,False,False,False,False
1,False,True,True,False
2,False,False,False,False
3,False,False,False,False
4,False,True,False,False
5,True,False,False,False
6,False,False,False,True
7,False,True,False,False
8,False,True,False,True
9,False,False,False,False


In [None]:
df

Unnamed: 0,Room_Number,Num_Students,Department,Occupied
0,101.0,1.0,Mechanical,Y
1,102.0,,,N
2,103.0,3.0,Electrical,Y
3,104.0,2.0,Mechanical,Y
4,105.0,,Chemical,N
5,,1.0,Electrical,Y
6,107.0,3.0,Civil,
7,108.0,,CS,Y
8,109.0,,Mechanical,
9,110.0,2.0,CS,N


In [None]:
# Treat rooms with no occupancy flag as not occupied ("N").
# Assign the result back instead of calling fillna(inplace=True) on the
# attribute-accessed column: that chained form operates on a Series taken from
# df and is not guaranteed to update df (under Copy-on-Write it never does).
df["Occupied"] = df["Occupied"].fillna("N")

In [None]:
df

Unnamed: 0,Room_Number,Num_Students,Department,Occupied
0,101.0,1.0,Mechanical,Y
1,102.0,,,N
2,103.0,3.0,Electrical,Y
3,104.0,2.0,Mechanical,Y
4,105.0,,Chemical,N
5,,1.0,Electrical,Y
6,107.0,3.0,Civil,
7,108.0,,CS,Y
8,109.0,,Mechanical,
9,110.0,2.0,CS,N


In [None]:
def convert_to_binary(v):
    """Map an occupancy flag to a boolean.

    Parameters
    ----------
    v : object
        A single cell value, expected to be "Y" or "N". Any value other than
        "Y" (including NaN) is treated as not occupied.

    Returns
    -------
    bool
        True only when ``v`` equals "Y", otherwise False.
    """
    return bool(v == "Y")


# Backward-compatible alias for the original, misspelled function name.
covert_to_binary = convert_to_binary

In [None]:
# Turn the Y/N flag into a real boolean column with one vectorised comparison.
# Anything that is not "Y" (including a missing flag) becomes False.
df["Occupied"] = df["Occupied"].eq("Y")

NameError: name 'convert_to_binary' is not defined

In [None]:
df

Unnamed: 0,Room_Number,Num_Students,Department,Occupied
0,101.0,1.0,Mechanical,Y
1,102.0,,,N
2,103.0,3.0,Electrical,Y
3,104.0,2.0,Mechanical,Y
4,105.0,,Chemical,N
5,,1.0,Electrical,Y
6,107.0,3.0,Civil,
7,108.0,,CS,Y
8,109.0,,Mechanical,
9,110.0,2.0,CS,N


In [None]:
# Keep the unfilled Department values in a second column so the effect of the
# fills below can be compared side by side.
df["Dept2"] = df["Department"]

In [None]:
# Forward fill: copy the last seen department into each missing slot.
# Series.ffill() replaces the deprecated fillna(method="ffill") spelling;
# method="pad" was only an alias for the same operation, so one call suffices.
# The result is assigned back because an in-place fill on df.Department is not
# guaranteed to update df.
df["Department"] = df["Department"].ffill()


In [None]:
df

Unnamed: 0,Room_Number,Num_Students,Department,Occupied,Dept2
0,101.0,1.0,Mechanical,Y,Mechanical
1,102.0,,Mechanical,N,
2,103.0,3.0,Electrical,Y,Electrical
3,104.0,2.0,Mechanical,Y,Mechanical
4,105.0,,Chemical,N,Chemical
5,,1.0,Electrical,Y,Electrical
6,107.0,3.0,Civil,,Civil
7,108.0,,CS,Y,CS
8,109.0,,Mechanical,,Mechanical
9,110.0,2.0,CS,N,CS


In [None]:
# Backward fill: take the next valid department for any slot that is still
# missing (for example a gap in the first row, which forward fill cannot
# reach). Series.bfill() replaces the deprecated fillna(method="bfill").
df["Department"] = df["Department"].bfill()


In [None]:
df

Unnamed: 0,Room_Number,Num_Students,Department,Occupied,Dept2
0,101.0,1.0,Mechanical,Y,Mechanical
1,102.0,,Mechanical,N,
2,103.0,3.0,Electrical,Y,Electrical
3,104.0,2.0,Mechanical,Y,Mechanical
4,105.0,,Chemical,N,Chemical
5,,1.0,Electrical,Y,Electrical
6,107.0,3.0,Civil,,Civil
7,108.0,,CS,Y,CS
8,109.0,,Mechanical,,Mechanical
9,110.0,2.0,CS,N,CS


In [None]:
# Impute missing head-counts with the column median.
# Assign the filled Series back; an in-place fill on df.Num_Students is a
# chained operation that is not guaranteed to update df.
df["Num_Students"] = df["Num_Students"].fillna(df["Num_Students"].median())

In [None]:
df

Unnamed: 0,Room_Number,Num_Students,Department,Occupied,Dept2
0,101.0,1.0,Mechanical,Y,Mechanical
1,102.0,2.0,Mechanical,N,
2,103.0,3.0,Electrical,Y,Electrical
3,104.0,2.0,Mechanical,Y,Mechanical
4,105.0,2.0,Chemical,N,Chemical
5,,1.0,Electrical,Y,Electrical
6,107.0,3.0,Civil,,Civil
7,108.0,2.0,CS,Y,CS
8,109.0,2.0,Mechanical,,Mechanical
9,110.0,2.0,CS,N,CS


In [None]:
# Preview only: interpolate() estimates each missing room number from the
# values on either side of it and returns a new Series; df is not changed.
df["Room_Number"].interpolate()

0    101.0
1    102.0
2    103.0
3    104.0
4    105.0
5    106.0
6    107.0
7    108.0
8    109.0
9    110.0
Name: Room_Number, dtype: float64

In [None]:
# Store the interpolated room numbers in the frame. Assigning the result back
# is used instead of interpolate(inplace=True) on df.Room_Number, a chained
# in-place call that is not guaranteed to update df.
df["Room_Number"] = df["Room_Number"].interpolate()

In [None]:
df

Unnamed: 0,Room_Number,Num_Students,Department,Occupied,Dept2
0,101.0,1.0,Mechanical,Y,Mechanical
1,102.0,2.0,Mechanical,N,
2,103.0,3.0,Electrical,Y,Electrical
3,104.0,2.0,Mechanical,Y,Mechanical
4,105.0,2.0,Chemical,N,Chemical
5,106.0,1.0,Electrical,Y,Electrical
6,107.0,3.0,Civil,,Civil
7,108.0,2.0,CS,Y,CS
8,109.0,2.0,Mechanical,,Mechanical
9,110.0,2.0,CS,N,CS
