<a href="https://colab.research.google.com/github/harita-gr/AI_ML_Practice/blob/main/Data_Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Segment 1 - Filtering & Selecting Data

In [1]:
import numpy as np
import pandas as pd

from pandas import Series, DataFrame

## Selecting and Retrieving data
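You can refer to an element of a Series by its index in two ways:

*   by label index (the name given to the row, e.g. `'row 7'`), or
*   by integer index (the zero-based position of the row, e.g. `6`)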



In [4]:
# Eight consecutive integers (0-7), labelled 'row 1' through 'row 8'.
series_obj = Series(
    np.arange(8),
    index=[f'row {n}' for n in range(1, 9)],
)
series_obj

row 1    0
row 2    1
row 3    2
row 4    3
row 5    4
row 6    5
row 7    6
row 8    7
dtype: int64

In [5]:
# Label index: look up a single value by the name of its row.
series_obj.loc['row 7']

6

In [6]:
# Integer index: look up a single value by its zero-based position
# (position 6 is the seventh element, i.e. 'row 7').
# `.iloc` is used because `series_obj[6]` on a Series with string labels relies
# on a positional fallback that pandas has deprecated; integer keys passed to
# `[]` are meant to be treated as labels.
series_obj.iloc[6]

6

In [8]:
# Select several elements by integer position (here the first and the last).
# `.iloc` makes the positional intent explicit; a list of integers passed to
# plain `[]` on a string-labelled Series depends on deprecated positional fallback.
series_obj.iloc[[0, 7]]

row 1    0
row 8    7
dtype: int64

Demo:
Generate a DataFrame of 36 random numbers

*   arranged in a 6x6 shape
*   with labelled rows and columns


      

In [9]:
# Fix the seed in order to get the same random numbers as generated in the tutorial.
np.random.seed(25)

# 36 random values laid out as 6 rows x 6 columns, with 'row N' / 'col N' labels.
DF_obj = DataFrame(
    np.random.rand(36).reshape(6, 6),
    index=[f'row {n}' for n in range(1, 7)],
    columns=[f'col {n}' for n in range(1, 7)],
)
DF_obj

Unnamed: 0,col 1,col 2,col 3,col 4,col 5,col 6
row 1,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
row 2,0.684969,0.437611,0.556229,0.36708,0.402366,0.113041
row 3,0.447031,0.585445,0.161985,0.520719,0.326051,0.699186
row 4,0.366395,0.836375,0.481343,0.516502,0.383048,0.997541
row 5,0.514244,0.559053,0.03445,0.71993,0.421004,0.436935
row 6,0.281701,0.900274,0.669612,0.456069,0.289804,0.525819


In [10]:
# Retrieve the values at precise row/column labels:
# first the list of row labels, then the list of column labels.
DF_obj.loc[
    ['row 2', 'row 5'],
    ['col 2', 'col 5']
]

Unnamed: 0,col 2,col 5
row 2,0.437611,0.402366
row 5,0.559053,0.421004


## Data Slicing

In [11]:
# Slice by label: both endpoints ('row 3' and 'row 7') are part of the result.
series_obj.loc['row 3':'row 7']

row 3    2
row 4    3
row 5    4
row 6    5
row 7    6
dtype: int64

## Comparing with Scalars

Scalar - single numerical value

In [12]:
# Element-wise comparison against a scalar; returns a DataFrame of boolean values.
# E.g. 'row 1' / 'col 1' holds 0.870124, which is not below 0.2, so that cell is False.
DF_obj.lt(0.2)

Unnamed: 0,col 1,col 2,col 3,col 4,col 5,col 6
row 1,False,False,False,True,False,True
row 2,False,False,False,False,False,True
row 3,False,False,True,False,False,False
row 4,False,False,False,False,False,False
row 5,False,False,True,False,False,False
row 6,False,False,False,False,False,False


## Filtering with Scalars

In [13]:
# Boolean mask: keep only the elements whose value is greater than 6.
series_obj.loc[series_obj.gt(6)]

row 8    7
dtype: int64

## Setting values with scalars

In [19]:
# Assign the scalar 8 to every element selected by label.
# This modifies series_obj itself, so the display below shows the new values.
series_obj.loc[['row 1', 'row 2']] = 8
series_obj

row 1    8
row 2    8
row 3    2
row 4    3
row 5    4
row 6    5
row 7    6
row 8    7
dtype: int64

# Segment 2 - Treating Missing Values

Figure out what data is missing

In [21]:
# NaN is used as the placeholder for a missing entry.
missing = np.nan

# Eight entries; the ones at positions 1 and 6 are deliberately missing.
series_obj_2 = Series([
    'row 1', missing, 'row 3', 'row 4',
    'row 5', 'row 6', missing, 'row 8',
])
series_obj_2

0    row 1
1      NaN
2    row 3
3    row 4
4    row 5
5    row 6
6      NaN
7    row 8
dtype: object

In [22]:
# True wherever the entry is missing, False otherwise.
series_obj_2.isna()

0    False
1     True
2    False
3    False
4    False
5    False
6     True
7    False
dtype: bool

Filling in for missing values

In [23]:
# Re-seed so the 36 random values are reproducible from run to run.
np.random.seed(25)

# 6x6 frame with default integer row and column labels (0-5).
DF_obj_2 = DataFrame(np.reshape(np.random.rand(36), (6, 6)))
DF_obj_2

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.113041
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.699186
3,0.366395,0.836375,0.481343,0.516502,0.383048,0.997541
4,0.514244,0.559053,0.03445,0.71993,0.421004,0.436935
5,0.281701,0.900274,0.669612,0.456069,0.289804,0.525819


In [24]:
# Mark some values as missing. `.loc` selects by label, so the affected rows are
# listed explicitly: rows 3, 4, 5 of column 0 and rows 1, 2, 3, 4 of column 5.
DF_obj_2.loc[[3, 4, 5], 0] = missing
DF_obj_2.loc[[1, 2, 3, 4], 5] = missing
DF_obj_2

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,
2,0.447031,0.585445,0.161985,0.520719,0.326051,
3,,0.836375,0.481343,0.516502,0.383048,
4,,0.559053,0.03445,0.71993,0.421004,
5,,0.900274,0.669612,0.456069,0.289804,0.525819


In [25]:
# Fill every missing value with 0; the result is stored in a new frame.
filled_DF = DF_obj_2.fillna(value=0)
filled_DF

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.0
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.0
3,0.0,0.836375,0.481343,0.516502,0.383048,0.0
4,0.0,0.559053,0.03445,0.71993,0.421004,0.0
5,0.0,0.900274,0.669612,0.456069,0.289804,0.525819


In [28]:
# Use a dictionary to fill missing values per column (column label -> fill value):
# missing values in column 0 become 0.1, those in column 5 become 1.25.
filled_DF = DF_obj_2.fillna(value={0: 0.1, 5: 1.25})
filled_DF

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,1.25
2,0.447031,0.585445,0.161985,0.520719,0.326051,1.25
3,0.1,0.836375,0.481343,0.516502,0.383048,1.25
4,0.1,0.559053,0.03445,0.71993,0.421004,1.25
5,0.1,0.900274,0.669612,0.456069,0.289804,0.525819


In [30]:
# Forward fill: each missing value is replaced by the last non-null value
# above it in the same column.
# `DataFrame.ffill()` is used instead of `fillna(method="ffill")` because the
# `method` argument of `fillna` is deprecated in recent pandas releases.
filled_DF = DF_obj_2.ffill()
filled_DF

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.117376
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.117376
3,0.447031,0.836375,0.481343,0.516502,0.383048,0.117376
4,0.447031,0.559053,0.03445,0.71993,0.421004,0.117376
5,0.447031,0.900274,0.669612,0.456069,0.289804,0.525819


Counting missing values

This is useful as a quick summary statistic: it shows how many values are missing in each column of your DataFrame.

In [31]:
# Build a fresh 6x6 random frame from the same seed.
np.random.seed(25)
DF_obj_3 = DataFrame(np.reshape(np.random.rand(36), (6, 6)))

# Mark rows 3, 4, 5 of column 0 and rows 1, 2, 3, 4 of column 5 as missing
# (`.loc` selects by label, so the rows are listed explicitly).
DF_obj_3.loc[[3, 4, 5], 0] = missing
DF_obj_3.loc[[1, 2, 3, 4], 5] = missing
DF_obj_3

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,
2,0.447031,0.585445,0.161985,0.520719,0.326051,
3,,0.836375,0.481343,0.516502,0.383048,
4,,0.559053,0.03445,0.71993,0.421004,
5,,0.900274,0.669612,0.456069,0.289804,0.525819


In [32]:
# Count the missing values in each column: flag them as booleans, then sum per column.
DF_obj_3.isna().sum()

0    3
1    0
2    0
3    0
4    0
5    4
dtype: int64

Filtering out missing values

In [33]:
# Drop every row that contains at least one missing value.
DF_no_NaN = DF_obj_3.dropna(axis=0)
DF_no_NaN

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376


In [34]:
# Drop every column that contains at least one missing value.
DF_no_NaN = DF_obj_3.dropna(axis='columns')
DF_no_NaN

Unnamed: 0,1,2,3,4
0,0.582277,0.278839,0.185911,0.4111
1,0.437611,0.556229,0.36708,0.402366
2,0.585445,0.161985,0.520719,0.326051
3,0.836375,0.481343,0.516502,0.383048
4,0.559053,0.03445,0.71993,0.421004
5,0.900274,0.669612,0.456069,0.289804
