<h1>basic data preparation</h1>
<h4>segment 1 : filtering and selecting data</h4>

In [182]:
# import library
import numpy as np
import pandas as pd

from pandas import Series, DataFrame

<h4>selecting and retrieving data</h4>
<p>index value : label or integer index</p>

In [183]:
# sizes of the example objects built in the cells below
series_size: int = 8  # number of elements in the example Series
df_size: int = 6  # number of row labels and of column labels for the example DataFrame

# helper that builds index labels of the form '<name> <n>'
def index_generator(index_list, size, name) -> list:
	"""Append ``size`` numbered labels to ``index_list`` and return it.

	The labels are ``name + ' ' + str(n)`` with n running from 1 to ``size``.
	``index_list`` is extended in place, and the returned object is that
	same list.
	"""
	index_list.extend(name + ' ' + str(n) for n in range(1, size + 1))
	return index_list

# build the label index for the series, starting from a fresh empty list
label_index = index_generator([], series_size, 'row')

# show the generated labels
label_index

['row 1', 'row 2', 'row 3', 'row 4', 'row 5', 'row 6', 'row 7', 'row 8']

In [184]:
# Series
# build 'series_obj': the integers 0 .. series_size - 1, labelled with label_index
series_obj = pd.Series(data=np.arange(series_size), index=label_index)

# show the full series
series_obj

row 1    0
row 2    1
row 3    2
row 4    3
row 5    4
row 6    5
row 7    6
row 8    7
dtype: int64

In [185]:
# look up elements by label with the explicit label indexer:
# a single label gives a scalar, a list of labels gives a sub-series in that order
print(series_obj.loc['row 6'])
print(series_obj.loc[['row 6', 'row 1']])

5
row 6    5
row 1    0
dtype: int64


In [186]:
# print the element specified by integer position
# .iloc is the explicit positional indexer; plain [] with integer keys on a
# label-indexed Series only works through a positional fallback that recent
# pandas versions deprecate
print(series_obj.iloc[0])
print(series_obj.iloc[7])
print(series_obj.iloc[[0, 7]])

0
7
row 1    0
row 8    7
dtype: int64


In [187]:
# DataFrame
# create the row and column label indexes for a dataframe
df_index_row = index_generator([], df_size, 'row')
df_index_col = index_generator([], df_size, 'column')

# create a DataFrame named 'df_obj' filled with random values;
# the data shape is derived from df_size so it always matches the label
# lists above (a hard-coded 6x6 block breaks as soon as df_size changes)
df_obj = DataFrame(
    data=np.random.rand(df_size, df_size),
    index=df_index_row,
    columns=df_index_col,
)

# display the whole dataframe
df_obj

Unnamed: 0,column 1,column 2,column 3,column 4,column 5,column 6
row 1,0.172644,0.880976,0.630778,0.565046,0.102196,0.78114
row 2,0.844953,0.856448,0.787565,0.794914,0.341905,0.299681
row 3,0.266695,0.506362,0.96221,0.292804,0.913965,0.339136
row 4,0.993921,0.966338,0.048696,0.101195,0.32474,0.026012
row 5,0.454886,0.156419,0.282431,0.994324,0.169664,0.468198
row 6,0.333607,0.099506,0.874112,0.347901,0.081145,0.491077


In [188]:
# look up dataframe elements by label
# a (row label, column label) pair gives a single value
print('[1, 1]', df_obj.loc['row 1', 'column 1'], sep='\n')
print('\n')
# a list of row labels gives those entire rows
print('entire row 1 and 5')
df_obj.loc[['row 1', 'row 5']]

[1, 1]
0.1726439237555324


entire row 1 and 5


Unnamed: 0,column 1,column 2,column 3,column 4,column 5,column 6
row 1,0.172644,0.880976,0.630778,0.565046,0.102196,0.78114
row 5,0.454886,0.156419,0.282431,0.994324,0.169664,0.468198


<h4>data slicing</h4>
<p>select and return a slice of several values from a data set<br>
by indexing with a colon ':' (a label slice includes both of its endpoints)
</p>

In [189]:
# data slicing with series
# a label slice includes both endpoints, so this returns 'row 2' through 'row 7';
# the printed header now names the rows that are actually selected
print('row 2 - 7')
print(series_obj['row 2':'row 7'])
print('\n')
# a list of labels picks individual, non-contiguous rows
print('row 1, 4, 7')
print(series_obj[['row 1', 'row 4', 'row 7']])

row 1 - 4
row 2    1
row 3    2
row 4    3
row 5    4
row 6    5
row 7    6
dtype: int64


row 1, 4, 7
row 1    0
row 4    3
row 7    6
dtype: int64


#### comparing w/ scalars

In [190]:
# series - element-wise "less than 2" test; .lt is the method form of `<`
# and gives a boolean series with the same labels
series_obj.lt(2)

row 1     True
row 2     True
row 3    False
row 4    False
row 5    False
row 6    False
row 7    False
row 8    False
dtype: bool

In [191]:
# dataframe - element-wise "less than 0.2" test; .lt is the method form of `<`
# and gives a boolean dataframe with the same row and column labels
df_obj.lt(.2)

Unnamed: 0,column 1,column 2,column 3,column 4,column 5,column 6
row 1,True,False,False,False,True,False
row 2,False,False,False,False,False,False
row 3,False,False,False,False,False,False
row 4,False,False,True,True,False,True
row 5,False,True,False,False,True,False
row 6,False,True,False,False,True,False


#### filter w/ scalars

In [192]:
# series - keep only the elements that satisfy a boolean condition
# single condition, for comparison: series_obj[series_obj > 2]
# multiple conditions: parenthesise each comparison and combine them with &
series_obj[(series_obj > 2) & (series_obj < 8)]

row 4    3
row 5    4
row 6    5
row 7    6
row 8    7
dtype: int64

In [193]:
# dataframe - keep the values that satisfy the condition; the rest show as NaN
# single condition, for comparison: df_obj[df_obj < .2]
df_obj[(df_obj > .2) & (df_obj < .8)]

Unnamed: 0,column 1,column 2,column 3,column 4,column 5,column 6
row 1,,,0.630778,0.565046,,0.78114
row 2,,,0.787565,0.794914,0.341905,0.299681
row 3,0.266695,0.506362,,0.292804,,0.339136
row 4,,,,,0.32474,
row 5,0.454886,,0.282431,,,0.468198
row 6,0.333607,,,0.347901,,0.491077


#### setting values with scalars

In [194]:
# overwrite selected series elements with a scalar, choosing them by label
# positional alternative: series_obj.iloc[[0, 2, 4]] = 10
series_obj.loc[['row 1', 'row 3', 'row 5']] = 20
series_obj

row 1    20
row 2     1
row 3    20
row 4     3
row 5    20
row 6     5
row 7     6
row 8     7
dtype: int64

In [195]:
# set df w/ new value
# pass the row labels as a *list*: inside .loc, comma-separated keys are read
# as one indexer per axis, so a bare 'row 1', 'row 3', 'row 5' hands three
# indexers to a 2-D frame instead of selecting three rows
df_obj.loc[['row 1', 'row 3', 'row 5']] = 20 # to set value to specific elements, identify w/ location always
# df_obj['row 1', 'row 3', 'row 5'] = 20 # to run this, you will get 1 new column
# df_obj[['row 1', 'row 3', 'row 5']] = 20 # to run this, you will get 3 new columns
# df_obj[[0, 2, 4]] = 10 # to run this, you will get 3 new columns
df_obj

Unnamed: 0,column 1,column 2,column 3,column 4,column 5,column 6
row 1,20.0,20.0,20.0,20.0,20.0,20.0
row 2,0.844953,0.856448,0.787565,0.794914,0.341905,0.299681
row 3,20.0,20.0,20.0,20.0,20.0,20.0
row 4,0.993921,0.966338,0.048696,0.101195,0.32474,0.026012
row 5,20.0,20.0,20.0,20.0,20.0,20.0
row 6,0.333607,0.099506,0.874112,0.347901,0.081145,0.491077


<h4>segment 2 : treating missing values<br>
figuring out what data is missing</h4>

In [196]:
# placeholder used below to mark missing entries: NaN ("not a number")
missing: float = np.nan

In [197]:
# Series
# build a series in which every second entry is missing
missing_series_obj = pd.Series(['row 1', missing, 'row 3', missing, 'row 5'])
missing_series_obj

0    row 1
1      NaN
2    row 3
3      NaN
4    row 5
dtype: object

In [198]:
# flag the missing entries: True where the value is NaN (isna is the same check as isnull)
missing_series_obj.isna()

0    False
1     True
2    False
3     True
4    False
dtype: bool

In [199]:
# DataFrame
# build a 6x6 dataframe of random values, then blank out part of two columns
missing_df_obj = pd.DataFrame(np.random.rand(36).reshape(6, 6))
missing_df_obj.loc[1:3, 2] = missing  # rows 1 through 3 of column 2
missing_df_obj.loc[2:4, 5] = missing  # rows 2 through 4 of column 5
missing_df_obj

Unnamed: 0,0,1,2,3,4,5
0,0.40277,0.491169,0.894264,0.982796,0.585815,0.742233
1,0.344386,0.608874,,0.464905,0.001313,0.5387
2,0.973034,0.644816,,0.229654,0.829914,
3,0.770918,0.40298,,0.522905,0.383972,
4,0.84088,0.133336,0.960375,0.052009,0.390694,
5,0.388618,0.178984,0.647139,0.315485,0.437872,0.747649


In [200]:
# flag the missing cells: True where the value is NaN (isna is the same check as isnull)
missing_df_obj.isna()

Unnamed: 0,0,1,2,3,4,5
0,False,False,False,False,False,False
1,False,False,True,False,False,False
2,False,False,True,False,False,True
3,False,False,True,False,False,True
4,False,False,False,False,False,True
5,False,False,False,False,False,False


<h4>filling in for missing values</h4>

In [201]:
# Series
# replace every NaN with 0; the result is a new series and the original keeps its NaNs
filled_series_obj = missing_series_obj.fillna(value=0)
filled_series_obj

0    row 1
1        0
2    row 3
3        0
4    row 5
dtype: object

In [204]:
# DataFrame
# fill NaN with a different value per column: the dict maps column label -> fill value
# (to fill every NaN with one value instead: missing_df_obj.fillna(0))
# a column that has no NaN, such as column 0 here, is left unchanged
filled_df_obj = missing_df_obj.fillna(value={0: 0.1, 2: 1.5, 5: 2.0})
filled_df_obj

Unnamed: 0,0,1,2,3,4,5
0,0.40277,0.491169,0.894264,0.982796,0.585815,0.742233
1,0.344386,0.608874,1.5,0.464905,0.001313,0.5387
2,0.973034,0.644816,1.5,0.229654,0.829914,2.0
3,0.770918,0.40298,1.5,0.522905,0.383972,2.0
4,0.84088,0.133336,0.960375,0.052009,0.390694,2.0
5,0.388618,0.178984,0.647139,0.315485,0.437872,0.747649
