<h1>basic data preparation</h1>
<h4>segment 1 : filtering and selecting data</h4>

In [215]:
# import library
import numpy as np
import pandas as pd

from pandas import Series, DataFrame

<h4>selecting and retrieving data</h4>
<p>index value : label or integer index</p>

In [216]:
# sizes shared by the cells below
df_size = 6      # number of row labels and of column labels generated for the demo DataFrame
series_size = 8  # number of elements in the demo Series

# helper that builds the label lists used as Series / DataFrame indexes
def index_generator(index_list, size, name) -> list:
	"""Append ``size`` labels of the form ``'<name> <k>'`` (k = 1 .. size) to ``index_list``.

	The list passed in is extended in place, and that same list object is returned.
	"""
	index_list.extend(name + ' ' + str(position) for position in range(1, size + 1))
	return index_list

# build the label index for the series: 'row 1' .. 'row <series_size>'
label_index = index_generator([], series_size, 'row')

label_index

['row 1', 'row 2', 'row 3', 'row 4', 'row 5', 'row 6', 'row 7', 'row 8']

In [217]:
# Series
# 'series_obj' holds the integers 0 .. series_size - 1, labelled with label_index
series_obj = Series(
	data=np.arange(series_size),
	index=label_index,
)

# show every element of the series
series_obj

row 1    0
row 2    1
row 3    2
row 4    3
row 5    4
row 6    5
row 7    6
row 8    7
dtype: int64

In [218]:
# look elements up by label: a single label gives a scalar,
# a list of labels gives a sub-series in the requested order
print(series_obj.loc['row 6'])
print(series_obj.loc[['row 6', 'row 1']])

5
row 6    5
row 1    0
dtype: int64


In [219]:
# print the elements specified by integer position
# .iloc is the explicit positional accessor; plain series_obj[0] on a Series with
# string labels only works through a deprecated positional fallback
print(series_obj.iloc[0])
print(series_obj.iloc[7])
print(series_obj.iloc[[0, 7]])

0
7
row 1    0
row 8    7
dtype: int64


In [220]:
# DataFrame
# create the row and column labels for a dataframe
df_index_row = []
df_index_row = index_generator(df_index_row, df_size, 'row')
df_index_col = []
df_index_col = index_generator(df_index_col, df_size, 'column')

# create a DataFrame named 'df_obj'
# the data shape is derived from df_size so it always matches the label lists
# (a hard-coded 6 x 6 array stops fitting the labels as soon as df_size changes)
df_obj = DataFrame(
	data=np.random.rand(df_size * df_size).reshape(df_size, df_size),
	index=df_index_row,
	columns=df_index_col,
)

# display the whole dataframe
df_obj

Unnamed: 0,column 1,column 2,column 3,column 4,column 5,column 6
row 1,0.811191,0.828801,0.735573,0.734083,0.22932,0.801979
row 2,0.78227,0.914135,0.270392,0.490882,0.867356,0.687255
row 3,0.655433,0.323452,0.132734,0.767855,0.691631,0.131161
row 4,0.693211,0.016726,0.034111,0.852397,0.890409,0.059667
row 5,0.737218,0.115858,0.880181,0.617847,0.152625,0.915049
row 6,0.200163,0.218864,0.375681,0.647134,0.972965,0.324497


In [221]:
# look values up by label with .loc
# one (row, column) pair gives a single value
print('[1, 1]')
print(df_obj.loc[('row 1', 'column 1')])
print('\n')
# a list of row labels gives those rows with every column
print('entire row 1 and 5')
df_obj.loc[['row 1', 'row 5'], :]

[1, 1]
0.8111912227430844


entire row 1 and 5


Unnamed: 0,column 1,column 2,column 3,column 4,column 5,column 6
row 1,0.811191,0.828801,0.735573,0.734083,0.22932,0.801979
row 5,0.737218,0.115858,0.880181,0.617847,0.152625,0.915049


<h4>data slicing</h4>
<p>to select and return a slice of several values from a data set<br>
index with colon ':'
</p>

In [222]:
# data slicing with series
# a label slice includes both end points, so this prints 'row 2' through 'row 7';
# the caption now names the rows that are actually selected
print('row 2 - 7')
print(series_obj['row 2':'row 7'])
print('\n')
# a list of labels picks individual, non-contiguous elements
print('row 1, 4, 7')
print(series_obj[['row 1', 'row 4', 'row 7']])

row 1 - 4
row 2    1
row 3    2
row 4    3
row 5    4
row 6    5
row 7    6
dtype: int64


row 1, 4, 7
row 1    0
row 4    3
row 7    6
dtype: int64


#### comparing w/ scalars

In [223]:
# series - element-wise "less than 2" test, giving a boolean series
series_obj.lt(2)

row 1     True
row 2     True
row 3    False
row 4    False
row 5    False
row 6    False
row 7    False
row 8    False
dtype: bool

In [224]:
# dataframe - element-wise "less than 0.2" test, giving a boolean dataframe
df_obj.lt(.2)

Unnamed: 0,column 1,column 2,column 3,column 4,column 5,column 6
row 1,False,False,False,False,False,False
row 2,False,False,False,False,False,False
row 3,False,False,True,False,False,True
row 4,False,True,True,False,False,True
row 5,False,True,False,False,True,False
row 6,False,False,False,False,False,False


#### filter w/ scalars

In [225]:
# series - keep only the elements whose mask value is True
# single condition:    series_obj[series_obj > 2]
# multiple conditions: combine the boolean masks with &
series_obj[series_obj.gt(2) & series_obj.lt(8)]

row 4    3
row 5    4
row 6    5
row 7    6
row 8    7
dtype: int64

In [226]:
# dataframe - a boolean dataframe mask keeps the shape; cells that fail the test show as NaN
# single condition: df_obj[df_obj < .2]
df_obj[df_obj.gt(.2) & df_obj.lt(.8)]

Unnamed: 0,column 1,column 2,column 3,column 4,column 5,column 6
row 1,,,0.735573,0.734083,0.22932,
row 2,0.78227,,0.270392,0.490882,,0.687255
row 3,0.655433,0.323452,,0.767855,0.691631,
row 4,0.693211,,,,,
row 5,0.737218,,,0.617847,,
row 6,0.200163,0.218864,0.375681,0.647134,,0.324497


#### setting values with scalars

In [227]:
# overwrite selected elements of the series with a scalar (changes series_obj in place)
# by integer position instead: series_obj[[0, 2, 4]] = 10
series_obj.loc[['row 1', 'row 3', 'row 5']] = 20
series_obj

row 1    20
row 2     1
row 3    20
row 4     3
row 5    20
row 6     5
row 7     6
row 8     7
dtype: int64

In [228]:
# set df w/ new value (changes df_obj in place)
# pass the row labels as ONE list: a bare 'row 1', 'row 3', 'row 5' key is a tuple,
# which .loc normally reads as one indexer per axis, so the list form is the explicit
# way to say "these three rows, every column"
df_obj.loc[['row 1', 'row 3', 'row 5']] = 20 # to set value to specific elements, identify w/ location always
# df_obj['row 1', 'row 3', 'row 5'] = 20 # to run this, you will get 1 new column
# df_obj[['row 1', 'row 3', 'row 5']] = 20 # to run this, you will get 3 new columns
# df_obj[[0, 2, 4]] = 10 # to run this, you will get 3 new columns
df_obj

Unnamed: 0,column 1,column 2,column 3,column 4,column 5,column 6
row 1,20.0,20.0,20.0,20.0,20.0,20.0
row 2,0.78227,0.914135,0.270392,0.490882,0.867356,0.687255
row 3,20.0,20.0,20.0,20.0,20.0,20.0
row 4,0.693211,0.016726,0.034111,0.852397,0.890409,0.059667
row 5,20.0,20.0,20.0,20.0,20.0,20.0
row 6,0.200163,0.218864,0.375681,0.647134,0.972965,0.324497


<h4>segment 2 : treating missing values<br>
figuring out what data is missing</h4>

In [229]:
# NaN ("not a number") marker; the cells below use it to plant missing entries
missing = np.nan

In [230]:
# Series
# a series of strings with a missing entry at positions 1 and 3
missing_series_obj = Series(
	data=['row 1', missing, 'row 3', missing, 'row 5'],
)
missing_series_obj

0    row 1
1      NaN
2    row 3
3      NaN
4    row 5
dtype: object

In [231]:
# boolean mask: True where the series entry is missing
missing_series_obj.isna()

0    False
1     True
2    False
3     True
4    False
dtype: bool

In [232]:
# DataFrame
# 6 x 6 frame of random values with default integer labels
missing_df_obj = DataFrame(data=np.random.rand(6, 6))
# blank out rows 1-3 of column 2 and rows 2-4 of column 5 (.loc slices include both ends)
missing_df_obj.loc[1:3, 2] = missing
missing_df_obj.loc[2:4, 5] = missing
missing_df_obj

Unnamed: 0,0,1,2,3,4,5
0,0.383228,0.972508,0.488229,0.008052,0.48988,0.100984
1,0.921311,0.376024,,0.873828,0.728943,0.666155
2,0.59304,0.77566,,0.039176,0.479009,
3,0.772358,0.662708,,0.754837,0.511035,
4,0.463479,0.278545,0.467554,0.221994,0.576183,
5,0.286487,0.131011,0.664528,0.501339,0.510629,0.135533


In [233]:
# boolean mask: True where the dataframe cell is missing
missing_df_obj.isna()

Unnamed: 0,0,1,2,3,4,5
0,False,False,False,False,False,False
1,False,False,True,False,False,False
2,False,False,True,False,False,True
3,False,False,True,False,False,True
4,False,False,False,False,False,True
5,False,False,False,False,False,False


<h4>filling in for missing values</h4>

In [234]:
# Series
# replace every missing entry with 0; the result is a new series
filled_series_obj = missing_series_obj.fillna(value=0)
filled_series_obj

0    row 1
1        0
2    row 3
3        0
4    row 5
dtype: object

In [235]:
# forward fill: each NaN takes the value of the closest earlier row
# .ffill() replaces fillna(method='ffill'), whose `method` argument is deprecated
filled_series_obj = missing_series_obj.ffill()
filled_series_obj

0    row 1
1    row 1
2    row 3
3    row 3
4    row 5
dtype: object

In [236]:
# DataFrame
# one fill value for everything: missing_df_obj.fillna(0)
# per-column fill values: the dict maps column label -> replacement;
# a column without NaN (column 0 here) is left as it is
filled_df_obj = missing_df_obj.fillna(value={0: 0.1, 2: 1.5, 5: 2.0})
filled_df_obj

Unnamed: 0,0,1,2,3,4,5
0,0.383228,0.972508,0.488229,0.008052,0.48988,0.100984
1,0.921311,0.376024,1.5,0.873828,0.728943,0.666155
2,0.59304,0.77566,1.5,0.039176,0.479009,2.0
3,0.772358,0.662708,1.5,0.754837,0.511035,2.0
4,0.463479,0.278545,0.467554,0.221994,0.576183,2.0
5,0.286487,0.131011,0.664528,0.501339,0.510629,0.135533


In [237]:
# forward fill: each NaN takes the value from the closest earlier row of the same column
# .ffill() replaces fillna(method='ffill'), whose `method` argument is deprecated
filled_df_obj = missing_df_obj.ffill()
filled_df_obj

Unnamed: 0,0,1,2,3,4,5
0,0.383228,0.972508,0.488229,0.008052,0.48988,0.100984
1,0.921311,0.376024,0.488229,0.873828,0.728943,0.666155
2,0.59304,0.77566,0.488229,0.039176,0.479009,0.666155
3,0.772358,0.662708,0.488229,0.754837,0.511035,0.666155
4,0.463479,0.278545,0.467554,0.221994,0.576183,0.666155
5,0.286487,0.131011,0.664528,0.501339,0.510629,0.135533


<h4>counting missing values</h4>

In [238]:
# Series
# show the data, then count the missing entries (True counts as 1 in the sum)
print(missing_series_obj)
missing_series_obj.isna().sum()

0    row 1
1      NaN
2    row 3
3      NaN
4    row 5
dtype: object


2

In [239]:
# DataFrame
# show the data, then count the missing cells of each column
print(missing_df_obj)
missing_df_obj.isna().sum()

          0         1         2         3         4         5
0  0.383228  0.972508  0.488229  0.008052  0.489880  0.100984
1  0.921311  0.376024       NaN  0.873828  0.728943  0.666155
2  0.593040  0.775660       NaN  0.039176  0.479009       NaN
3  0.772358  0.662708       NaN  0.754837  0.511035       NaN
4  0.463479  0.278545  0.467554  0.221994  0.576183       NaN
5  0.286487  0.131011  0.664528  0.501339  0.510629  0.135533


0    0
1    0
2    3
3    0
4    0
5    3
dtype: int64

<h4>filtering out missing values</h4>

In [240]:
# Series
# keep only the entries that are not missing; the original labels are retained
series_no_nan = missing_series_obj[missing_series_obj.notna()]
series_no_nan

0    row 1
2    row 3
4    row 5
dtype: object

In [241]:
# DataFrame
# drop every row that holds at least one NaN (rows are the default axis)
df_no_nan = missing_df_obj.dropna(axis=0)
df_no_nan

Unnamed: 0,0,1,2,3,4,5
0,0.383228,0.972508,0.488229,0.008052,0.48988,0.100984
5,0.286487,0.131011,0.664528,0.501339,0.510629,0.135533


In [242]:
# drop every column that holds at least one NaN
df_no_nan = missing_df_obj.dropna(axis='columns')
df_no_nan

Unnamed: 0,0,1,3,4
0,0.383228,0.972508,0.008052,0.48988
1,0.921311,0.376024,0.873828,0.728943
2,0.59304,0.77566,0.039176,0.479009
3,0.772358,0.662708,0.754837,0.511035
4,0.463479,0.278545,0.221994,0.576183
5,0.286487,0.131011,0.501339,0.510629


<h4>segment 3 : removing duplicates<br>
removing duplicates</h4>

In [250]:
# DataFrame
# create a dataframe that contains repeated rows

duplicated_df_obj = DataFrame(data={
	'column 1': [1, 1, 2, 2, 3, 3, 3],
	'column 2': ['a', 'a', 'b', 'b', 'c', 'c', 'c'],
	'column 3': ['A', 'A', 'B', 'B', 'C', 'C', 'F'],
})
duplicated_df_obj

Unnamed: 0,column 1,column 2,column 3
0,1,a,A
1,1,a,A
2,2,b,B
3,2,b,B
4,3,c,C
5,3,c,C
6,3,c,F


In [251]:
# flag repeated rows: True marks a row identical to an earlier one (first occurrence stays False)
duplicated_df_obj.duplicated(keep='first')

0    False
1     True
2    False
3     True
4    False
5     True
6    False
dtype: bool

In [252]:
# remove repeated rows, keeping the first occurrence of each
non_dup_df = duplicated_df_obj.drop_duplicates(keep='first')
non_dup_df

Unnamed: 0,column 1,column 2,column 3
0,1,a,A
2,2,b,B
4,3,c,C
6,3,c,F


In [254]:
# remove rows that repeat a value in 'column 2' only; the other columns are not compared
non_dup_df = duplicated_df_obj.drop_duplicates(subset='column 2')
non_dup_df

Unnamed: 0,column 1,column 2,column 3
0,1,a,A
2,2,b,B
4,3,c,C
