In [1]:
import numpy as np
import pandas as pd

# DataFrame 인덱스 설정 및 제거 set_index(), reset_index() 메서드

In [2]:
# set_index(): discards the existing row index and promotes one of the
# DataFrame's columns to be the index.

# DataFrame.set_index(keys, *, drop = True, append = False, 
#                     inplace = False, verify_integrity = False)

# reset_index(): moves the existing row index back into the DataFrame
# as a column, then renumbers the rows with a fresh RangeIndex.

# DataFrame.reset_index(level = None, *, drop = False, inplace = False,
#                       col_level = 0, col_fill = '',
#                       allow_duplicates = _NoDefault.no_default,
#                       names = None)

In [3]:
# Build a small demo frame: one label column (C1) followed by three
# numeric columns (C2-C4) filled with reproducible random values.
np.random.seed(0)  # fixed seed so the displayed values are reproducible

# Each row of rand(3, 5) becomes one column after the transpose.
# The numeric part is built on its own: stacking it with the letters in a
# single array would coerce every number to a string, leaving C2-C4 as
# text columns instead of float64.
df1 = pd.DataFrame(np.round(np.random.rand(3, 5), 2).T,
                   columns = ["C2", "C3", "C4"])
df1.insert(0, "C1", list('ABCDE'))  # label column goes first, as before
df1

Unnamed: 0,C1,C2,C3,C4
0,A,0.55,0.65,0.79
1,B,0.72,0.44,0.53
2,C,0.6,0.89,0.57
3,D,0.54,0.96,0.93
4,E,0.42,0.38,0.07


In [4]:
# Promote column C1 to be the row index; the result is kept as df2.
df2 = df1.set_index(keys="C1")
df2

Unnamed: 0_level_0,C2,C3,C4
C1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.55,0.65,0.79
B,0.72,0.44,0.53
C,0.6,0.89,0.57
D,0.54,0.96,0.93
E,0.42,0.38,0.07


In [5]:
# Calling set_index again on a frame that already has a column-based index
# replaces that index: the previous index column (C1) is gone from the
# result.
df2.set_index(keys="C2")

Unnamed: 0_level_0,C3,C4
C2,Unnamed: 1_level_1,Unnamed: 2_level_1
0.55,0.65,0.79
0.72,0.44,0.53
0.6,0.89,0.57
0.54,0.96,0.93
0.42,0.38,0.07


In [7]:
print(df2)  # the frame as it was before the reset, for comparison

# reset_index() puts the column that was serving as the index back into
# the DataFrame as its leftmost (first) column, and the row index becomes
# the default integer index.
df2.reset_index(drop=False)

      C2    C3    C4
C1                  
A   0.55  0.65  0.79
B   0.72  0.44  0.53
C    0.6  0.89  0.57
D   0.54  0.96  0.93
E   0.42  0.38  0.07


Unnamed: 0,C1,C2,C3,C4
0,A,0.55,0.65,0.79
1,B,0.72,0.44,0.53
2,C,0.6,0.89,0.57
3,D,0.54,0.96,0.93
4,E,0.42,0.38,0.07


In [9]:
# With drop=True the index column is thrown away rather than being
# inserted back into the frame as a regular column.
df2.reset_index(drop=True)

Unnamed: 0,C2,C3,C4
0,0.55,0.65,0.79
1,0.72,0.44,0.53
2,0.6,0.89,0.57
3,0.54,0.96,0.93
4,0.42,0.38,0.07


# DataFrame dropna() 메서드

In [10]:
# dropna() removes the rows of a DataFrame that contain missing values.
# By default a row is removed as soon as it holds even a single missing
# value.

# NOTE(review): the CSV location is an absolute, machine-specific path;
# adjust it to wherever employees.csv lives before re-running.
employees = pd.read_csv(
    filepath_or_buffer="C:/python/datas/employees.csv",
    parse_dates=["Start Date"],
)
employees

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,,True,Marketing
1,Thomas,Male,1996-03-31,61933.0,True,
2,Maria,Female,NaT,130590.0,False,Finance
3,Jerry,,2005-03-04,138705.0,True,Finance
4,Larry,Male,1998-01-24,101004.0,True,IT
...,...,...,...,...,...,...
996,Phillip,Male,1984-01-31,42392.0,False,Finance
997,Russell,Male,2013-05-20,96914.0,False,Product
998,Larry,Male,2013-04-20,60500.0,False,Business Dev
999,Albert,Male,2012-05-15,129949.0,True,Sales


In [11]:
# Any row holding one or more NaN values is removed.
# In the original run this took the frame from 1001 rows down to 761.
employees.dropna(axis=0, how="any")

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
4,Larry,Male,1998-01-24,101004.0,True,IT
5,Dennis,Male,1987-04-18,115163.0,False,Legal
6,Ruby,Female,1987-08-17,65476.0,True,Product
8,Angela,Female,2005-11-22,95570.0,True,Engineering
9,Frances,Female,2002-08-08,139852.0,True,Business Dev
...,...,...,...,...,...,...
994,George,Male,2013-06-21,98874.0,True,Marketing
996,Phillip,Male,1984-01-31,42392.0,False,Finance
997,Russell,Male,2013-05-20,96914.0,False,Product
998,Larry,Male,2013-04-20,60500.0,False,Business Dev
