## Appending new rows to DataFrames

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the sample names table; the bare expression on the last line makes the
# notebook display the resulting DataFrame.
names = pd.read_csv("../python_cookbook/data/names.csv")
names

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2


In [4]:
# Add a new row with the .loc indexer: assigning a list to a label that is not
# yet in the index (4 here) creates that row, filling the columns in order.
new_data_list = ["Aria", 1]
names.loc[4] = new_data_list
names

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2
4,Aria,1


In [6]:
# Labels used with .loc do not have to be integers; a string label works too.
names.loc["five"] = ["Zach", 3]
names

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2
4,Aria,1
five,Zach,3


In [7]:
# The new row can also be given as a dictionary keyed by column name, and
# len(names) can be used to pick the next integer label automatically
# (6 here, because the frame already holds six rows).
names.loc[len(names)] = {"Name": "Zayd", "Age": 2}
names

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2
4,Aria,1
five,Zach,3
6,Zayd,2


In [9]:
# A Series can be assigned as a new row in the same way as a dictionary: its
# values are matched to the columns by label, so the key order (Age before
# Name) does not matter.
# NOTE: the recorded output shows Dean at both labels 7 and 8, presumably
# because this cell was executed twice before the output was captured.
names.loc[len(names)] = pd.Series({"Age": 32, "Name": "Dean"})
names

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2
4,Aria,1
five,Zach,3
6,Zayd,2
7,Dean,32
8,Dean,32


In [12]:
# A dictionary carries no index label of its own, so it can only be added as a
# row with ignore_index=True, which replaces the existing index with a fresh
# RangeIndex. DataFrame.append was removed in pandas 2.0, so the row is added
# with pd.concat instead.
# The combined frame is a new object: `names` itself is not modified, which is
# why displaying it still shows only the four original rows.
names = pd.read_csv("../python_cookbook/data/names.csv")
names_with_aria = pd.concat(
    [names, pd.DataFrame([{"Name": "Aria", "Age": 1}])],
    ignore_index=True,
)
names

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2


In [13]:
# Replace the default integer index with country labels (note that the labels
# repeat), then display the relabelled frame.
names.index = ["Canada", "Canada", "USA", "USA"]
names

Unnamed: 0,Name,Age
Canada,Cornelia,70
Canada,Abbas,69
USA,Penelope,4
USA,Niko,2


In [18]:
# When a Series is added as a new row, the value given to its `name` parameter
# becomes that row's index label.
# DataFrame.append was removed in pandas 2.0; wrapping the Series in a one-row
# DataFrame (which keeps the Series name as the row label) and concatenating
# produces the same frame.
s = pd.Series({"Name": "Zach", "Age": 3}, name=len(names))
pd.concat([names, pd.DataFrame([s])])

Unnamed: 0,Name,Age
Canada,Cornelia,70
Canada,Abbas,69
USA,Penelope,4
USA,Niko,2
4,Zach,3


In [20]:
# Several rows can be added at once by bundling the Series objects in a list;
# each Series name becomes the index label of its row.
# DataFrame.append was removed in pandas 2.0, so the list is turned into a
# DataFrame and concatenated onto `names` instead.
s1 = pd.Series({"Name": "Zach", "Age": 3}, name=len(names))
s2 = pd.Series({"Name": "Zayd", "Age": 2}, name="USA")
pd.concat([names, pd.DataFrame([s1, s2])])

Unnamed: 0,Name,Age
Canada,Cornelia,70
Canada,Abbas,69
USA,Penelope,4
USA,Niko,2
4,Zach,3
USA,Zayd,2


In [21]:
# Load the 2016 baseball table used for the remaining examples and display it.
bbal_16 = pd.read_csv("../python_cookbook/data/baseball16.csv")
bbal_16

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,altuvjo01,2016,1,HOU,AL,161,640,108,216,42,...,96.0,30.0,10.0,60,70.0,11.0,7.0,3.0,7.0,15.0
1,bregmal01,2016,1,HOU,AL,49,201,31,53,13,...,34.0,2.0,0.0,15,52.0,0.0,0.0,0.0,1.0,1.0
2,castrja01,2016,1,HOU,AL,113,329,41,69,16,...,32.0,2.0,1.0,45,123.0,0.0,1.0,1.0,0.0,9.0
3,correca01,2016,1,HOU,AL,153,577,76,158,36,...,96.0,13.0,3.0,75,139.0,5.0,5.0,0.0,3.0,12.0
4,gattiev01,2016,1,HOU,AL,128,447,58,112,19,...,72.0,2.0,1.0,43,127.0,6.0,4.0,0.0,5.0,12.0
5,gomezca01,2016,1,HOU,AL,85,295,27,62,16,...,29.0,13.0,2.0,21,100.0,2.0,4.0,3.0,0.0,11.0
6,gonzama01,2016,1,HOU,AL,141,484,55,123,26,...,51.0,12.0,6.0,22,118.0,1.0,5.0,6.0,1.0,16.0
7,gourryu01,2016,1,HOU,AL,36,130,13,34,7,...,15.0,1.0,1.0,5,12.0,0.0,1.0,0.0,1.0,7.0
8,kempto01,2016,1,HOU,AL,59,120,15,26,4,...,7.0,2.0,1.0,14,27.0,0.0,0.0,1.0,1.0,5.0
9,marisja01,2016,1,HOU,AL,118,287,40,60,18,...,21.0,10.0,5.0,16,83.0,0.0,3.0,4.0,1.0,4.0


In [24]:
# Extract a single row of the DataFrame (the first one) and convert it to a
# dictionary mapping column name -> value.
data_dict = bbal_16.iloc[0].to_dict()
data_dict

{'playerID': 'altuvjo01',
 'yearID': 2016,
 'stint': 1,
 'teamID': 'HOU',
 'lgID': 'AL',
 'G': 161,
 'AB': 640,
 'R': 108,
 'H': 216,
 '2B': 42,
 '3B': 5,
 'HR': 24,
 'RBI': 96.0,
 'SB': 30.0,
 'CS': 10.0,
 'BB': 60,
 'SO': 70.0,
 'IBB': 11.0,
 'HBP': 7.0,
 'SH': 3.0,
 'SF': 7.0,
 'GIDP': 15.0}

In [25]:
# Build a blank template row from the sample row: text fields become empty
# strings and every other field becomes NaN.
new_data_dict = {
    k: ("" if isinstance(v, str) else np.nan) for k, v in data_dict.items()
}
# Display the result; without a trailing expression the cell shows no output.
new_data_dict

{'playerID': '',
 'yearID': nan,
 'stint': nan,
 'teamID': '',
 'lgID': '',
 'G': nan,
 'AB': nan,
 'R': nan,
 'H': nan,
 '2B': nan,
 '3B': nan,
 'HR': nan,
 'RBI': nan,
 'SB': nan,
 'CS': nan,
 'BB': nan,
 'SO': nan,
 'IBB': nan,
 'HBP': nan,
 'SH': nan,
 'SF': nan,
 'GIDP': nan}

In [26]:
# Same blank template row, built step by step: start with every field set to
# NaN (key order follows data_dict), then blank out the fields that hold text.
new_data_dict = dict.fromkeys(data_dict, np.nan)
for k, v in data_dict.items():
    if isinstance(v, str):
        new_data_dict[k] = ""
new_data_dict

{'playerID': '',
 'yearID': nan,
 'stint': nan,
 'teamID': '',
 'lgID': '',
 'G': nan,
 'AB': nan,
 'R': nan,
 'H': nan,
 '2B': nan,
 '3B': nan,
 'HR': nan,
 'RBI': nan,
 'SB': nan,
 'CS': nan,
 'BB': nan,
 'SO': nan,
 'IBB': nan,
 'HBP': nan,
 'SH': nan,
 'SF': nan,
 'GIDP': nan}

In [29]:
# Generate 1000 random rows shaped like data_dict: text fields get a random
# letter from 'abcde', all other fields a random integer below 10. Each row is
# a Series whose name continues the integer labels after the end of bbal_16.
random_data = []
for i in range(1000):
    d = {}
    for k, v in data_dict.items():
        d[k] = (
            np.random.choice(list('abcde'))
            if isinstance(v, str)
            else np.random.randint(10)
        )
    random_data.append(pd.Series(d, name=len(bbal_16) + i))
# Show the first generated row as a sanity check.
random_data[0]

playerID    a
yearID      9
stint       7
teamID      d
lgID        c
G           1
AB          8
R           5
H           1
2B          7
3B          7
HR          7
RBI         1
SB          4
CS          4
BB          1
SO          5
IBB         4
HBP         0
SH          2
SF          3
GIDP        9
Name: 16, dtype: object

In [30]:
# 행을 추가할 경우, 반복문 활용은 비효율적이다.
%%timeit
bbal_16_copy = bbal_16.copy()
for row in random_data:
    bbal_16_copy = bbal_16_copy.append(row)

2.86 s ± 17 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [31]:
# 추가할 행 정보를 담고 있는 Series 객체들을 리스트로 묶어서 한번에 전달하는 게 효율적이다.
%%timeit
bbal_16_copy = bbal_16.copy()
bbal_16_copy = bbal_16.append(random_data)

53.2 ms ± 537 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
