# 라이브러리 불러오기

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl

# Report the installed version of each imported library, one per line,
# in the order numpy, pandas, seaborn, matplotlib.
for lib in (np, pd, sns, mpl):
    print(lib.__version__)

2.2.4
2.2.3
0.13.2
3.10.1


In [4]:
# 샘플 데이터 가져오기

In [5]:
# Load seaborn's "iris" sample dataset and preview its first row.
iris = sns.load_dataset(name="iris")
iris.head(n=1)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa


In [6]:
# Load seaborn's "tips" sample dataset and preview its first row.
tips = sns.load_dataset(name="tips")
tips.head(n=1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2


# 결측치 확인
- 데이터가 비어 있나?

In [7]:
# Count the missing values in each column (isna is the same check as isnull).
iris.isna().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [8]:
# (number of rows, number of columns) of the iris DataFrame.
iris.shape

(150, 5)

In [9]:
# (number of rows, number of columns) of the tips DataFrame.
tips.shape

(244, 7)

In [10]:
# Selecting a single column with one pair of brackets yields a Series.
a = tips['day']
type(a)

pandas.core.series.Series

In [11]:
# The whole table, by contrast, is a DataFrame.
type(tips)

pandas.core.frame.DataFrame

In [12]:
# Number of rows per distinct value of the day column, most frequent first.
tips['day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

# 상위 5개만 보기
- https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.nlargest.html
- 관련 메서드
  - DataFrame.nsmallest
  - DataFrame.sort_values
  - DataFrame.head

In [13]:
# Show the 5 rows with the largest sepal_length, largest first.
# Per the pandas documentation this is equivalent to
# df.sort_values(columns, ascending=False).head(n), but more performant, i.e.
# iris.sort_values("sepal_length", ascending=False).head(5)
iris.nlargest(n=5, columns="sepal_length")

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
131,7.9,3.8,6.4,2.0,virginica
117,7.7,3.8,6.7,2.2,virginica
118,7.7,2.6,6.9,2.3,virginica
122,7.7,2.8,6.7,2.0,virginica
135,7.7,3.0,6.1,2.3,virginica


In [14]:
# Show the 5 rows with the smallest sepal_length, smallest first.
iris.nsmallest(n=5, columns="sepal_length")

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
13,4.3,3.0,1.1,0.1,setosa
8,4.4,2.9,1.4,0.2,setosa
38,4.4,3.0,1.3,0.2,setosa
42,4.4,3.2,1.3,0.2,setosa
41,4.5,2.3,1.3,0.3,setosa


# 필터링
- NumPy의 불리언 인덱싱과 문법 동일
- (pandas 수치 연산은 NumPy 기반)

In [15]:
# Compute the average of the tip column, then keep only the rows whose tip
# is above that average.
mean_tip = tips['tip'].mean()
# Boolean-mask selection, the same idea as NumPy's a[a > 12].
tips.loc[tips['tip'] > mean_tip]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
7,26.88,3.12,Male,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
232,11.61,3.39,Male,No,Sat,Dinner,2
234,15.53,3.00,Male,Yes,Sat,Dinner,2
238,35.83,4.67,Female,No,Sat,Dinner,3
239,29.03,5.92,Male,No,Sat,Dinner,3


In [16]:
# Same above-average filter written as a single expression, limited to the
# first 5 matching rows.
(
    tips
    .loc[tips['tip'] > tips['tip'].mean()]
    .head(5)
)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
7,26.88,3.12,Male,No,Sun,Dinner,4


- smoker가 No인 것만 조회 (같은 방식으로 time이 Dinner, day가 Sat인 경우도 조회; 셀에서는 마지막 식의 결과만 출력됨)

In [27]:
# Three equality filters, one per column. In a notebook only the value of the
# last expression in a cell is rendered, so the first two results are computed
# and then discarded.
tips[tips['smoker'] == 'No']

tips[tips['time'] == 'Dinner']

tips[tips['day'] == 'Sat'] # the original row labels are kept, not renumbered

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
19,20.65,3.35,Male,No,Sat,Dinner,3
20,17.92,4.08,Male,No,Sat,Dinner,2
21,20.29,2.75,Female,No,Sat,Dinner,2
22,15.77,2.23,Female,No,Sat,Dinner,2
23,39.42,7.58,Male,No,Sat,Dinner,4
...,...,...,...,...,...,...,...
238,35.83,4.67,Female,No,Sat,Dinner,3
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2


In [28]:
# Filter Saturday rows and renumber them from 0; drop=True discards the old
# row labels instead of keeping them as a column.
(
    tips[tips['day'] == 'Sat']
    .reset_index(drop=True)
)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,20.65,3.35,Male,No,Sat,Dinner,3
1,17.92,4.08,Male,No,Sat,Dinner,2
2,20.29,2.75,Female,No,Sat,Dinner,2
3,15.77,2.23,Female,No,Sat,Dinner,2
4,39.42,7.58,Male,No,Sat,Dinner,4
...,...,...,...,...,...,...,...
82,35.83,4.67,Female,No,Sat,Dinner,3
83,29.03,5.92,Male,No,Sat,Dinner,3
84,27.18,2.00,Female,Yes,Sat,Dinner,2
85,22.67,2.00,Male,Yes,Sat,Dinner,2


# loc vs iloc
- 코드 비교

## loc

In [34]:
# Boolean-mask form, kept for reference: tips.loc[tips['day'] == 'Sat']
# General form: tips.loc[rows, columns]
# The row slice 0:1 is label-based and includes its end label, so rows 0 and 1
# are both returned; columns are selected by name.
tips.loc[0:1, ['total_bill', 'tip', 'day']]

Unnamed: 0,total_bill,tip,day
0,16.99,1.01,Sun
1,10.34,1.66,Sun


In [33]:
# iloc selects by integer position. The slice 0:1 excludes its end, so only the
# first row is returned; positions 0, 1 and 4 are the total_bill, tip and day columns.
tips.iloc[0:1, [0, 1, 4]]

Unnamed: 0,total_bill,tip,day
0,16.99,1.01,Sun


In [39]:
# Saturday rows, three columns only. reset_index() without drop=True renumbers
# the rows and keeps the previous labels in a new "index" column.
(
    tips
    .loc[tips['day'] == 'Sat', ['total_bill', 'tip', 'day']]
    .reset_index()
)

Unnamed: 0,index,total_bill,tip,day
0,19,20.65,3.35,Sat
1,20,17.92,4.08,Sat
2,21,20.29,2.75,Sat
3,22,15.77,2.23,Sat
4,23,39.42,7.58,Sat
...,...,...,...,...
82,238,35.83,4.67,Sat
83,239,29.03,5.92,Sat
84,240,27.18,2.00,Sat
85,241,22.67,2.00,Sat


In [44]:
# Pattern: tips.loc[<row condition>, :] -- only the row mask changes, and ":"
# keeps every column.
# Here: bills of 11 or less, renumbered, first 5 rows.
(
    tips
    .loc[tips['total_bill'] <= 11, :]
    .reset_index()
    .head(5)
)

Unnamed: 0,index,total_bill,tip,sex,smoker,day,time,size
0,1,10.34,1.66,Male,No,Sun,Dinner,3
1,6,8.77,2.0,Male,No,Sun,Dinner,2
2,10,10.27,1.71,Male,No,Sun,Dinner,2
3,16,10.33,1.67,Female,No,Sun,Dinner,3
4,30,9.55,1.45,Male,No,Sat,Dinner,2


In [45]:
# Keep only the rows whose time is Dinner, using .loc; renumber and show the
# first 5 rows.
(
    tips
    .loc[tips['time'] == 'Dinner', :]
    .reset_index()
    .head(5)
)

Unnamed: 0,index,total_bill,tip,sex,smoker,day,time,size
0,0,16.99,1.01,Female,No,Sun,Dinner,2
1,1,10.34,1.66,Male,No,Sun,Dinner,3
2,2,21.01,3.5,Male,No,Sun,Dinner,3
3,3,23.68,3.31,Male,No,Sun,Dinner,2
4,4,24.59,3.61,Female,No,Sun,Dinner,4


In [67]:
# Keep rows where time is Dinner AND total_bill is at most 11.
# Form 1: chain two .loc filters. Its result is not rendered because it is not
# the last expression in the cell.
tips.loc[tips['total_bill'] <= 11, : ].loc[tips['time'] == 'Dinner', : ]
# Form 2: combine both masks with & inside a single .loc call. Each comparison
# is parenthesized because & binds more tightly than == and <=.
tips.loc[(tips['time'] == 'Dinner') & (tips['total_bill'] <= 11), :]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,10.34,1.66,Male,No,Sun,Dinner,3
6,8.77,2.0,Male,No,Sun,Dinner,2
10,10.27,1.71,Male,No,Sun,Dinner,2
16,10.33,1.67,Female,No,Sun,Dinner,3
30,9.55,1.45,Male,No,Sat,Dinner,2
43,9.68,1.32,Male,No,Sun,Dinner,2
51,10.29,2.6,Female,No,Sun,Dinner,2
53,9.94,1.56,Male,No,Sun,Dinner,2
67,3.07,1.0,Female,Yes,Sat,Dinner,1
75,10.51,1.25,Male,No,Sat,Dinner,2


In [71]:
# Display the iris DataFrame.
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [75]:
# Exercise (iris): keep the rows whose species is virginica OR whose
# sepal_length is at least 5, and return only the sepal_length, petal_length
# and species columns, using .loc.
iris.loc[
    (iris['species'] == 'virginica') | (iris['sepal_length'] >= 5),
    ['sepal_length', 'petal_length', 'species'],
]

Unnamed: 0,sepal_length,petal_length,species
0,5.1,1.4,setosa
4,5.0,1.4,setosa
5,5.4,1.7,setosa
7,5.0,1.5,setosa
10,5.4,1.5,setosa
...,...,...,...
145,6.7,5.2,virginica
146,6.3,5.0,virginica
147,6.5,5.2,virginica
148,6.2,5.4,virginica


# 파일 입출력
- csv
- excel

In [5]:
import seaborn as sns
import pandas as pd

# Reload iris and keep only the sepal_length and species columns for export.
iris = sns.load_dataset(name="iris")
result = iris[['sepal_length', 'species']]
result

Unnamed: 0,sepal_length,species
0,5.1,setosa
1,4.9,setosa
2,4.7,setosa
3,4.6,setosa
4,5.0,setosa
...,...,...
145,6.7,virginica
146,6.3,virginica
147,6.5,virginica
148,6.2,virginica


# CSV

In [10]:
# Export the selected columns to CSV.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first (a no-op when it is already there).
import os

os.makedirs("dataset", exist_ok=True)
# index=False leaves the row labels out of the file.
result.to_csv("dataset/iris_result.csv", index=False)

In [11]:
# Read the exported CSV back into a new DataFrame and display it.
iris_df = pd.read_csv("dataset/iris_result.csv")
iris_df

Unnamed: 0,sepal_length,species
0,5.1,setosa
1,4.9,setosa
2,4.7,setosa
3,4.6,setosa
4,5.0,setosa
...,...,...
145,6.7,virginica
146,6.3,virginica
147,6.5,virginica
148,6.2,virginica
