# 라이브러리 불러오기

In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns 
import matplotlib as mpl

print(np.__version__)
print(pd.__version__)
print(sns.__version__)
print(mpl.__version__)

2.2.4
2.2.3
0.13.2
3.10.1


# 샘플 데이터 가져오기

In [2]:
## iris dataset
iris = sns.load_dataset("iris")
iris.head(1) # show only the first row

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa


In [3]:
## tips dataset
# size       == number of people in the party (not a table count)
# smoker     == whether the party included smokers
# total_bill == total amount of the bill
tips = sns.load_dataset("tips")
tips.head(1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2


# 결측치 확인
- 데이터가 비어있나 확인하는 것

In [4]:
iris.isnull().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [5]:
iris.shape

(150, 5)

In [6]:
tips.shape

(244, 7)

In [7]:
tips['day'] # columns name

0       Sun
1       Sun
2       Sun
3       Sun
4       Sun
       ... 
239     Sat
240     Sat
241     Sat
242     Sat
243    Thur
Name: day, Length: 244, dtype: category
Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']

In [8]:
type(tips['day']) # Series
# Selecting a single column with [] yields a Series rather than a DataFrame.

pandas.core.series.Series

In [9]:
type(iris) # 데이터프레임

pandas.core.frame.DataFrame

In [10]:
tips['day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

## 상위 5개만 보기

In [11]:
# The manual route: sort the numeric column in descending order,
# then take the top 5 rows by indexing.

# That gets verbose; nlargest does both steps in one call.
iris.nlargest(5, "sepal_length")


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
131,7.9,3.8,6.4,2.0,virginica
117,7.7,3.8,6.7,2.2,virginica
118,7.7,2.6,6.9,2.3,virginica
122,7.7,2.8,6.7,2.0,virginica
135,7.7,3.0,6.1,2.3,virginica


#### nlargest
- 관련 메소드
  + DataFrame.nsmallest
  + DataFrame.sort_values
  + DataFrame.head

#### pandas 공식 문서 (pandas.pydata.org)
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.nlargest.html

This method is equivalent to df.sort_values(columns, ascending=False).head(n),
but more performant.

DataFrame.nlargest(n, columns, keep='first') # default

ex) df.nlargest(n, columns, keep='last') # 값이 같은(동점) 행이 있을 때 뒤쪽에 있는 행을 우선 선택 (출력 순서를 뒤집는 옵션이 아님)


# 필터링

In [12]:
## Mean of the tip column
## (used afterwards to extract only the rows above the mean)
## Important
mean_tip = tips['tip'].mean()
mean_tip

# pandas does its numeric work with NumPy, which is why the value is shown as np.float64

np.float64(2.99827868852459)

In [13]:
## Same idea as NumPy boolean masking: a[a > 12]
## Keep only the rows whose tip is greater than the mean
tips[tips['tip'] > mean_tip].head(1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
2,21.01,3.5,Male,No,Sun,Dinner,3


In [14]:
tips[tips['tip'] > tips['tip'].mean()].head(1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
2,21.01,3.5,Male,No,Sun,Dinner,3


smoker가 no인 것만 조회

In [22]:
tips[tips['smoker'] == 'No'].head(1)
# tips[tips['smoker'] == 'Yes']

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2


In [23]:
# Dinner
tips[tips['time'] == 'Dinner'].head(1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2


In [24]:
# Day
tips[tips['day'] == 'Sat'].head(1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
19,20.65,3.35,Male,No,Sat,Dinner,3


In [25]:
tips['day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [27]:
# Cross-check: the number of filtered rows should equal the 'Sat' count reported by value_counts()
tips[tips['day'] == 'Sat'].shape

(87, 7)

In [30]:
## Reset the index
## so that the row labels of the filtered result start from 0 again.
## Filtering keeps the original row labels, so check the index afterwards;
## drop=True discards the old labels instead of adding them as a column.
tips[tips['day'] == 'Sat'].reset_index(drop=True)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,20.65,3.35,Male,No,Sat,Dinner,3
1,17.92,4.08,Male,No,Sat,Dinner,2
2,20.29,2.75,Female,No,Sat,Dinner,2
3,15.77,2.23,Female,No,Sat,Dinner,2
4,39.42,7.58,Male,No,Sat,Dinner,4
...,...,...,...,...,...,...,...
82,35.83,4.67,Female,No,Sat,Dinner,3
83,29.03,5.92,Male,No,Sat,Dinner,3
84,27.18,2.00,Female,Yes,Sat,Dinner,2
85,22.67,2.00,Male,Yes,Sat,Dinner,2


# iloc, loc

 iloc : 정수 위치(0부터 시작하는 순번) 기반 선택 — 슬라이스의 끝 위치는 포함하지 않음
 loc : 라벨(인덱스 라벨, 컬럼명) 또는 불리언 조건 기반 선택 — 슬라이스의 끝 라벨까지 포함

In [34]:
## loc
## tips.loc[row labels, column labels]

tips.loc[0, ['total_bill', 'tip', 'day']] # several columns are passed as a list of labels; one row label gives a Series

total_bill    16.99
tip            1.01
day             Sun
Name: 0, dtype: object

In [33]:
tips.loc[0:1, ['total_bill', 'tip', 'day']] # label slice 0:1 includes the end label, so rows 0 and 1 are returned

Unnamed: 0,total_bill,tip,day
0,16.99,1.01,Sun
1,10.34,1.66,Sun


In [38]:
tips.loc[0:2, ['total_bill', 'tip', 'day']] # label slice 0:2 includes the end label, so rows 0, 1 and 2 are returned

Unnamed: 0,total_bill,tip,day
0,16.99,1.01,Sun
1,10.34,1.66,Sun
2,21.01,3.5,Sun


In [39]:
## iloc
# Positional slice: the end position is excluded, so 0:1 returns only row 0.
tips.iloc[0:1, [0,1,4]] # columns are given by their integer positions

Unnamed: 0,total_bill,tip,day
0,16.99,1.01,Sun


In [37]:
tips.iloc[0:2, [0,1,4]]

Unnamed: 0,total_bill,tip,day
0,16.99,1.01,Sun
1,10.34,1.66,Sun


In [41]:
tips.loc[tips['day'] == 'Sat', ['total_bill', 'tip', 'day']].reset_index(drop=True)

Unnamed: 0,total_bill,tip,day
0,20.65,3.35,Sat
1,17.92,4.08,Sat
2,20.29,2.75,Sat
3,15.77,2.23,Sat
4,39.42,7.58,Sat
...,...,...,...
82,35.83,4.67,Sat
83,29.03,5.92,Sat
84,27.18,2.00,Sat
85,22.67,2.00,Sat


In [43]:
#tips.loc[:,:] # 전체 조회

# tips.loc[행 조건 식,:]

In [53]:
## Keep only the rows whose total_bill is below 12
## (strict `<`: a bill such as 11.5 is kept, so this is not the same as "<= 11")
tips.loc[tips['total_bill'] < 12, :].reset_index(drop=True).head(4)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,10.34,1.66,Male,No,Sun,Dinner,3
1,8.77,2.0,Male,No,Sun,Dinner,2
2,10.27,1.71,Male,No,Sun,Dinner,2
3,10.33,1.67,Female,No,Sun,Dinner,3


In [56]:
## Keep only the rows where time is 'Dinner'
# reset_index() without drop=True keeps the old row labels as an extra 'index' column
tips.loc[tips['time'] == 'Dinner', :].reset_index().head(4)

Unnamed: 0,index,total_bill,tip,sex,smoker,day,time,size
0,0,16.99,1.01,Female,No,Sun,Dinner,2
1,1,10.34,1.66,Male,No,Sun,Dinner,3
2,2,21.01,3.5,Male,No,Sun,Dinner,3
3,3,23.68,3.31,Male,No,Sun,Dinner,2


In [74]:
## Rows where time is 'Dinner' AND total_bill is at most 11
# Python's `and` cannot combine two boolean Series (it raises
# "ValueError: The truth value of a Series is ambiguous"), so this fails:
# tips.loc[tips['time'] == 'Dinner' and tips['total_bill'] <= 11, :] # error

# Workaround used here: apply the two conditions one after the other.
result = tips.loc[tips['time'] == 'Dinner', :]
result.loc[result['total_bill'] <= 11, :]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,10.34,1.66,Male,No,Sun,Dinner,3


In [73]:
# Single-step version: combine boolean Series with `&` and wrap each condition in parentheses.
# NOTE: `< 12` is a looser bound than `<= 11` (it also keeps bills between 11 and 12).
tips.loc[(tips['time'] == 'Dinner') & (tips['total_bill'] < 12), :].reset_index().head(1)

Unnamed: 0,index,total_bill,tip,sex,smoker,day,time,size
0,1,10.34,1.66,Male,No,Sun,Dinner,3


In [85]:
## iris: rows whose species is 'virginica' OR whose sepal_length >= 5
## Keep only the columns sepal_length, petal_length and species
# `|` is the element-wise OR; reset_index() without drop=True keeps the old labels as an 'index' column.

iris.loc[(iris['species'] == 'virginica') | (iris['sepal_length'] >= 5), ['sepal_length', 'petal_length', 'species']].reset_index()

Unnamed: 0,index,sepal_length,petal_length,species
0,0,5.1,1.4,setosa
1,4,5.0,1.4,setosa
2,5,5.4,1.7,setosa
3,7,5.0,1.5,setosa
4,10,5.4,1.5,setosa
...,...,...,...,...
124,145,6.7,5.2,virginica
125,146,6.3,5.0,virginica
126,147,6.5,5.2,virginica
127,148,6.2,5.4,virginica


# 파일 입출력
- excel
- csv

In [88]:
import seaborn as sns
import pandas as pd

iris = sns.load_dataset("iris")
result = iris.loc[:, ['sepal_length','species']]
result

Unnamed: 0,sepal_length,species
0,5.1,setosa
1,4.9,setosa
2,4.7,setosa
3,4.6,setosa
4,5.0,setosa
...,...,...
145,6.7,virginica
146,6.3,virginica
147,6.5,virginica
148,6.2,virginica


## csv

In [None]:
## 파일 내보내기

In [None]:
result.to_csv("iris_result.csv", index=False) # index=False: the row index is not written to the file

In [90]:
result.to_csv("file_test/iris_result.csv", index=False) # write into a sub-folder; the folder must already exist

In [None]:
# DataFrame.to_csv does not create missing folders: writing into a
# non-existent directory raises OSError ("Cannot save file into a
# non-existent directory"), so make sure "dataset/" exists first.
from pathlib import Path

Path("dataset").mkdir(parents=True, exist_ok=True)
result.to_csv("dataset/iris_result.csv", index=False)

In [91]:
## 파일 불러오기

In [93]:
iris_df = pd.read_csv("file_test/iris_result.csv")
iris_df

Unnamed: 0,sepal_length,species
0,5.1,setosa
1,4.9,setosa
2,4.7,setosa
3,4.6,setosa
4,5.0,setosa
...,...,...
145,6.7,virginica
146,6.3,virginica
147,6.5,virginica
148,6.2,virginica


In [None]:
## 