In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
%%time

# Read the NFL roster: "Birthday" is parsed as datetimes and "Name" becomes the index.
nfl = pd.read_csv(
    "C:/python/datas/nfl.csv",
    parse_dates=["Birthday"],
    index_col=["Name"],
)

# Label-based lookup: move "Name" back to a column, index by "Team", select the
# "New York Jets" rows with .loc, and print the one row with the smallest Birthday.
jets_by_label = nfl.reset_index().set_index("Team").loc["New York Jets"]
print(jets_by_label.nsmallest(1, "Birthday"))

                     Name Position   Birthday   Salary
Team                                                  
New York Jets  Ryan Kalil        C 1985-03-29  2400000
Wall time: 83.9 ms


In [3]:
%%time
# Position-based lookup: sort the roster by "Team", renumber the rows, index by
# "Team", then slice by row position with .iloc.
# NOTE(review): rows 1190:1248 are presumably the New York Jets block of the
# sorted roster (the printed result is a Jets player) - confirm against the data.
roster_sorted = (
    nfl.reset_index()
    .sort_values("Team")
    .reset_index(drop=True)
    .set_index("Team")
)
print(roster_sorted.iloc[1190:1248].nsmallest(1, "Birthday"))

                     Name Position   Birthday   Salary
Team                                                  
New York Jets  Ryan Kalil        C 1985-03-29  2400000
Wall time: 20.3 ms


In [4]:
# Sample table: one name column ("이름") plus three numeric columns.
data1 = {
    "이름": ["일식", "이식", "삼식", "사식", "오식"],
    "국어": [60, 70, 90, 80, 100],
    "영어": [70, 86, 82, 88, 100],
    "수학": [65, 82, 85, 90, 100],
}

# Build the frame, then promote the "이름" column to the row index.
df1 = pd.DataFrame(data1)
df1 = df1.set_index("이름")
print(df1)

# Turn the "이름" index back into an ordinary column; the rows get a default
# integer index again.
df1 = df1.reset_index()
df1

     국어   영어   수학
이름               
일식   60   70   65
이식   70   86   82
삼식   90   82   85
사식   80   88   90
오식  100  100  100


Unnamed: 0,이름,국어,영어,수학
0,일식,60,70,65
1,이식,70,86,82
2,삼식,90,82,85
3,사식,80,88,90
4,오식,100,100,100


In [5]:
# 타이타닉 데이터셋에서 alive라는 컬럼값이 'no'이면 False로, 
# 'yes'라면 True로 변경

# - change_boolean(value)함수를 만든다. 이 함수는 값이 'yes'이면 True반환,
#   'no'이면 False 반환
# - 만든 함수를 활용해서 타이타닉호 데이터셋 중 alive라는 column의 값을
#   'no'면 False로 'yes'면 True로 치환해서 boolean 타입으로 변경

In [6]:
# Load seaborn's "titanic" example dataset.
titanic = sns.load_dataset("titanic")
# Preview the first five rows.
titanic.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [7]:
def change_boolean(value):
    """Convert an ``alive`` label into a boolean.

    Returns True when ``value`` equals 'yes'. Every other value, including
    'no', yields False.
    """
    return True if value == 'yes' else False

# Convert the 'yes'/'no' labels in "alive" to booleans with change_boolean.
# The guard keeps this cell safe to re-run: once the column is already boolean,
# applying change_boolean again would turn every value into False, because
# True/False never compare equal to 'yes'.
if not pd.api.types.is_bool_dtype(titanic["alive"]):
    titanic["alive"] = titanic["alive"].apply(change_boolean)

# List the dtype of every column.
titanic.dtypes

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive              bool
alone              bool
dtype: object

In [8]:
# age에서 NaN인 값을 제거
# adult_female column을 만들고 나이(age)가 20이상이고 성별(sex)이
# 'female'인 값은 True를 아니면 False 값 할당

In [9]:
# Keep only the rows whose "age" is present (rows with a missing age are dropped).
has_age = titanic["age"].notna()
titanic = titanic[has_age]
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,False,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,True,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,True,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,True,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,,Queenstown,False,False
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,False,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,True,True
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,True,True


In [10]:
# Flag adult women: True only where sex is "female" AND age is at least 20.
# DataFrame.assign returns a new frame instead of writing a column into the
# boolean-filtered `titanic`, which avoids pandas' SettingWithCopyWarning.
titanic = titanic.assign(
    adult_female=(titanic["sex"] == "female") & (titanic["age"] >= 20)
)
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,adult_female
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,False,False,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,True,False,True
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,True,True,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,True,False,True
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,,Queenstown,False,False,True
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,False,True,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,True,True,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,True,True,False


In [11]:
# 작업한 titanic 파일을 현재 경로에 titanic_age.csv 파일로 index 없이 저장
# 저장한 파일이 올바른지 다시 불러와서 그 값을 확인

# 단, index의 변경은 신경쓰지 않는다.

In [12]:
# Show the notebook's current working directory (IPython line magic).
%pwd

'C:\\python\\2023.03.10_pandas5'

In [13]:
# Write the frame to "titanic_age.csv" (a relative path), leaving the row index out of the file.
titanic.to_csv(path_or_buf="titanic_age.csv", index=False)

In [14]:
# Read the saved file back and display it to check what was written.
pd.read_csv(filepath_or_buffer="titanic_age.csv")

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,adult_female
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,False,False,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,True,False,True
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,True,True,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,True,False,True
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
709,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,,Queenstown,False,False,True
710,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,False,True,False
711,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,True,True,False
712,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,True,True,False


In [15]:
# 타이타닉 데이터셋 새로 불러온 후에 티켓 클래스(pclass)에 따른
# 생존율 비교

# survived가 1이면 생존, 0이면 사망
# groupby() 메서드 사용

# Load the dataset again so earlier edits to `titanic` do not affect this result.
titanic = sns.load_dataset("titanic")
# Group by ticket class and average "survived"; per the task notes survived is
# 1 for survivors and 0 otherwise, so the group mean is the survival rate.
titanic.groupby(["pclass"])[["survived"]].mean()

Unnamed: 0_level_0,survived
pclass,Unnamed: 1_level_1
1,0.62963
2,0.472826
3,0.242363


In [16]:
# 타이타닉호 데이터셋을 새로 불러온 후에 성별(sex)에 따른 생존율 비교

# Load the dataset again so earlier edits to `titanic` do not affect this result.
titanic = sns.load_dataset("titanic")
# Group by sex and average the "survived" column to get one mean per group.
titanic.groupby(["sex"])[["survived"]].mean()

Unnamed: 0_level_0,survived
sex,Unnamed: 1_level_1
female,0.742038
male,0.188908
