# 시리즈 만들기

'리스트'를 전달해 시리즈를 만든다.

In [2]:
import pandas as pd

# Build a Series from a plain Python list; no index is given here.
mixed_values = ['banana', 42]
s = pd.Series(mixed_values)
print(s)

0    banana
1        42
dtype: object


In [3]:
creator_info = ['Wes McKinney', 'Creator of Pandas']

# When no index is given, labels are assigned automatically starting at 0.
s = pd.Series(creator_info)
print(s)
print()

# The index argument replaces the default labels with custom ones.
s = pd.Series(creator_info, index=['Person', 'Who'])
print(s)

0         Wes McKinney
1    Creator of Pandas
dtype: object

Person         Wes McKinney
Who       Creator of Pandas
dtype: object


# 데이터프레임 만들기

'딕셔너리'를 전달해 데이터프레임을 만든다.

In [4]:
# Column-oriented construction: every dict key becomes a column name and
# the list under it supplies that column's values.
scientist_columns = {
    'Name': ['Rosaline Franklin', 'William Gosset'],
    'Occupation': ['Chemist', 'Statistician'],
    'Born': ['1920-07-25', '1876-06-13'],
    'Died': ['1958-04-16', '1937-10-16'],
    'Age': [37, 61],
}
scientists = pd.DataFrame(scientist_columns)

print(scientists)

                Name    Occupation        Born        Died  Age
0  Rosaline Franklin       Chemist  1920-07-25  1958-04-16   37
1     William Gosset  Statistician  1876-06-13  1937-10-16   61


In [6]:
# The scientist names are used as row labels instead of a regular column.
scientist_columns = {
    'Occupation': ['Chemist', 'Statistician'],
    'Born': ['1920-07-25', '1876-06-13'],
    'Died': ['1958-04-16', '1937-10-16'],
    'Age': [37, 61],
}

# The columns argument controls the column order of the result
# (here Age is placed before Died).
scientists = pd.DataFrame(
    scientist_columns,
    index=['Rosaline Franklin', 'William Gosset'],
    columns=['Occupation', 'Born', 'Age', 'Died'],
)

print(scientists)

                     Occupation        Born  Age        Died
Rosaline Franklin       Chemist  1920-07-25   37  1958-04-16
William Gosset     Statistician  1876-06-13   61  1937-10-16


## 데이터의 순서를 보장하는 df (OrderedDict)

In [9]:
from collections import OrderedDict

# An OrderedDict remembers insertion order, so the columns of the frame
# appear in the order the keys are added below.
ordered_columns = OrderedDict()
ordered_columns['Name'] = ['Rosaline Franklin', 'William Gosset']
ordered_columns['Occupation'] = ['Chemist', 'Statistician']
ordered_columns['Born'] = ['1920-07-25', '1876-06-13']
ordered_columns['Died'] = ['1958-04-16', '1937-10-16']
ordered_columns['Age'] = [37, 61]

scientists = pd.DataFrame(ordered_columns)

print(scientists)

                Name    Occupation        Born        Died  Age
0  Rosaline Franklin       Chemist  1920-07-25  1958-04-16   37
1     William Gosset  Statistician  1876-06-13  1937-10-16   61


# 데이터프레임에서 시리즈 선택하기

데이터프레임에서 행 하나를 시리즈로 선택하려면 loc, iloc 속성을 이용한다.

In [10]:
scientist_columns = {
    'Occupation': ['Chemist', 'Statistician'],
    'Born': ['1920-07-25', '1876-06-13'],
    'Died': ['1958-04-16', '1937-10-16'],
    'Age': [37, 61],
}
scientists = pd.DataFrame(
    scientist_columns,
    index=['Rosaline Franklin', 'William Gosset'],
    columns=['Occupation', 'Born', 'Age', 'Died'],
)

# .loc selects a row by its index label; the selected row is a Series.
# NOTE(review): despite its name, first_row holds the row labelled
# 'William Gosset', which is the second row of the frame.
first_row = scientists.loc['William Gosset']

print(type(first_row))
print()
print(first_row)

# Age was supplied as int, but the row Series is reported with dtype object.

<class 'pandas.core.series.Series'>

Occupation    Statistician
Born            1876-06-13
Age                     61
Died            1937-10-16
Name: William Gosset, dtype: object


# index, values, keys 사용하기

In [11]:
first_row.index

Index(['Occupation', 'Born', 'Age', 'Died'], dtype='object')

In [12]:
first_row.values

array(['Statistician', '1876-06-13', 61, '1937-10-16'], dtype=object)

In [14]:
first_row.keys() # 인덱스랑 같은 역할
# values : 속성, keys : 메소드 

Index(['Occupation', 'Born', 'Age', 'Died'], dtype='object')

In [15]:
first_row.index[0]

'Occupation'

In [16]:
first_row.keys()[0]

'Occupation'

# 시리즈의 mean, min, max, std 메서드 사용하기

In [17]:
ages = scientists['Age']
print(ages)

Rosaline Franklin    37
William Gosset       61
Name: Age, dtype: int64


In [18]:
print(ages.min())
print(ages.max())
print(ages.mean())
print(ages.std())

37
61
49.0
16.97056274847714


In [42]:
# 이외의 메소드

print(ages.drop_duplicates()) # 중복값이 없는 시리즈 반환

Rosaline Franklin    37
William Gosset       61
Name: Age, dtype: int64


In [41]:
# equals() compares the whole Series against another object (same elements),
# so passing a scalar yields False; it is not a membership test.
# To check whether any element equals 50, use (ages == 50).any() or isin().
print(ages.equals(50))

False


In [43]:
# isin() tests each element against the listed candidates and returns a
# boolean Series (True where the element is one of the candidates).
print(ages.isin(['a', 1, 50, 37]))

Rosaline Franklin     True
William Gosset       False
Name: Age, dtype: bool


In [44]:
print(ages.replace(37, 'replaced')) # 대체할 데이터를 찾아 대체

Rosaline Franklin    replaced
William Gosset             61
Name: Age, dtype: object


In [57]:
print(ages.sample()) # 임의의 값을 반환

William Gosset    61
Name: Age, dtype: int64


In [50]:
print(ages.sort_values()) # 값 정렬

Rosaline Franklin    37
William Gosset       61
Name: Age, dtype: int64


In [48]:
ages_df = ages.to_frame() # 시리즈를 데이터프레임으로 변환
print(ages_df)
print()
print(type(ages_df))

                   Age
Rosaline Franklin   37
William Gosset      61

<class 'pandas.core.frame.DataFrame'>


# 시리즈와 불린 추출 사용하기

In [15]:
scientists = pd.read_csv('../data/scientists.csv')

ages = scientists['Age']
print(ages)

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64


In [65]:
print(ages.max())
print()
print(ages.mean())

90

59.125


In [60]:
# 불린 추출

ages[ages > ages.mean()]

1    61
2    90
3    66
7    77
Name: Age, dtype: int64

# 시리즈와 브로드캐스팅

벡터와 벡터의 연산은 일치하는 인덱스의 값끼리 수행된다.

In [68]:
print(ages + ages)
print()
print(ages * ages)
print()
print(ages + 100)
print()
print(ages * 2)

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

0    1369
1    3721
2    8100
3    4356
4    3136
5    2025
6    1681
7    5929
Name: Age, dtype: int64

0    137
1    161
2    190
3    166
4    156
5    145
6    141
7    177
Name: Age, dtype: int64

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64


In [70]:
# Arithmetic on vectors of different lengths

print(ages + pd.Series([1, 100]))
# Values are added only where an index label exists in both Series;
# labels missing from one side produce NaN, and the result is float64.

0     38.0
1    161.0
2      NaN
3      NaN
4      NaN
5      NaN
6      NaN
7      NaN
dtype: float64


In [71]:
rev_ages = ages.sort_index(ascending = False) # 인덱스 역순으로 정렬
print(rev_ages)

7    77
6    41
5    45
4    56
3    66
2    90
1    61
0    37
Name: Age, dtype: int64


In [72]:
print(ages + rev_ages)
print()
print(ages * 2)

# 인덱스가 같은 값끼리 계산하기 때문에 ages의 2배와 값이 같음

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64


# 데이터프레임과 불린 추출

In [5]:
scientists[scientists['Age'] > scientists['Age'].mean()]

Unnamed: 0,Name,Occupation,Born,Died,Age
1,William Gosset,Statistician,1876-06-13,1937-10-16,61


# 데이터프레임과 브로드캐스팅

In [6]:
scientists * 2 # numeric values are doubled; strings are repeated twice (concatenated with themselves)

Unnamed: 0,Name,Occupation,Born,Died,Age
0,Rosaline FranklinRosaline Franklin,ChemistChemist,1920-07-251920-07-25,1958-04-161958-04-16,74
1,William GossetWilliam Gosset,StatisticianStatistician,1876-06-131876-06-13,1937-10-161937-10-16,122


# 열의 자료형 바꾸기와 새로운 열 추가하기

In [7]:
print(scientists['Born'].dtype)
print(scientists['Died'].dtype)

object
object


pd.to_datetime 이용해 object 타입을 datetime 타입으로 변경해줌

In [8]:
born_datetime = pd.to_datetime(scientists['Born'])
born_datetime

0   1920-07-25
1   1876-06-13
Name: Born, dtype: datetime64[ns]

In [9]:
died_datetime = pd.to_datetime(scientists['Died'])
died_datetime

0   1958-04-16
1   1937-10-16
Name: Died, dtype: datetime64[ns]

In [10]:
# Attach the parsed datetime Series to the frame as two new columns.
scientists['born_dt'] = born_datetime
scientists['died_dt'] = died_datetime
scientists.head()

Unnamed: 0,Name,Occupation,Born,Died,Age,born_dt,died_dt
0,Rosaline Franklin,Chemist,1920-07-25,1958-04-16,37,1920-07-25,1958-04-16
1,William Gosset,Statistician,1876-06-13,1937-10-16,61,1876-06-13,1937-10-16


In [11]:
scientists['age_days_dt'] = scientists['died_dt'] - scientists['born_dt']
scientists.head()

Unnamed: 0,Name,Occupation,Born,Died,Age,born_dt,died_dt,age_days_dt
0,Rosaline Franklin,Chemist,1920-07-25,1958-04-16,37,1920-07-25,1958-04-16,13779 days
1,William Gosset,Statistician,1876-06-13,1937-10-16,61,1876-06-13,1937-10-16,22404 days


# 시리즈, 데이터프레임의 데이터 섞어보기

In [17]:
scientists = pd.read_csv('../data/scientists.csv')

In [18]:
scientists['Age']

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64

In [19]:
# Shuffle the values of the Age column with random.shuffle from the standard library.
#
# The shuffle is applied to a plain list copy and the result is assigned back
# to the column. Shuffling the object returned by scientists['Age'] in place
# triggers SettingWithCopyWarning and does not update the frame when pandas
# copy-on-write is enabled. For the same seed, shuffling the list produces the
# same ordering as the in-place shuffle did.

import random
import warnings

# Kept from the original cell: silences all warnings for the rest of the session.
# NOTE(review): no longer needed for the shuffle itself — consider removing.
warnings.filterwarnings('ignore')

random.seed(42)  # fixed seed so the permutation is reproducible
shuffled_ages = scientists['Age'].tolist()
random.shuffle(shuffled_ages)
scientists['Age'] = shuffled_ages
scientists['Age']

0    66
1    56
2    41
3    77
4    90
5    45
6    37
7    61
Name: Age, dtype: int64

# 데이터프레임의 열 삭제하기

In [20]:
scientists.columns

Index(['Name', 'Born', 'Died', 'Age', 'Occupation'], dtype='object')

In [21]:
# 열 삭제하기 위해 drop 메서드 사용
scientists_dropped = scientists.drop('Age', axis = 1)
scientists_dropped.columns

Index(['Name', 'Born', 'Died', 'Occupation'], dtype='object')

# 피클 형식으로 저장하기


- 피클은 데이터를 바이너리 형태로 직렬화한 오브젝트를 저장하는 방법
- 피클로 저장하면 스프레드시트보다 더 작은 용량으로 데이터를 저장할 수 있다.

In [23]:
# 피클로 저장하기 위해서는 to_pickle 메서드 사용

names = scientists['Name']
names.to_pickle('../myoutput/scientists_names_series.pickle')

In [24]:
# 데이터프레임도 피클로 저장 가능

scientists.to_pickle('../myoutput/scientists_df.pickle')

In [26]:
# 파일 불러오기 위해서는 read_pickle 이용 

scientist_names_from_pickle = pd.read_pickle('../myoutput/scientists_names_series.pickle')
scientist_names_from_pickle

0       Rosaline Franklin
1          William Gosset
2    Florence Nightingale
3             Marie Curie
4           Rachel Carson
5               John Snow
6             Alan Turing
7            Johann Gauss
Name: Name, dtype: object

In [27]:
scientists_from_pickle = pd.read_pickle('../myoutput/scientists_df.pickle')
scientists_from_pickle

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,66,Chemist
1,William Gosset,1876-06-13,1937-10-16,56,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,41,Nurse
3,Marie Curie,1867-11-07,1934-07-04,77,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,90,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,37,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,61,Mathematician


# CSV, TSV 파일로 저장하기

In [28]:
# csv 파일과 tsv 파일로 저장하기

names.to_csv('../myoutput/scientists_names_series.csv')
scientists.to_csv('../myoutput/scientists_df.tsv', sep = '\t')

## 엑셀 파일로 저장하기

In [30]:
import xlwt
import openpyxl

In [32]:
# Convert the names Series to a one-column DataFrame before exporting.
names_df = names.to_frame()

# NOTE(review): writing the legacy .xls format relies on the xlwt engine, which
# newer pandas releases (2.0+) no longer support — confirm the pandas version
# in use, or keep only the .xlsx export below.
names_df.to_excel('../myoutput/scientists_names_series_df.xls')
# Presumably written through the openpyxl engine imported above — TODO confirm.
names_df.to_excel('../myoutput/scientists_names_series_df.xlsx')