# Pandas Recap

# 1. Pandas DataFrame Basics
***

## 1.1 데이터 집합 불러오기

In [1]:
import pandas as pd
df = pd.read_csv('../data/gapminder.tsv', sep='\t')

In [2]:
df.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


## 1.2 열단위 데이터 추출

### df에서 'country'열을 추출하여 country_sr 변수에 저장하기. (country_sr의 타입은 Series 가 되어야 한다)

In [12]:
country_sr = df['country']
country_sr
type(country_sr)

pandas.core.series.Series

### 'year'열과 'pop'열을 동시에 추출하여 year_pop_df 변수에 저장하기. (year_pop_df의 타입은 DataFrame이 되어야 한다)

In [14]:
year_pop_df = df[['year', 'pop']]
year_pop_df
type(year_pop_df)

pandas.core.frame.DataFrame

### 'year'열을 추출하여 year_df 변수에 저장하기. (year_df의 타입은 DataFrame이 되어야 한다)

In [15]:
year_df = df[['year']]
year_df

Unnamed: 0,year
0,1952
1,1957
2,1962
3,1967
4,1972
...,...
1699,1987
1700,1992
1701,1997
1702,2002


## 1.3 행단위 데이터 추출

### df에서 2번째 행 데이터 추출하기 (loc 속성 이용)

In [46]:
print(df.head())

print(df.loc[1])

       country continent  year  lifeExp       pop   gdpPercap
0  Afghanistan      Asia  1952   28.801   8425333  779.445314
1  Afghanistan      Asia  1957   30.332   9240934  820.853030
2  Afghanistan      Asia  1962   31.997  10267083  853.100710
3  Afghanistan      Asia  1967   34.020  11537966  836.197138
4  Afghanistan      Asia  1972   36.088  13079460  739.981106
country      Afghanistan
continent           Asia
year                1957
lifeExp           30.332
pop              9240934
gdpPercap      820.85303
Name: 1, dtype: object


### df에서 2번째 행 데이터 추출하기 (iloc 속성 이용)

In [45]:
print(df.iloc[1])

country      Afghanistan
continent           Asia
year                1957
lifeExp           30.332
pop              9240934
gdpPercap      820.85303
Name: 1, dtype: object


### df에서 마지막 행 데이터 추출하기 (loc 속성 이용)

In [49]:
num_rows = df.shape[0] - 1
print(df.loc[num_rows])

country        Zimbabwe
continent        Africa
year               2007
lifeExp          43.487
pop            12311143
gdpPercap    469.709298
Name: 1703, dtype: object


### df에서 마지막 행 데이터 추출하기 (iloc 속성 이용)

In [50]:
print(df.iloc[-1])

country        Zimbabwe
continent        Africa
year               2007
lifeExp          43.487
pop            12311143
gdpPercap    469.709298
Name: 1703, dtype: object


## 1.4 loc, iloc 속성 자유자재로 사용하기 - [ [ 행 ], [ 열 ] ]

### 'country', 'year', 'gdpPercap' 열에 대해 모든 행 데이터 추출하기(loc 사용)

In [53]:
print(df.loc[:, ['country', 'year', 'gdpPercap']])

          country  year   gdpPercap
0     Afghanistan  1952  779.445314
1     Afghanistan  1957  820.853030
2     Afghanistan  1962  853.100710
3     Afghanistan  1967  836.197138
4     Afghanistan  1972  739.981106
...           ...   ...         ...
1699     Zimbabwe  1987  706.157306
1700     Zimbabwe  1992  693.420786
1701     Zimbabwe  1997  792.449960
1702     Zimbabwe  2002  672.038623
1703     Zimbabwe  2007  469.709298

[1704 rows x 3 columns]


### 'country', 'year', 'gdpPercap' 열에 대해 처음부터 100개 행 데이터 추출하기(loc 사용)

### 'country', 'year', 'gdpPercap' 열에 대해 처음부터 100개 행 데이터 추출하기(iloc 사용)

# 2. Pandas Data Structures
***

## 2.1 데이터 만들기

In [3]:
import pandas as pd

In [60]:
scientists = pd.DataFrame(
    data={'Occupation': ['Chemist', 'Statistician'], 
          'Born': ['1920-07-25', '1876-06-13'], 
          'Died': ['1958-04-16', '1937-10-16'],
          'Age': [37, 61]},
    index=['Rosaline Franklin', 'William Gosset'],
    columns=['Occupation', 'Born', 'Died', 'Age'])

print(scientists)

                     Occupation        Born        Died  Age
Rosaline Franklin       Chemist  1920-07-25  1958-04-16   37
William Gosset     Statistician  1876-06-13  1937-10-16   61


## 2.2 데이터 추출

### 'William Gosset' 행 데이터 추출하기 (loc 사용)

In [63]:
print(scientists.loc['William Gosset'])

Occupation    Statistician
Born            1876-06-13
Died            1937-10-16
Age                     61
Name: William Gosset, dtype: object


### 'William Gosset' 행 데이터 추출하기 (iloc 사용)

In [61]:
print(scientists.iloc[1])

Occupation    Statistician
Born            1876-06-13
Died            1937-10-16
Age                     61
Name: William Gosset, dtype: object


### 'William Gosset'의 Occupation 추출 (loc 사용)

In [65]:
print(scientists.loc['William Gosset', 'Occupation'])

Statistician


### 'Rosaline Franklin'의 Occupation을 Programmer 로 변경(iloc 사용)

In [73]:
scientists.iloc[0, 0] = 'Programmer'
scientists

Unnamed: 0,Occupation,Born,Died,Age
Rosaline Franklin,Programmer,1920-07-25,1958-04-16,37
William Gosset,Statistician,1876-06-13,1937-10-16,61


## 2.3 index, columns, values 속성 사용하기

### scientists의 index 가져오기

In [74]:
scientists.index

Index(['Rosaline Franklin', 'William Gosset'], dtype='object')

### scientists의 columns 가져오기

In [75]:
scientists.columns

Index(['Occupation', 'Born', 'Died', 'Age'], dtype='object')

### scientists의 모든 values 가져오기

In [76]:
scientists.values

array([['Programmer', '1920-07-25', '1958-04-16', 37],
       ['Statistician', '1876-06-13', '1937-10-16', 61]], dtype=object)

## 2.4 시리즈의 기초 통계 메서드 사용하기

### 시리즈 메서드 정리

| 시리즈 메서드    | 설명 |
| ----------------| --- |
| append          | 2개 이상의 시리즈 연결 |
| describe        | 요약 통계량 계산 |
| drop_duplicates | 중복값이 없는 시리즈 반환 |
| equals          | 시리즈에 해당 값을 가진 요소가 있는지 확인 |
| get_values      | 시리즈 값 구하기 (values 속성과 동일) |
| isin            | 시리즈에 포함된 값이 있는지 확인 |
| min             | 최솟값 반환 |
| max             | 최댓값 반환 |
| mean            | 산술 평균 반환 |
| median          | 중간값 반환 |
| replace         | 특정 값을 가진 시리즈 값을 교체 |
| sample          | 시리즈에서 임의의 값을 반환 |
| sort_values     | 값을 정렬 |
| to_frame        | 시리즈를 데이터프레임으로 변환 |

### scientiest 에서 평균 나이 ('Age'열) 구하기

In [83]:
print(scientists)

scientists['Age'].mean()


                     Occupation        Born        Died  Age
Rosaline Franklin    Programmer  1920-07-25  1958-04-16   37
William Gosset     Statistician  1876-06-13  1937-10-16   61


49.0

### scientiest 에서 최대 나이 ('Age'열) 구하기

In [84]:
scientists['Age'].max()

61

### scientiest 에서 최소 나이 ('Age'열) 구하기

In [85]:
scientists['Age'].min()

37

### scientiest 에서 나이('Age'열)에 대한 표준편차 구하기

In [86]:
scientists['Age'].std()

16.97056274847714

## 2.5 시리즈 다루기 - 응용

In [4]:
scientists = pd.read_csv('../data/scientists.csv')
print(scientists)

                   Name        Born        Died  Age          Occupation
0     Rosaline Franklin  1920-07-25  1958-04-16   37             Chemist
1        William Gosset  1876-06-13  1937-10-16   61        Statistician
2  Florence Nightingale  1820-05-12  1910-08-13   90               Nurse
3           Marie Curie  1867-11-07  1934-07-04   66             Chemist
4         Rachel Carson  1907-05-27  1964-04-14   56           Biologist
5             John Snow  1813-03-15  1858-06-16   45           Physician
6           Alan Turing  1912-06-23  1954-06-07   41  Computer Scientist
7          Johann Gauss  1777-04-30  1855-02-23   77       Mathematician


### 수명(Age)이 평균수명 보다 적은 과학자들의 수명 데이터(시리즈) 추출하기

In [10]:
ages = scientists['Age']
print(ages.mean() < ages)

0    False
1     True
2     True
3     True
4    False
5    False
6    False
7     True
Name: Age, dtype: bool


### 수명(Age)이 평균수명 보다 큰 과학자들의 이름(Name) 데이터(시리즈) 추출하기

In [14]:
ages = scientists['Age']
print(ages[ages.mean() < ages])

1    61
2    90
3    66
7    77
Name: Age, dtype: int64


### 1800년대에 태어난 과학자들의 이름(Name) 데이터(시리즈) 추출하기 (& 연산자 사용)

In [17]:
names = scientists['Name']
borns = scientists['Born']

# 1800-01-01 <= x < 1900-01-01
print(names[(borns >= '1800-01-01') & (borns < '1900-01-01')])

1          William Gosset
2    Florence Nightingale
3             Marie Curie
5               John Snow
Name: Name, dtype: object


### 직업(Occupation)이 Chemist 또는 Physician 인 과학자들의 이름(Name) 데이터(시리즈) 추출하기 (| 연산자 사용)

In [24]:
scientists

chemist_names = scientists['Occupation']
physician_names = scientists['Occupation']

print(scientists[(chemist_names == 'Chemist') | (physician_names == 'Physician')])

                Name        Born        Died  Age Occupation
0  Rosaline Franklin  1920-07-25  1958-04-16   37    Chemist
3        Marie Curie  1867-11-07  1934-07-04   66    Chemist
5          John Snow  1813-03-15  1858-06-16   45  Physician


### 수명 데이터에 10 더하기

In [27]:
scientists

scientists['Age'] + 10

0     47
1     71
2    100
3     76
4     66
5     55
6     51
7     87
Name: Age, dtype: int64

## 2.6 데이터프레임 다루기

### 수명(Age)이 평균수명 보다 적은 과학자들의 전체 데이터(데이터프레임) 추출하기

In [38]:
ages = scientists['Age']
ages_mean = ages.mean()

print(scientists[ages < ages_mean])

                Name        Born        Died  Age          Occupation
0  Rosaline Franklin  1920-07-25  1958-04-16   37             Chemist
4      Rachel Carson  1907-05-27  1964-04-14   56           Biologist
5          John Snow  1813-03-15  1858-06-16   45           Physician
6        Alan Turing  1912-06-23  1954-06-07   41  Computer Scientist


### 출생일(Born) 데이터의 타입을 datetime 으로 변환하여 새로운 열(Born_dt)로 추가하기 (to_datetime 사용)

In [53]:
scientists['Born_dt'] = pd.to_datetime(scientists['Born'], format='%y-%m-%d')
print(scientists)

                   Name                Born        Died  Age   
0     Rosaline Franklin 1920-01-25 00:07:00  1958-04-16   37  \
1        William Gosset 1876-01-13 00:06:00  1937-10-16   61   
2  Florence Nightingale 1820-01-12 00:05:00  1910-08-13   90   
3           Marie Curie 1867-01-07 00:11:00  1934-07-04   66   
4         Rachel Carson 1907-01-27 00:05:00  1964-04-14   56   
5             John Snow 1813-01-15 00:03:00  1858-06-16   45   
6           Alan Turing 1912-01-23 00:06:00  1954-06-07   41   
7          Johann Gauss 1777-01-30 00:04:00  1855-02-23   77   

           Occupation             Born_dt  
0             Chemist 1920-01-25 00:07:00  
1        Statistician 1876-01-13 00:06:00  
2               Nurse 1820-01-12 00:05:00  
3             Chemist 1867-01-07 00:11:00  
4           Biologist 1907-01-27 00:05:00  
5           Physician 1813-01-15 00:03:00  
6  Computer Scientist 1912-01-23 00:06:00  
7       Mathematician 1777-01-30 00:04:00  


### 출생일(Born_dt) 데이터를 사용하여 1850년 이후에 태어난 과학자의 전체데이터(데이터프레임) 추출하기

In [54]:
# scientists['Born_dt']

print(scientists[scientists['Born_dt'] > '1850-01-01'])

                Name                Born        Died  Age          Occupation   
0  Rosaline Franklin 1920-01-25 00:07:00  1958-04-16   37             Chemist  \
1     William Gosset 1876-01-13 00:06:00  1937-10-16   61        Statistician   
3        Marie Curie 1867-01-07 00:11:00  1934-07-04   66             Chemist   
4      Rachel Carson 1907-01-27 00:05:00  1964-04-14   56           Biologist   
6        Alan Turing 1912-01-23 00:06:00  1954-06-07   41  Computer Scientist   

              Born_dt  
0 1920-01-25 00:07:00  
1 1876-01-13 00:06:00  
3 1867-01-07 00:11:00  
4 1907-01-27 00:05:00  
6 1912-01-23 00:06:00  


### Born_dt 열 삭제하기 (drop 함수 사용)

In [55]:
scientists.drop('Born_dt', axis=1, inplace=True)

print(scientists)

                   Name                Born        Died  Age   
0     Rosaline Franklin 1920-01-25 00:07:00  1958-04-16   37  \
1        William Gosset 1876-01-13 00:06:00  1937-10-16   61   
2  Florence Nightingale 1820-01-12 00:05:00  1910-08-13   90   
3           Marie Curie 1867-01-07 00:11:00  1934-07-04   66   
4         Rachel Carson 1907-01-27 00:05:00  1964-04-14   56   
5             John Snow 1813-01-15 00:03:00  1858-06-16   45   
6           Alan Turing 1912-01-23 00:06:00  1954-06-07   41   
7          Johann Gauss 1777-01-30 00:04:00  1855-02-23   77   

           Occupation  
0             Chemist  
1        Statistician  
2               Nurse  
3             Chemist  
4           Biologist  
5           Physician  
6  Computer Scientist  
7       Mathematician  


# 3. Data Assembly, Tidy Data, Data Types and Groupby
***

## 3.1 데이터 불러오기

In [57]:
import pandas as pd

In [58]:
scores_1_1 = pd.read_csv('../data/scores_1_1.csv', header=None)
scores_1_2 = pd.read_csv('../data/scores_1_2.csv', header=None)
scores_2_1 = pd.read_csv('../data/scores_2_1.csv', header=None)
scores_2_2 = pd.read_csv('../data/scores_2_2.csv', header=None)

## 3.2 데이터 합치기

### scores_1_1 와 scores_1_2 를 열기준(axis=1) 으로 합쳐 scores_1 변수에 저장하기 (concat)

In [59]:
scores_1 = pd.concat([scores_1_1, scores_1_2], axis=1, ignore_index=True)
print(scores_1)

     0   1   2    3    4   5   6   7    8   9   ...  15  16   17  18  19  20   
0   NaN   0   0  0.0  1.0   0   0   1  0.0   0  ...   0   0  0.0   1   0   1  \
1   1.0   1   1  0.0  1.0   0   1   1  1.0   0  ...   0   0  0.0   1   1   0   
2   0.0   0   1  1.0  1.0   1   0   1  0.0   0  ...   0   0  0.0   0   0   0   
3   1.0   1   1  1.0  1.0   1   1   1  1.0   0  ...   1   1  1.0   1   1   0   
4   1.0   1   1  0.0  1.0   0   0   1  1.0   0  ...   1   0  0.0   0   1   1   
5   1.0   1   0  1.0  1.0   1   0   1  1.0   0  ...   1   0  1.0   1   1   1   
6   1.0   1   0  1.0  0.0   1   1   1  1.0   0  ...   0   0  0.0   0   1   0   
7   1.0   1   0  0.0  1.0   0   0   1  0.0   0  ...   1   0  0.0   1   1   0   
8   1.0   0   1  1.0  1.0   0   0   1  1.0   0  ...   1   0  0.0   1   1   0   
9   1.0   0   1  0.0  1.0   0   0   0  0.0   0  ...   0   0  1.0   0   0   1   
10  1.0   0   0  0.0  1.0   0   0   1  0.0   0  ...   1   1  1.0   1   0   0   
11  1.0   1   0  0.0  1.0   0   0   0  1

### scores_1 의 열이름을 1, 2, 3, ... 로 변경하기

In [61]:
print(scores_1)

     1   2   3    4    5   6   7   8    9   10  ...  16  17   18  19  20  21   
0   NaN   0   0  0.0  1.0   0   0   1  0.0   0  ...   0   0  0.0   1   0   1  \
1   1.0   1   1  0.0  1.0   0   1   1  1.0   0  ...   0   0  0.0   1   1   0   
2   0.0   0   1  1.0  1.0   1   0   1  0.0   0  ...   0   0  0.0   0   0   0   
3   1.0   1   1  1.0  1.0   1   1   1  1.0   0  ...   1   1  1.0   1   1   0   
4   1.0   1   1  0.0  1.0   0   0   1  1.0   0  ...   1   0  0.0   0   1   1   
5   1.0   1   0  1.0  1.0   1   0   1  1.0   0  ...   1   0  1.0   1   1   1   
6   1.0   1   0  1.0  0.0   1   1   1  1.0   0  ...   0   0  0.0   0   1   0   
7   1.0   1   0  0.0  1.0   0   0   1  0.0   0  ...   1   0  0.0   1   1   0   
8   1.0   0   1  1.0  1.0   0   0   1  1.0   0  ...   1   0  0.0   1   1   0   
9   1.0   0   1  0.0  1.0   0   0   0  0.0   0  ...   0   0  1.0   0   0   1   
10  1.0   0   0  0.0  1.0   0   0   1  0.0   0  ...   1   1  1.0   1   0   0   
11  1.0   1   0  0.0  1.0   0   0   0  1

### scores_1 에서 누락 데이터를 0 으로 채우기 (fillna)

In [62]:
scores_1.fillna(0, inplace=True)
print(scores_1.head())

    1   2   3    4    5   6   7   8    9   10  ...  16  17   18  19  20  21   
0  0.0   0   0  0.0  1.0   0   0   1  0.0   0  ...   0   0  0.0   1   0   1  \
1  1.0   1   1  0.0  1.0   0   1   1  1.0   0  ...   0   0  0.0   1   1   0   
2  0.0   0   1  1.0  1.0   1   0   1  0.0   0  ...   0   0  0.0   0   0   0   
3  1.0   1   1  1.0  1.0   1   1   1  1.0   0  ...   1   1  1.0   1   1   0   
4  1.0   1   1  0.0  1.0   0   0   1  1.0   0  ...   1   0  0.0   0   1   1   

    22   23   24  25  
0  1.0  1.0  0.0   1  
1  1.0  1.0  0.0   1  
2  0.0  0.0  0.0   1  
3  1.0  1.0  1.0   1  
4  1.0  1.0  1.0   1  

[5 rows x 25 columns]


### scores_1 모든 데이터타입을 int32 으로 변경하기 (astype)

In [63]:
scores_1 = scores_1.astype('int32')

### scores_1 의 모든 데이터에 4를 곱한 값으로 재지정 하기 (브로드캐스팅)

In [65]:
scores_1 = scores_1 * 4
print(scores_1)

    1   2   3   4   5   6   7   8   9   10  ...  16  17  18  19  20  21  22   
0    0   0   0   0  16   0   0  16   0   0  ...   0   0   0  16   0  16  16  \
1   16  16  16   0  16   0  16  16  16   0  ...   0   0   0  16  16   0  16   
2    0   0  16  16  16  16   0  16   0   0  ...   0   0   0   0   0   0   0   
3   16  16  16  16  16  16  16  16  16   0  ...  16  16  16  16  16   0  16   
4   16  16  16   0  16   0   0  16  16   0  ...  16   0   0   0  16  16  16   
5   16  16   0  16  16  16   0  16  16   0  ...  16   0  16  16  16  16  16   
6   16  16   0  16   0  16  16  16  16   0  ...   0   0   0   0  16   0   0   
7   16  16   0   0  16   0   0  16   0   0  ...  16   0   0  16  16   0  16   
8   16   0  16  16  16   0   0  16  16   0  ...  16   0   0  16  16   0  16   
9   16   0  16   0  16   0   0   0   0   0  ...   0   0  16   0   0  16  16   
10  16   0   0   0  16   0   0  16   0   0  ...  16  16  16  16   0   0  16   
11  16  16   0   0  16   0   0   0  16   0  ...  16 

### scores_1 에 새로운 class 열 추가하기(class 데이터 값은 모두 '1반')

In [68]:
scores_1['class'] = '1반'

### scores_2_1 와 scores_2_2 를 열기준(axis=1) 으로 합쳐 scores_2 변수에 저장하기 (concat)

In [71]:
scores_2 = pd.concat([scores_2_1, scores_2_2], axis=True, ignore_index=True)
print(scores_2)

    0    1   2   3   4   5    6   7   8   9   ...  15  16  17  18  19  20  21   
0    1  0.0   1   0   1   0  1.0   1   1   0  ...   0   1   0   1   1   1   0  \
1    1  1.0   1   0   1   0  1.0   1   1   1  ...   1   1   1   1   1   0   1   
2    0  1.0   1   0   1   0  1.0   1   1   0  ...   1   0   1   0   0   1   0   
3    1  0.0   0   1   1   1  1.0   0   0   0  ...   1   0   0   1   0   0   0   
4    1  1.0   0   1   1   1  0.0   1   1   0  ...   0   0   0   0   0   1   0   
5    0  0.0   0   1   1   1  NaN   0   1   0  ...   0   0   0   0   0   1   0   
6    1  1.0   0   0   1   0  1.0   0   1   0  ...   1   0   0   0   0   1   1   
7    1  1.0   1   1   1   1  0.0   0   1   0  ...   1   0   0   0   1   1   0   
8    1  1.0   0   1   1   0  0.0   1   1   0  ...   1   0   0   1   0   1   1   
9    1  1.0   0   1   1   1  1.0   1   1   0  ...   1   0   0   1   1   0   1   
10   1  1.0   1   0   1   0  1.0   1   1   0  ...   1   0   0   1   1   1   1   
11   1  0.0   0   1   1   1 

### scores_2 의 열이름을 1, 2, 3, ... 로 변경하기 (range)

In [74]:
scores_2.columns = range(1, len(scores_2.columns) + 1)
print(scores_2)

    1    2   3   4   5   6    7   8   9   10  ...  16  17  18  19  20  21  22   
0    1  0.0   1   0   1   0  1.0   1   1   0  ...   0   1   0   1   1   1   0  \
1    1  1.0   1   0   1   0  1.0   1   1   1  ...   1   1   1   1   1   0   1   
2    0  1.0   1   0   1   0  1.0   1   1   0  ...   1   0   1   0   0   1   0   
3    1  0.0   0   1   1   1  1.0   0   0   0  ...   1   0   0   1   0   0   0   
4    1  1.0   0   1   1   1  0.0   1   1   0  ...   0   0   0   0   0   1   0   
5    0  0.0   0   1   1   1  NaN   0   1   0  ...   0   0   0   0   0   1   0   
6    1  1.0   0   0   1   0  1.0   0   1   0  ...   1   0   0   0   0   1   1   
7    1  1.0   1   1   1   1  0.0   0   1   0  ...   1   0   0   0   1   1   0   
8    1  1.0   0   1   1   0  0.0   1   1   0  ...   1   0   0   1   0   1   1   
9    1  1.0   0   1   1   1  1.0   1   1   0  ...   1   0   0   1   1   0   1   
10   1  1.0   1   0   1   0  1.0   1   1   0  ...   1   0   0   1   1   1   1   
11   1  0.0   0   1   1   1 

### scores_2 에서 누락 데이터를 0 으로 채우기 (fillna)

In [76]:
scores_2.fillna(0, inplace=True)
scores_2.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,16,17,18,19,20,21,22,23,24,25
0,1,0.0,1,0,1,0,1.0,1,1,0,...,0,1,0,1,1,1,0,1,1,1
1,1,1.0,1,0,1,0,1.0,1,1,1,...,1,1,1,1,1,0,1,1,1,1
2,0,1.0,1,0,1,0,1.0,1,1,0,...,1,0,1,0,0,1,0,0,1,1
3,1,0.0,0,1,1,1,1.0,0,0,0,...,1,0,0,1,0,0,0,0,0,1
4,1,1.0,0,1,1,1,0.0,1,1,0,...,0,0,0,0,0,1,0,1,1,1


### scores_2 의 모든 데이터타입을 int32 으로 변경하기 (astype)

In [80]:
scores_2 = scores_2.astype('int32')

### scores_2 의 모든 데이터에 4를 곱한 값으로 재지정하기 (브로드캐스팅)

In [81]:
scores_2 = scores_2 * 4

### sores_2 에 새로운 class열 추가하기(class 데이터 값은 모두 '2반')

In [82]:
scores_2['class'] = '2반'

### scores_1 과 scores_2 를 행기준(axis=0) 으로 합쳐 scores 변수에 저장하기 (concat)

In [85]:
scores = pd.concat([scores_1, scores_2], axis=0, ignore_index=True)
print(scores)

     1   2   3   4   5   6   7   8   9  10  ...  17  18  19  20  21  22  23   
0    0   0   0   0  16   0   0  16   0   0  ...   0   0  16   0  16  16  16  \
1   16  16  16   0  16   0  16  16  16   0  ...   0   0  16  16   0  16  16   
2    0   0  16  16  16  16   0  16   0   0  ...   0   0   0   0   0   0   0   
3   16  16  16  16  16  16  16  16  16   0  ...  16  16  16  16   0  16  16   
4   16  16  16   0  16   0   0  16  16   0  ...   0   0   0  16  16  16  16   
..  ..  ..  ..  ..  ..  ..  ..  ..  ..  ..  ...  ..  ..  ..  ..  ..  ..  ..   
71   4   4   4   4   4   4   4   0   4   0  ...   0   0   4   0   4   0   4   
72   4   4   0   0   4   0   4   4   4   0  ...   0   0   4   0   0   4   4   
73   4   0   4   0   4   0   4   0   4   4  ...   0   0   0   4   0   0   0   
74   0   0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   
75   4   0   4   0   4   0   0   4   4   0  ...   4   0   4   0   0   4   4   

    24  25  class  
0    0  16     1반  
1    0  16 

### scores에 새로운 id열(values = 1, 2, 3, ...) 추가하기

In [104]:
scores['id'] = range(1, len(scores) + 1)
print(scores)

     1   2   3   4   5   6   7   8   9  10  ...  18  19  20  21  22  23  24   
0    0   0   0   0  16   0   0  16   0   0  ...   0  16   0  16  16  16   0  \
1   16  16  16   0  16   0  16  16  16   0  ...   0  16  16   0  16  16   0   
2    0   0  16  16  16  16   0  16   0   0  ...   0   0   0   0   0   0   0   
3   16  16  16  16  16  16  16  16  16   0  ...  16  16  16   0  16  16  16   
4   16  16  16   0  16   0   0  16  16   0  ...   0   0  16  16  16  16  16   
..  ..  ..  ..  ..  ..  ..  ..  ..  ..  ..  ...  ..  ..  ..  ..  ..  ..  ..   
71   4   4   4   4   4   4   4   0   4   0  ...   0   4   0   4   0   4   0   
72   4   4   0   0   4   0   4   4   4   0  ...   0   4   0   0   4   4   4   
73   4   0   4   0   4   0   4   0   4   4  ...   0   0   4   0   0   0   0   
74   0   0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   
75   4   0   4   0   4   0   0   4   4   0  ...   0   4   0   0   4   4   0   

    25  class  id  
0   16     1반   1  
1   16     

### class 열의 데이터타입을 category 로 변경하기 (astype)

In [96]:
scores['class'] = scores['class'].astype('category')
print(scores.dtypes)

1           int32
2           int32
3           int32
4           int32
5           int32
6           int32
7           int32
8           int32
9           int32
10          int32
11          int32
12          int32
13          int32
14          int32
15          int32
16          int32
17          int32
18          int32
19          int32
20          int32
21          int32
22          int32
23          int32
24          int32
25          int32
class    category
id          int64
dtype: object


## 3.3 분석하기

### class, id 열을 고정하고, 나머지 열을 피벗하여 scores_long 변수에 저장하기 (melt - var_name='question', value_name='score')

In [97]:
scores_long = pd.melt(scores, id_vars=['class', 'id'], var_name='question', value_name='score')
print(scores_long)

     class  id question  score
0       1반   1        1      0
1       1반   2        1     16
2       1반   3        1      0
3       1반   4        1     16
4       1반   5        1     16
...    ...  ..      ...    ...
1895    2반  72       25      4
1896    2반  73       25      0
1897    2반  74       25      4
1898    2반  75       25      0
1899    2반  76       25      4

[1900 rows x 4 columns]


### scores_long 에서 학생별 점수 구하기 (groupby, sum)

In [100]:
print(scores_long.groupby('id').score.sum())

id
1     160
2     224
3     160
4     352
5     240
     ... 
72     72
73     48
74     52
75      0
76     52
Name: score, Length: 76, dtype: int32


### scores_long 에서 반/학생별 평균 점수 구하여 class_scores 변수에 저장하기 (groupby, sum, reset_index)

In [110]:
class_scores = scores_long.groupby(['class', 'id']).score.sum().reset_index()
print(class_scores)

    class  id  score
0      1반   1    160
1      1반   2    224
2      1반   3    160
3      1반   4    352
4      1반   5    240
..    ...  ..    ...
147    2반  72     72
148    2반  73     48
149    2반  74     52
150    2반  75      0
151    2반  76     52

[152 rows x 3 columns]


### class_scores에서 반별 평균 점수 구하기 (groupby, mean)

In [112]:
print(class_scores.groupby('class').score.mean())

class
1반    103.157895
2반     23.526316
Name: score, dtype: float64


### class_scores에서 반별 기술통계 항목 구하기 (groupby, describe)

In [114]:
print(class_scores.groupby('class').score.describe())

       count        mean         std  min  25%   50%    75%    max
class                                                             
1반      76.0  103.157895  113.720717  0.0  0.0  40.0  192.0  352.0
2반      76.0   23.526316   27.542681  0.0  0.0   0.0   48.0   88.0


# 4. Regular Expression
***

## 4.1 문자열에서 찾기

In [118]:
import pandas as pd
import re

sample = """
750101-1123146 01012345677
900201-2123121 010-2234-7799
950722-2027193 010-2987-7222
000321-3123142 011_1234_9087
001105-4333127 010 4422 1485
0009213876575  01045845987
"""

### 전화번호 추출하기

In [120]:
p = re.compile('\d{3}[-_\s\?\d{4}[-_\s]?\d{4}', re.MULTILINE)
print(p.findall(sample))

['101-1123', '146 0101', '2345677', '201-2123', '010-2234', '799\n9507', '2027193', '010-2987', '222\n0003', '3123142', '011_1234', '087\n0011', '4333127', '010 4422', '485\n0009', '21387657', '01045845']


### 주민등록번호 추출하기

In [123]:
p = re.compile('^\d{6}-?\d{7}', re.MULTILINE)
print(p.findall(sample))

['750101-1123146', '900201-2123121', '950722-2027193', '000321-3123142', '001105-4333127', '0009213876575']


### 여성 주민등록번호 추출하기

In [124]:
p = re.compile('^\d{6}-?[24]\d{6}', re.MULTILINE)
print(p.findall(sample))

['900201-2123121', '950722-2027193', '001105-4333127']


## 4.2 데이터프레임에서 찾기

In [129]:
scientists = pd.read_csv('../data/scientists.csv')
print(scientists)

                   Name        Born        Died  Age          Occupation
0     Rosaline Franklin  1920-07-25  1958-04-16   37             Chemist
1        William Gosset  1876-06-13  1937-10-16   61        Statistician
2  Florence Nightingale  1820-05-12  1910-08-13   90               Nurse
3           Marie Curie  1867-11-07  1934-07-04   66             Chemist
4         Rachel Carson  1907-05-27  1964-04-14   56           Biologist
5             John Snow  1813-03-15  1858-06-16   45           Physician
6           Alan Turing  1912-06-23  1954-06-07   41  Computer Scientist
7          Johann Gauss  1777-04-30  1855-02-23   77       Mathematician


### 5월에 태어난 과학자 추출

In [134]:
print(scientists[scientists['Born'].str.match('\d{4}-05-\d{2}')])

                   Name        Born        Died  Age Occupation
2  Florence Nightingale  1820-05-12  1910-08-13   90      Nurse
4         Rachel Carson  1907-05-27  1964-04-14   56  Biologist


### 성(last name)이 C로 시작하는 과학자 추출

In [136]:
print(scientists[scientists['Name'].str.match('\w+\w[cC]\w+')])

                   Name        Born        Died  Age Occupation
2  Florence Nightingale  1820-05-12  1910-08-13   90      Nurse
4         Rachel Carson  1907-05-27  1964-04-14   56  Biologist
