In [1]:
import numpy as np
import pandas as pd

In [3]:
obj = pd.Series(range(4),index=['d','a','b','c'])
obj

d    0
a    1
b    2
c    3
dtype: int64

In [4]:
obj.sort_index() # index sorting

a    1
b    2
c    3
d    0
dtype: int64

In [15]:
df = pd.DataFrame(np.arange(8).reshape(2,4),
            index=['three','one'],
            columns=['d','a','b','c'])
df

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [16]:
df.sort_index() # 행 인덱스 정렬 (axis=0, default)

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [17]:
df.sort_index(axis=1) # 열 인덱스 정렬 (오름차순)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [18]:
df.sort_index(axis=1, ascending=False)  # 열 인덱스 정렬 (내림차순)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [20]:
obj = pd.Series([-3,6,-2,1])
obj

0   -3
1    6
2   -2
3    1
dtype: int64

In [21]:
obj.sort_values(ascending=False) # 값 기준 내림 차순 정렬

1    6
3    1
2   -2
0   -3
dtype: int64

In [22]:
obj = pd.Series([-3,np.nan,6,np.nan,-2,1])
obj.sort_values() # nan이 가장 마지막에 정렬

0   -3.0
4   -2.0
5    1.0
2    6.0
1    NaN
3    NaN
dtype: float64

In [23]:
obj.sort_values(ascending=False) # 내림차순도 마찬가지로 nan이 마지막에 나옴 (비교대상이 아님)

2    6.0
5    1.0
4   -2.0
0   -3.0
1    NaN
3    NaN
dtype: float64

In [25]:
df = pd.DataFrame({'b':[4,7,-2,2], 'a' :[0,1,0,1]})
df.sort_values(by='b') # b 기준으로 정렬

Unnamed: 0,b,a
2,-2,0
3,2,1
0,4,0
1,7,1


In [26]:
df.sort_values(by='a')

Unnamed: 0,b,a
0,4,0
2,-2,0
1,7,1
3,2,1


In [27]:
df.sort_values(by=['a','b']) # a로 정렬하고, 값이 같으면 b를 기준으로 정렬하라

Unnamed: 0,b,a
2,-2,0
0,4,0
3,2,1
1,7,1


In [31]:
obj = pd.Series([7,-3,7,4,2,0,4])
obj

0    7
1   -3
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [30]:
obj.rank() # 오름차순 정렬에 따른  순위 부과
# 동점의 경우에는 평균 순위 매김

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [32]:
obj.rank(method="first")
# 동점의 경우에는 순서에 따라 순위를 매김  ( 먼저나온게 순위 더 높음)

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [33]:
# 인덱스 중복
obj = pd.Series(range(5), index=['a','a','b','b','c'])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [34]:
obj.index.is_unique
# 함수명 접두어 get~ set ~ is ~
# get~ 은 뭔가를 가져오는  set~ (데이터를 저장하는)
# is~ (True/False)

False

In [35]:
obj['a']

a    0
a    1
dtype: int64

In [36]:
type(obj['a']) #  중복이 되는 데이터는 타입이 Series로 출력

pandas.core.series.Series

In [37]:
type(obj['c']) # 중복이 되지 않는 데이터는 스칼라로 출력

numpy.int64

In [38]:
df = pd.DataFrame(np.random.randn(4,3), index=['a','a','b','b'])
df

Unnamed: 0,0,1,2
a,0.541415,-1.920315,1.091203
a,-0.150297,-0.465562,-0.750606
b,-1.111186,0.624114,1.54683
b,2.073452,-0.745514,1.129099


In [39]:
# b 에 해당하는 index 뽑고 싶다
df.loc['b'] # label로 뽑음

Unnamed: 0,0,1,2
b,-1.111186,0.624114,1.54683
b,2.073452,-0.745514,1.129099


In [None]:
# 수학/통계 메서드

In [41]:
df = pd.DataFrame([[1.5,np.nan],
             [7.0,4.5],
             [np.nan, np.nan],
             [0.7, -1.5]],
            index=['a','b','c','d'],
            columns=['one','two'])
df

Unnamed: 0,one,two
a,1.5,
b,7.0,4.5
c,,
d,0.7,-1.5


In [42]:
df.sum() # nan 제외하고 데이터들에 대해 합계 계산
# 계산 결과가 시리즈로 나옴
df.sum(axis=0)

one    9.2
two    3.0
dtype: float64

In [43]:
df.sum(axis=1)
df.sum(axis=1, skipna=True) # 디폴트

a     1.5
b    11.5
c     0.0
d    -0.8
dtype: float64

In [44]:
df.sum(axis=1, skipna=False) # NAN 제외되지 않음

a     NaN
b    11.5
c     NaN
d    -0.8
dtype: float64

In [45]:
print(df)
df.mean(axis=1)

   one  two
a  1.5  NaN
b  7.0  4.5
c  NaN  NaN
d  0.7 -1.5


a    1.50
b    5.75
c     NaN
d   -0.40
dtype: float64

In [46]:
df.mean(axis=1, skipna=False) #nan을 포함하여 연산

a     NaN
b    5.75
c     NaN
d   -0.40
dtype: float64

In [47]:
df.idxmax() # 최대값에 해당하는 인덱스

one    b
two    b
dtype: object

In [48]:
df.idxmin() # 최소값에 해당하는 인덱스

one    d
two    d
dtype: object

In [49]:
df.cumsum() # 누적합

Unnamed: 0,one,two
a,1.5,
b,8.5,4.5
c,,
d,9.2,3.0


In [50]:
df.describe() # 기술 통계치 : 수치데이터 데이터프레임, 문자데이터

Unnamed: 0,one,two
count,3.0,2.0
mean,3.066667,1.5
std,3.429772,4.242641
min,0.7,-1.5
25%,1.1,0.0
50%,1.5,1.5
75%,4.25,3.0
max,7.0,4.5


In [51]:
obj = pd.Series(['a','a','b','c']*4) # 4번 반복 시리즈 (길이 16)
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [52]:
obj.describe() # 요약 통계  수치 >> 기술 통계   /  문자 >> 요약 통계

count     16
unique     3
top        a
freq       8
dtype: object

In [54]:
!pip install pandas-datareader

Collecting pandas-datareader
  Downloading pandas_datareader-0.9.0-py3-none-any.whl (107 kB)
Installing collected packages: pandas-datareader
Successfully installed pandas-datareader-0.9.0


In [55]:
# 상관관계, 공분산
# 야후 금융 사이트에서 주식가격/시가총액 데이터프레임 제공
import pandas_datareader.data as web

In [56]:
allData = {ticker : web.get_data_yahoo(ticker) for ticker in ['AAPL','IBM','MSFT','GOOG']} # yahoo 파이낸셜에서 이 주식들을 가져오겟다
#ticker : web.get_data_yahoo(ticker)  이게 저장됨 

In [58]:
allData.keys()

dict_keys(['AAPL', 'IBM', 'MSFT', 'GOOG'])

In [59]:
allData

{'AAPL':                   High         Low        Open       Close       Volume  \
 Date                                                                      
 2016-02-16   24.212500   23.652500   23.754999   24.160000  196231600.0   
 2016-02-17   24.552500   24.037500   24.167500   24.530001  179452800.0   
 2016-02-18   24.722500   24.022499   24.709999   24.065001  156084000.0   
 2016-02-19   24.190001   23.950001   24.000000   24.010000  141496800.0   
 2016-02-22   24.225000   23.980000   24.077499   24.219999  137123200.0   
 ...                ...         ...         ...         ...          ...   
 2021-02-05  137.419998  135.860001  137.350006  136.759995   75524000.0   
 2021-02-08  136.960007  134.919998  136.029999  136.910004   71297200.0   
 2021-02-09  137.880005  135.850006  136.619995  136.009995   76774200.0   
 2021-02-10  136.990005  134.399994  136.479996  135.389999   73046600.0   
 2021-02-11  136.389999  133.770004  135.899994  135.130005   64154400.0   
 
  

In [60]:
allData.items() # 회사이름, 주식데이터 형식 # dict_items list

dict_items([('AAPL',                   High         Low        Open       Close       Volume  \
Date                                                                      
2016-02-16   24.212500   23.652500   23.754999   24.160000  196231600.0   
2016-02-17   24.552500   24.037500   24.167500   24.530001  179452800.0   
2016-02-18   24.722500   24.022499   24.709999   24.065001  156084000.0   
2016-02-19   24.190001   23.950001   24.000000   24.010000  141496800.0   
2016-02-22   24.225000   23.980000   24.077499   24.219999  137123200.0   
...                ...         ...         ...         ...          ...   
2021-02-05  137.419998  135.860001  137.350006  136.759995   75524000.0   
2021-02-08  136.960007  134.919998  136.029999  136.910004   71297200.0   
2021-02-09  137.880005  135.850006  136.619995  136.009995   76774200.0   
2021-02-10  136.990005  134.399994  136.479996  135.389999   73046600.0   
2021-02-11  136.389999  133.770004  135.899994  135.130005   64154400.0   

   

In [61]:
price = pd.DataFrame({ticker: data['Adj Close'] for ticker,data in allData.items()} )# Adj Cloas > 종가 자료구조는 딕셔너리 
price
# allData
# ticker : 회사이름 / data : 종가 데이터 만 추출

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-02-16,22.471428,98.425911,46.955994,691.000000
2016-02-17,22.815565,101.120300,48.178375,708.400024
2016-02-18,22.383070,106.212402,47.966984,697.349976
2016-02-19,22.331917,106.717606,47.626919,700.909973
2016-02-22,22.527231,107.270927,48.389759,706.460022
...,...,...,...,...
2021-02-05,136.759995,120.183998,242.199997,2098.000000
2021-02-08,136.910004,121.980003,242.470001,2092.909912
2021-02-09,136.009995,122.099998,243.770004,2083.510010
2021-02-10,135.389999,122.239998,242.820007,2095.379883


In [62]:
volume = pd.DataFrame({ticker: data['Volume'] for ticker,data in allData.items()} )
volume

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-02-16,196231600.0,4061400.0,37291200.0,2520000
2016-02-17,179452800.0,4821000.0,40789000.0,2492600
2016-02-18,156084000.0,9951600.0,27176000.0,1883200
2016-02-19,141496800.0,5125300.0,33559100.0,1589300
2016-02-22,137123200.0,4457400.0,25008300.0,1949800
...,...,...,...,...
2021-02-05,75524000.0,4563600.0,18043900.0,1533900
2021-02-08,71297200.0,5888000.0,22211900.0,1241900
2021-02-09,76774200.0,4717000.0,23565000.0,889900
2021-02-10,73046600.0,4866800.0,22186700.0,1135500


In [63]:
price.describe() #기술통계

Unnamed: 0,AAPL,IBM,MSFT,GOOG
count,1258.0,1258.0,1258.0,1258.0
mean,53.013971,125.573398,111.679675,1116.458318
std,28.80502,9.601952,53.823573,288.025719
min,21.134403,89.788025,44.822552,668.26001
25%,34.591587,120.781502,65.033491,914.542511
50%,43.476748,125.96254,100.79966,1097.269958
75%,61.232233,130.650291,142.246437,1248.649963
max,142.946396,151.120636,244.490005,2098.0


In [64]:
df = pd.DataFrame({"삼성전자":[52200,52300,52900,52000,51700],
              " LG전자":[68200,67800,68800,67500,66300]})
df

Unnamed: 0,삼성전자,LG전자
0,52200,68200
1,52300,67800
2,52900,68800
3,52000,67500
4,51700,66300


In [65]:
# 수익률 계산 함수
df.pct_change() # 

Unnamed: 0,삼성전자,LG전자
0,,
1,0.001916,-0.005865
2,0.011472,0.014749
3,-0.017013,-0.018895
4,-0.005769,-0.017778


In [66]:
df.pct_change() * 100 # 수익률

Unnamed: 0,삼성전자,LG전자
0,,
1,0.191571,-0.58651
2,1.147228,1.474926
3,-1.701323,-1.889535
4,-0.576923,-1.777778


In [67]:
df.pct_change(periods=2)*100 # 기준 이틀 0번과 2번의 비교 수익률

Unnamed: 0,삼성전자,LG전자
0,,
1,,
2,1.340996,0.879765
3,-0.573614,-0.442478
4,-2.268431,-3.633721


In [68]:
price

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-02-16,22.471428,98.425911,46.955994,691.000000
2016-02-17,22.815565,101.120300,48.178375,708.400024
2016-02-18,22.383070,106.212402,47.966984,697.349976
2016-02-19,22.331917,106.717606,47.626919,700.909973
2016-02-22,22.527231,107.270927,48.389759,706.460022
...,...,...,...,...
2021-02-05,136.759995,120.183998,242.199997,2098.000000
2021-02-08,136.910004,121.980003,242.470001,2092.909912
2021-02-09,136.009995,122.099998,243.770004,2083.510010
2021-02-10,135.389999,122.239998,242.820007,2095.379883


In [69]:
path = 'C:/Users/USER/Desktop/dataset/deepL/pandas_data/'

In [70]:
visited = pd.read_csv(path+'survey_visited.csv')
survey = pd.read_csv(path+'survey_survey.csv')

In [71]:
print(visited)
print(survey)

   ident   site       dated
0    619   DR-1  1927-02-08
1    622   DR-1  1927-02-10
2    734   DR-3  1939-01-07
3    735   DR-3  1930-01-12
4    751   DR-3  1930-02-26
5    752   DR-3         NaN
6    837  MSK-4  1932-01-14
7    844   DR-1  1932-03-22
    taken person quant  reading
0     619   dyer   rad     9.82
1     619   dyer   sal     0.13
2     622   dyer   rad     7.80
3     622   dyer   sal     0.09
4     734     pb   rad     8.41
5     734   lake   sal     0.05
6     734     pb  temp   -21.50
7     735     pb   rad     7.22
8     735    NaN   sal     0.06
9     735    NaN  temp   -26.00
10    751     pb   rad     4.35
11    751     pb  temp   -18.50
12    751   lake   sal     0.10
13    752   lake   rad     2.19
14    752   lake   sal     0.09
15    752   lake  temp   -16.00
16    752    roe   sal    41.60
17    837   lake   rad     1.46
18    837   lake   sal     0.21
19    837    roe   sal    22.50
20    844    roe   rad    11.25


In [72]:
vs = visited.merge(survey, left_on='ident',right_on='taken') # ident 와 taken에 대해
vs

Unnamed: 0,ident,site,dated,taken,person,quant,reading
0,619,DR-1,1927-02-08,619,dyer,rad,9.82
1,619,DR-1,1927-02-08,619,dyer,sal,0.13
2,622,DR-1,1927-02-10,622,dyer,rad,7.8
3,622,DR-1,1927-02-10,622,dyer,sal,0.09
4,734,DR-3,1939-01-07,734,pb,rad,8.41
5,734,DR-3,1939-01-07,734,lake,sal,0.05
6,734,DR-3,1939-01-07,734,pb,temp,-21.5
7,735,DR-3,1930-01-12,735,pb,rad,7.22
8,735,DR-3,1930-01-12,735,,sal,0.06
9,735,DR-3,1930-01-12,735,,temp,-26.0


In [73]:
n = pd.Series({'goat':4, 'ameba':np.nan})
n

goat     4.0
ameba    NaN
dtype: float64

In [74]:
type(n)

pandas.core.series.Series

In [75]:
sci = pd.DataFrame({
    'name':['Rosa','Will'],
    'occu':['Scientist','Chemist'],
    'missing':[np.NaN,np.nan]
})
sci

Unnamed: 0,name,occu,missing
0,Rosa,Scientist,
1,Will,Chemist,


In [77]:
gap = pd.read_csv(path+'gapminder.tsv',sep="\t")
gap

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.853030
2,Afghanistan,Asia,1962,31.997,10267083,853.100710
3,Afghanistan,Asia,1967,34.020,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
...,...,...,...,...,...,...
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.449960
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623


In [78]:
gap.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   continent  1704 non-null   object 
 2   year       1704 non-null   int64  
 3   lifeExp    1704 non-null   float64
 4   pop        1704 non-null   int64  
 5   gdpPercap  1704 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB


In [79]:
gap.describe()

Unnamed: 0,year,lifeExp,pop,gdpPercap
count,1704.0,1704.0,1704.0,1704.0
mean,1979.5,59.474439,29601210.0,7215.327081
std,17.26533,12.917107,106157900.0,9857.454543
min,1952.0,23.599,60011.0,241.165876
25%,1965.75,48.198,2793664.0,1202.060309
50%,1979.5,60.7125,7023596.0,3531.846988
75%,1993.25,70.8455,19585220.0,9325.462346
max,2007.0,82.603,1318683000.0,113523.1329


In [80]:
gap.groupby(['year'])['lifeExp'] 

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000015D9089B040>

In [81]:
life_exp = gap.groupby(['year'])['lifeExp'].mean()
life_exp

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [82]:
type(life_exp)

pandas.core.series.Series

In [86]:
life_exp.iloc[0]
life_exp[1952]
life_exp.iloc[-12]

49.05761971830987

In [87]:
life_exp[life_exp.index>2000]
# 2000 ~ 2009 

year
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [88]:
ebola = pd.read_csv(path+'country_timeseries.csv')
ebola

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
0,1/5/2015,289,2776.0,,10030.0,,,,,,1786.0,,2977.0,,,,,
1,1/4/2015,288,2775.0,,9780.0,,,,,,1781.0,,2943.0,,,,,
2,1/3/2015,287,2769.0,8166.0,9722.0,,,,,,1767.0,3496.0,2915.0,,,,,
3,1/2/2015,286,,8157.0,,,,,,,,3496.0,,,,,,
4,12/31/2014,284,2730.0,8115.0,9633.0,,,,,,1739.0,3471.0,2827.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,3/27/2014,5,103.0,8.0,6.0,,,,,,66.0,6.0,5.0,,,,,
118,3/26/2014,4,86.0,,,,,,,,62.0,,,,,,,
119,3/25/2014,3,86.0,,,,,,,,60.0,,,,,,,
120,3/24/2014,2,86.0,,,,,,,,59.0,,,,,,,


In [89]:
ebola.shape # 122행

(122, 18)

In [90]:
ebola.count()

Date                   122
Day                    122
Cases_Guinea            93
Cases_Liberia           83
Cases_SierraLeone       87
Cases_Nigeria           38
Cases_Senegal           25
Cases_UnitedStates      18
Cases_Spain             16
Cases_Mali              12
Deaths_Guinea           92
Deaths_Liberia          81
Deaths_SierraLeone      87
Deaths_Nigeria          38
Deaths_Senegal          22
Deaths_UnitedStates     18
Deaths_Spain            16
Deaths_Mali             12
dtype: int64

In [93]:
ebola.shape[0] - ebola.count() # 브로드 캐스팅

Date                     0
Day                      0
Cases_Guinea            29
Cases_Liberia           39
Cases_SierraLeone       35
Cases_Nigeria           84
Cases_Senegal           97
Cases_UnitedStates     104
Cases_Spain            106
Cases_Mali             110
Deaths_Guinea           30
Deaths_Liberia          41
Deaths_SierraLeone      35
Deaths_Nigeria          84
Deaths_Senegal         100
Deaths_UnitedStates    104
Deaths_Spain           106
Deaths_Mali            110
dtype: int64

In [94]:
numMissing = ebola.shape[0] - ebola.count()
numMissing

Date                     0
Day                      0
Cases_Guinea            29
Cases_Liberia           39
Cases_SierraLeone       35
Cases_Nigeria           84
Cases_Senegal           97
Cases_UnitedStates     104
Cases_Spain            106
Cases_Mali             110
Deaths_Guinea           30
Deaths_Liberia          41
Deaths_SierraLeone      35
Deaths_Nigeria          84
Deaths_Senegal         100
Deaths_UnitedStates    104
Deaths_Spain           106
Deaths_Mali            110
dtype: int64

In [96]:
np.count_nonzero(ebola.isnull()) # eblola.isnull()  대해 zero가 아닌 개수 세는것

1214

In [98]:
np.count_nonzero(ebola['Cases_Guinea'].isnull())
#null이 29개, zero : false, nonzero:true => count_nonzero는 true개수

29

In [99]:
ebola.info().iloc[0:5]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Date                 122 non-null    object 
 1   Day                  122 non-null    int64  
 2   Cases_Guinea         93 non-null     float64
 3   Cases_Liberia        83 non-null     float64
 4   Cases_SierraLeone    87 non-null     float64
 5   Cases_Nigeria        38 non-null     float64
 6   Cases_Senegal        25 non-null     float64
 7   Cases_UnitedStates   18 non-null     float64
 8   Cases_Spain          16 non-null     float64
 9   Cases_Mali           12 non-null     float64
 10  Deaths_Guinea        92 non-null     float64
 11  Deaths_Liberia       81 non-null     float64
 12  Deaths_SierraLeone   87 non-null     float64
 13  Deaths_Nigeria       38 non-null     float64
 14  Deaths_Senegal       22 non-null     float64
 15  Deaths_UnitedStates  18 non-null     flo

AttributeError: 'NoneType' object has no attribute 'iloc'

In [100]:
ebola.fillna(0).iloc[0:5,0:5]

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone
0,1/5/2015,289,2776.0,0.0,10030.0
1,1/4/2015,288,2775.0,0.0,9780.0
2,1/3/2015,287,2769.0,8166.0,9722.0
3,1/2/2015,286,0.0,8157.0,0.0
4,12/31/2014,284,2730.0,8115.0,9633.0


In [101]:
ebola.fillna(method='ffill') 

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
0,1/5/2015,289,2776.0,,10030.0,,,,,,1786.0,,2977.0,,,,,
1,1/4/2015,288,2775.0,,9780.0,,,,,,1781.0,,2943.0,,,,,
2,1/3/2015,287,2769.0,8166.0,9722.0,,,,,,1767.0,3496.0,2915.0,,,,,
3,1/2/2015,286,2769.0,8157.0,9722.0,,,,,,1767.0,3496.0,2915.0,,,,,
4,12/31/2014,284,2730.0,8115.0,9633.0,,,,,,1739.0,3471.0,2827.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,3/27/2014,5,103.0,8.0,6.0,0.0,1.0,1.0,1.0,1.0,66.0,6.0,5.0,0.0,0.0,0.0,1.0,1.0
118,3/26/2014,4,86.0,8.0,6.0,0.0,1.0,1.0,1.0,1.0,62.0,6.0,5.0,0.0,0.0,0.0,1.0,1.0
119,3/25/2014,3,86.0,8.0,6.0,0.0,1.0,1.0,1.0,1.0,60.0,6.0,5.0,0.0,0.0,0.0,1.0,1.0
120,3/24/2014,2,86.0,8.0,6.0,0.0,1.0,1.0,1.0,1.0,59.0,6.0,5.0,0.0,0.0,0.0,1.0,1.0


In [102]:
ebola.fillna(method='bfill')

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
0,1/5/2015,289,2776.0,8166.0,10030.0,20.0,1.0,4.0,1.0,7.0,1786.0,3496.0,2977.0,8.0,0.0,1.0,0.0,6.0
1,1/4/2015,288,2775.0,8166.0,9780.0,20.0,1.0,4.0,1.0,7.0,1781.0,3496.0,2943.0,8.0,0.0,1.0,0.0,6.0
2,1/3/2015,287,2769.0,8166.0,9722.0,20.0,1.0,4.0,1.0,7.0,1767.0,3496.0,2915.0,8.0,0.0,1.0,0.0,6.0
3,1/2/2015,286,2730.0,8157.0,9633.0,20.0,1.0,4.0,1.0,7.0,1739.0,3496.0,2827.0,8.0,0.0,1.0,0.0,6.0
4,12/31/2014,284,2730.0,8115.0,9633.0,20.0,1.0,4.0,1.0,7.0,1739.0,3471.0,2827.0,8.0,0.0,1.0,0.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,3/27/2014,5,103.0,8.0,6.0,,,,,,66.0,6.0,5.0,,,,,
118,3/26/2014,4,86.0,,,,,,,,62.0,,,,,,,
119,3/25/2014,3,86.0,,,,,,,,60.0,,,,,,,
120,3/24/2014,2,86.0,,,,,,,,59.0,,,,,,,


In [103]:
ebola.interpolate() # 사이값 평균으로 넣음 Cases_Guinea 

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
0,1/5/2015,289,2776.0,,10030.0,,,,,,1786.0,,2977.0,,,,,
1,1/4/2015,288,2775.0,,9780.0,,,,,,1781.0,,2943.0,,,,,
2,1/3/2015,287,2769.0,8166.0,9722.0,,,,,,1767.0,3496.0,2915.0,,,,,
3,1/2/2015,286,2749.5,8157.0,9677.5,,,,,,1753.0,3496.0,2871.0,,,,,
4,12/31/2014,284,2730.0,8115.0,9633.0,,,,,,1739.0,3471.0,2827.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,3/27/2014,5,103.0,8.0,6.0,0.0,1.0,1.0,1.0,1.0,66.0,6.0,5.0,0.0,0.0,0.0,1.0,1.0
118,3/26/2014,4,86.0,8.0,6.0,0.0,1.0,1.0,1.0,1.0,62.0,6.0,5.0,0.0,0.0,0.0,1.0,1.0
119,3/25/2014,3,86.0,8.0,6.0,0.0,1.0,1.0,1.0,1.0,60.0,6.0,5.0,0.0,0.0,0.0,1.0,1.0
120,3/24/2014,2,86.0,8.0,6.0,0.0,1.0,1.0,1.0,1.0,59.0,6.0,5.0,0.0,0.0,0.0,1.0,1.0


In [104]:
ebola.shape

(122, 18)

In [105]:
#ebola 데이터에 대해 하나라도 nan이 있다면 제거
ebolaDropna = ebola.dropna()

In [106]:
ebolaDropna.shape

(1, 18)

In [107]:
ebolaDropna

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
19,11/18/2014,241,2047.0,7082.0,6190.0,20.0,1.0,4.0,1.0,6.0,1214.0,2963.0,1267.0,8.0,0.0,1.0,0.0,6.0


In [108]:
# 기니, 라이베리아, 시에라리온 세 국가의 발병지수를 모두 더한 다음
# 새롭게 Cases_multiple 이라는 컬럼을 생성하고 저장하시오
# ebola.info(0)
ebola['Cases_multiple'] = ebola.Cases_Guinea+ ebola.Cases_SierraLeone+ ebola.Cases_Liberia

In [109]:
# 기니 라이베리아 시에라리온 ,multiple column 값만 추출하여
# ebola_subset 이라는 데이터 프레임 만드시오
ebola_subset = ebola.loc[:,['Cases_Guinea','Cases_SierraLeone','Cases_Liberia','Cases_multiple']]
ebola_subset

Unnamed: 0,Cases_Guinea,Cases_SierraLeone,Cases_Liberia,Cases_multiple
0,2776.0,10030.0,,
1,2775.0,9780.0,,
2,2769.0,9722.0,8166.0,20657.0
3,,,8157.0,
4,2730.0,9633.0,8115.0,20478.0
...,...,...,...,...
117,103.0,6.0,8.0,117.0
118,86.0,,,
119,86.0,,,
120,86.0,,,


In [110]:
ebola.Cases_Guinea.sum()

84729.0

In [111]:
ebola.Cases_Guinea.sum()

84729.0