In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

## 5.1.1 Series

In [3]:
ser = Series([4,7,-5,3])
ser

0    4
1    7
2   -5
3    3
dtype: int64

In [4]:
ser.values

array([ 4,  7, -5,  3])

In [5]:
ser.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [6]:
ser2 = Series([4,7,-5,3], index=['d','b','a','c'])
ser2

d    4
b    7
a   -5
c    3
dtype: int64

In [7]:
ser2['a']

-5

In [8]:
ser2[2]

-5

In [9]:
ser2['d'] = 6 

In [10]:
ser2[['c','a','d']]

c    3
a   -5
d    6
dtype: int64

In [11]:
ser2[ ser2 > 0]

d    6
b    7
c    3
dtype: int64

In [12]:
'b' in ser2 

True

In [13]:
ser3 = Series( {'Ohio':35000, 'Texas':71000, 'Oregon':16000, 'Utah':5000})
ser3

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [36]:
state = ['California', 'Ohio','Oregon','Texas']
ser4 = Series({'Ohio':35000, 'Texas':71000, 'Oregon':16000, 'Utah':5000}, index=state)
ser4
# NaN = Not a Number

California      NaN
Ohio          35000
Oregon        16000
Texas         71000
dtype: float64

In [15]:
pd.isnull(ser4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [16]:
pd.notnull(ser4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [17]:
ser4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [79]:
ser4[-ser4.isnull()]

Ohio      35000
Oregon    16000
Texas     71000
dtype: float64

In [19]:
ser3 + ser4

California       NaN
Ohio           70000
Oregon         32000
Texas         142000
Utah             NaN
dtype: float64

In [20]:
ser4.name = 'population'
ser4.index.name = 'state'
ser4

state
California      NaN
Ohio          35000
Oregon        16000
Texas         71000
Name: population, dtype: float64

In [21]:
ser4.index = ['CA', 'OH', 'OR', 'TX']
ser4

CA      NaN
OH    35000
OR    16000
TX    71000
Name: population, dtype: float64

### 5.1.2 DataFrame

In [23]:
data = { 'state' : [ 'Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
       'year' : [2000,2001,2002,2001,2002],
       'pop' : [1.5,1.7,3.6,2.4,2.9]}
frame = DataFrame(data)
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [24]:
DataFrame(data, columns=['year','state','pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [25]:
frame2 = DataFrame(data, columns=['year', 'state','pop','debt'], index= ['one', 'two','three','four','five'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [26]:
frame2['year']

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64

In [27]:
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64

In [28]:
frame2.ix['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [29]:
frame2.ix[2]

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [30]:
type(frame2.year)

pandas.core.series.Series

In [31]:
type(frame2.ix[2])

pandas.core.series.Series

In [32]:
frame2['debt'] = 16.5
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


In [33]:
frame2['eastern'] = frame2.state == 'Ohio'
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,16.5,True
two,2001,Ohio,1.7,16.5,True
three,2002,Ohio,3.6,16.5,True
four,2001,Nevada,2.4,16.5,False
five,2002,Nevada,2.9,16.5,False


In [34]:
del frame2['eastern']
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


In [37]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [38]:
'state' in frame2.columns

True

In [39]:
'two' in frame2.index

True

In [40]:
frame2.T

Unnamed: 0,one,two,three,four,five
year,2000,2001,2002,2001,2002
state,Ohio,Ohio,Ohio,Nevada,Nevada
pop,1.5,1.7,3.6,2.4,2.9
debt,16.5,16.5,16.5,16.5,16.5


In [41]:
frame2.values

array([[2000, 'Ohio', 1.5, 16.5],
       [2001, 'Ohio', 1.7, 16.5],
       [2002, 'Ohio', 3.6, 16.5],
       [2001, 'Nevada', 2.4, 16.5],
       [2002, 'Nevada', 2.9, 16.5]], dtype=object)

### 5.2.1 Reindex

In [42]:
ser2

d    6
b    7
a   -5
c    3
dtype: int64

In [43]:
ser2.reindex(['a','b','c','d','e'])

a    -5
b     7
c     3
d     6
e   NaN
dtype: float64

In [44]:
ser2.reindex(['a','b','c','d','e'], fill_value=0)

a   -5
b    7
c    3
d    6
e    0
dtype: int64

In [45]:
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [46]:
frame.reindex(columns=['state','year','pop','debt'])

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,
1,Ohio,2001,1.7,
2,Ohio,2002,3.6,
3,Nevada,2001,2.4,
4,Nevada,2002,2.9,


In [47]:
frame.reindex(columns=['state','year','pop','debt'], index=[1,2,4,3,0])

Unnamed: 0,state,year,pop,debt
1,Ohio,2001,1.7,
2,Ohio,2002,3.6,
4,Nevada,2002,2.9,
3,Nevada,2001,2.4,
0,Ohio,2000,1.5,


In [48]:
frame.ix[[2,3], ['year','pop']]

Unnamed: 0,year,pop
2,2002,3.6
3,2001,2.4


### DataFrame 연산

In [49]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


In [50]:
frame2.drop('two')

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


In [51]:
frame2.drop('debt',axis=1)

Unnamed: 0,year,state,pop
one,2000,Ohio,1.5
two,2001,Ohio,1.7
three,2002,Ohio,3.6
four,2001,Nevada,2.4
five,2002,Nevada,2.9


In [52]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


In [53]:
frame2.ix[frame2.year > 2001]

Unnamed: 0,year,state,pop,debt
three,2002,Ohio,3.6,16.5
five,2002,Nevada,2.9,16.5


In [54]:
frame2.ix[frame2.year > 2001, 1:]

Unnamed: 0,state,pop,debt
three,Ohio,3.6,16.5
five,Nevada,2.9,16.5


In [55]:
frame2 + frame2

Unnamed: 0,year,state,pop,debt
one,4000,OhioOhio,3.0,33
two,4002,OhioOhio,3.4,33
three,4004,OhioOhio,7.2,33
four,4002,NevadaNevada,4.8,33
five,4004,NevadaNevada,5.8,33


In [56]:
frame3 = DataFrame(np.arange(12.).reshape((4,3)), columns=list('abc'),
                   index=['Utah','Ohio','Texas','Oregon'])
frame3

Unnamed: 0,a,b,c
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [57]:
series = frame3.ix[0]
series

a    0
b    1
c    2
Name: Utah, dtype: float64

In [58]:
frame3 - series

Unnamed: 0,a,b,c
Utah,0,0,0
Ohio,3,3,3
Texas,6,6,6
Oregon,9,9,9


### 함수 적용

In [81]:
frame3

Unnamed: 0,a,b,c
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [59]:
np.mean(frame3)

a    4.5
b    5.5
c    6.5
dtype: float64

In [95]:
np.mean([1,2,3])

2.0

In [60]:
frame3.mean(skipna=False)

a    4.5
b    5.5
c    6.5
dtype: float64

In [61]:
frame3.sum()

a    18
b    22
c    26
dtype: float64

In [62]:
frame3.sum(axis=1)

Utah       3
Ohio      12
Texas     21
Oregon    30
dtype: float64

In [96]:
f = lambda x: x.max() - x.min()
frame3.apply(f)
# p574

a    9
b    9
c    9
dtype: float64

In [64]:
frame3.apply(f, axis=1)

Utah      2
Ohio      2
Texas     2
Oregon    2
dtype: float64

In [65]:
def f(x):
    return Series([x.min(),x.max()], index=['min','max'])

In [66]:
frame3.apply(f)

Unnamed: 0,a,b,c
min,0,1,2
max,9,10,11


In [67]:
def g(x):
    print(x)
    return len(x)

In [68]:
frame3.apply(g)

Utah      0
Ohio      3
Texas     6
Oregon    9
Name: a, dtype: float64
Utah       1
Ohio       4
Texas      7
Oregon    10
Name: b, dtype: float64
Utah       2
Ohio       5
Texas      8
Oregon    11
Name: c, dtype: float64


a    4
b    4
c    4
dtype: int64

In [69]:
divf = lambda x: x / 2
frame3.applymap(divf)

Unnamed: 0,a,b,c
Utah,0.0,0.5,1.0
Ohio,1.5,2.0,2.5
Texas,3.0,3.5,4.0
Oregon,4.5,5.0,5.5


### 정렬

In [70]:
frame3.sort_index()

Unnamed: 0,a,b,c
Ohio,3,4,5
Oregon,9,10,11
Texas,6,7,8
Utah,0,1,2


In [71]:
frame3.sort_index(axis=1, ascending=False)

Unnamed: 0,c,b,a
Utah,2,1,0
Ohio,5,4,3
Texas,8,7,6
Oregon,11,10,9


In [72]:
frame3.c.order(ascending=False)

Oregon    11
Texas      8
Ohio       5
Utah       2
Name: c, dtype: float64

In [73]:
frame3.sort_index(by=['a','b'])

Unnamed: 0,a,b,c
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [74]:
frame3.a.rank()

Utah      1
Ohio      2
Texas     3
Oregon    4
Name: a, dtype: float64

In [75]:
frame3.rank(axis=1)

Unnamed: 0,a,b,c
Utah,1,2,3
Ohio,1,2,3
Texas,1,2,3
Oregon,1,2,3


In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
data = { 'state' : [ 'Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
       'year' : [2000,2001,2002,2001,2002],
       'pop' : [1.5,1.7,3.6,2.4,2.9]}
frame2 = DataFrame(data, columns=['year', 'state','pop','debt'], index= ['one', 'two','three','four','five'])
frame2['debt'] = 16.5
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


In [3]:
# 누락된 데이터 처리
frame2.ix['one',['debt','pop']] = np.nan
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,,
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


In [4]:
frame2.dropna()

Unnamed: 0,year,state,pop,debt
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


In [5]:
frame2.fillna(0)

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,0.0,0.0
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


In [6]:
# 계층적색인
frame4 = DataFrame(np.arange(12).reshape((4,3)),
                   index=[['a','a','b','b'], [1,2,1,2]],
                   columns=[['Ohio','Ohio','Colorado'], ['G','R','G']])
frame4

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,G,R,G
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [7]:
frame4.index.names=['key1','key2']
frame4.columns.names=['state','color']
frame4

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,G,R,G
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [23]:
frame4.ix['a'].ix[2].Ohio.R

4

In [28]:
frame4.Ohio.R.ix['a'].ix[2]

4

In [8]:
frame4.ix['a']

state,Ohio,Ohio,Colorado
color,G,R,G
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,0,1,2
2,3,4,5


In [9]:
frame4.ix['a'].ix[1]

state     color
Ohio      G        0
          R        1
Colorado  G        2
Name: 1, dtype: int64

In [10]:
frame4.Ohio.G

key1  key2
a     1       0
      2       3
b     1       6
      2       9
Name: G, dtype: int64

In [11]:
frame4.sortlevel(1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,G,R,G
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


## 상관분석
https://ko.wikipedia.org/wiki/상관분석
https://en.wikipedia.org/wiki/Correlation_and_dependence
## Excel 방식
http://blog.naver.com/PostView.nhn?blogId=foolhun3&logNo=220305763272

In [13]:
#mobile mau

In [4]:
mail_mmau = pd.read_csv('mail_mmau.csv', index_col='분 류', thousands=',')
mail_mmau

Unnamed: 0_level_0,1월-12,2월-12,3월-12,4월-12,5월-12,6월-12,7월-12,8월-12,9월-12,10월-12,...,10월-14,11월-14,12월-14,1월-15,2월-15,3월-15,4월-15,5월-15,6월-15,7월-15
분 류,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
naver.com,1449426,1318779,1525272,2156043,1935512,2204992,2769000,2753605,2777274,2805178,...,2836273,2691126,2528648,2702249,2512878,2722772,3077679,3046904,3021911,2772323
daum.net,2123572,2098304,1518737,1862319,1872582,1936302,2301563,2135555,1930862,2136730,...,1547458,1624158,1674045,1682265,1478710,1826173,1596559,1599962,1490001,1419051
nate.com,802666,813850,709876,590882,494325,660309,588872,595546,633358,687766,...,369452,352956,440285,324040,299486,317636,344752,241285,254219,311185
google.com,235290,154137,146847,155477,112506,103057,138339,85904,279269,322605,...,302822,384940,382337,357034,398593,530017,340017,316486,354260,300681
Communication-Email,3910870,3663326,3325002,4070259,3787863,4197746,5009222,4807187,4539054,4863945,...,4233829,4212221,4278095,4219187,3939242,4486005,4599696,4426058,4317891,4042735


In [9]:
mmau= mail_mmau.T
mmau

분 류,naver.com,daum.net,nate.com,google.com,Communication-Email
1월-12,1449426,2123572,802666,235290,3910870
2월-12,1318779,2098304,813850,154137,3663326
3월-12,1525272,1518737,709876,146847,3325002
4월-12,2156043,1862319,590882,155477,4070259
5월-12,1935512,1872582,494325,112506,3787863
6월-12,2204992,1936302,660309,103057,4197746
7월-12,2769000,2301563,588872,138339,5009222
8월-12,2753605,2135555,595546,85904,4807187
9월-12,2777274,1930862,633358,279269,4539054
10월-12,2805178,2136730,687766,322605,4863945


In [31]:
mmau.pct_change()

분 류,naver.com,daum.net,nate.com,google.com,Communication-Email
1월-12,,,,,
2월-12,-0.090137,-0.011899,0.013934,-0.344906,-0.063296
3월-12,0.156579,-0.276207,-0.127756,-0.047296,-0.092354
4월-12,0.413547,0.226229,-0.167626,0.058769,0.224137
5월-12,-0.102285,0.005511,-0.163412,-0.276382,-0.06938
6월-12,0.139229,0.034028,0.335779,-0.083987,0.10821
7월-12,0.255787,0.188638,-0.108187,0.342354,0.193312
8월-12,-0.00556,-0.072128,0.011334,-0.379033,-0.040333
9월-12,0.008596,-0.09585,0.063491,2.250943,-0.055778
10월-12,0.010047,0.10662,0.085904,0.155177,0.071577


In [32]:
mmau['daum.net'].cov(mmau['Communication-Email'])

164011059464.21039

In [33]:
mmau['daum.net'].corr(mmau['Communication-Email'])

0.79280894904377797

In [34]:
mmau.corr()

분 류,naver.com,daum.net,nate.com,google.com,Communication-Email
분 류,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
naver.com,1.0,0.358375,-0.381901,0.302069,0.836771
daum.net,0.358375,1.0,0.483446,-0.290624,0.792809
nate.com,-0.381901,0.483446,1.0,-0.54887,0.061057
google.com,0.302069,-0.290624,-0.54887,1.0,0.059398
Communication-Email,0.836771,0.792809,0.061057,0.059398,1.0


In [35]:
#pc mau

In [6]:
mail_pcmau = pd.read_csv('mail_pcmau.csv', index_col='분 류', thousands=',')
mail_pcmau

Unnamed: 0_level_0,9월-02,10월-02,11월-02,12월-02,1월-03,2월-03,3월-03,4월-03,5월-03,6월-03,...,10월-14,11월-14,12월-14,1월-15,2월-15,3월-15,4월-15,5월-15,6월-15,7월-15
분 류,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
naver.com,1939599.0,1927483.0,2324534.0,2477848.0,2576350.0,2885544.0,3091230.0,3381671.0,3299419.0,3894331.0,...,17130578,16857636,16798943,16600515,15483087,16903873,16550268,16710362,16924774,16636861
daum.net,18169858.0,17430340.0,18684232.0,19103068.0,19090580.0,19338630.0,19854052.0,19252372.0,19254174.0,20410178.0,...,13093149,12799077,12749609,12956563,11764914,12903009,12222087,12294511,12025579,12015783
nate.com,556665.0,962698.0,1864338.0,3556092.0,5205320.0,5339015.0,5038476.0,4605165.0,4510060.0,4878930.0,...,5740949,5422418,5294178,5331004,4853505,5067788,4960576,4809377,4956443,4861113
google.com,,,,,,,,,,,...,1377576,1473539,1434281,1544514,1390391,1781983,1645368,1761574,1925899,1938953
communication-mail,20712421.0,20044462.0,20998532.0,21393643.0,21419554.0,21347029.0,21992107.0,21800682.0,21889912.0,22951745.0,...,24359396,24150259,23913457,23865345,22330964,23840936,23416619,23679246,23610765,23304235


In [7]:
pcmau = mail_pcmau.T
pcmau

분 류,naver.com,daum.net,nate.com,google.com,communication-mail
9월-02,1939599,18169858,556665,,20712421
10월-02,1927483,17430340,962698,,20044462
11월-02,2324534,18684232,1864338,,20998532
12월-02,2477848,19103068,3556092,,21393643
1월-03,2576350,19090580,5205320,,21419554
2월-03,2885544,19338630,5339015,,21347029
3월-03,3091230,19854052,5038476,,21992107
4월-03,3381671,19252372,4605165,,21800682
5월-03,3299419,19254174,4510060,,21889912
6월-03,3894331,20410178,4878930,,22951745


# 2차과제
## 1. 각 메일 서비스별 mobile 과 pc 의 mau 상관관계를 분석(2012년 1월~)
http://wiki.daumkakao.com/pages/viewpage.action?pageId=333764026
## 2. 선형회귀 분석을 이용해 서비스별 mobile과 pc의 2015년 8월 mau 예측
10/8일 학습내용 참고: http://wiki.daumkakao.com/download/attachments/333755176/10-8.py?version=1&modificationDate=1444299050000&api=v2

목요일 까지

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

mail_pcmau = pd.read_csv('mail_pcmau.csv', index_col='분 류', thousands=',')
mail_mmau = pd.read_csv('mail_mmau.csv', index_col='분 류', thousands=',')

mmau = mail_mmau.T
pcmau = mail_pcmau.T

pd.concat({'mobile': mmau, 'PC': pcmau}, axis=1, join='inner').corr().PC.ix['mobile']

Unnamed: 0_level_0,Unnamed: 1_level_0,PC,PC,PC,PC,PC,mobile,mobile,mobile,mobile,mobile
Unnamed: 0_level_1,분 류,naver.com,daum.net,nate.com,google.com,communication-mail,naver.com,daum.net,nate.com,google.com,Communication-Email
Unnamed: 0_level_2,분 류,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
PC,naver.com,1.0,0.439784,0.333719,0.040172,0.577233,0.065337,0.157944,0.311487,-0.386705,0.113592
PC,daum.net,0.439784,1.0,0.897975,0.609559,0.973625,-0.22061,0.66591,0.813449,-0.662553,0.240541
PC,nate.com,0.333719,0.897975,1.0,0.687268,0.9011,-0.553288,0.43717,0.813443,-0.705153,-0.107334
PC,google.com,0.040172,0.609559,0.687268,1.0,0.597074,-0.375898,0.344896,0.465158,-0.397874,-0.021826
PC,communication-mail,0.577233,0.973625,0.9011,0.597074,1.0,-0.262742,0.568185,0.795905,-0.702522,0.159496
mobile,naver.com,0.065337,-0.22061,-0.553288,-0.375898,-0.262742,1.0,0.358375,-0.381901,0.302069,0.836771
mobile,daum.net,0.157944,0.66591,0.43717,0.344896,0.568185,0.358375,1.0,0.483446,-0.290624,0.792809
mobile,nate.com,0.311487,0.813449,0.813443,0.465158,0.795905,-0.381901,0.483446,1.0,-0.54887,0.061057
mobile,google.com,-0.386705,-0.662553,-0.705153,-0.397874,-0.702522,0.302069,-0.290624,-0.54887,1.0,0.059398
mobile,Communication-Email,0.113592,0.240541,-0.107334,-0.021826,0.159496,0.836771,0.792809,0.061057,0.059398,1.0


In [100]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

mail_pcmau = pd.read_csv('mail_pcmau.csv', index_col='분 류', thousands=',')
mail_mmau = pd.read_csv('mail_mmau.csv', index_col='분 류', thousands=',')

mmau = mail_mmau.T
pcmau = mail_pcmau.T

mmau


분 류,naver.com,daum.net,nate.com,google.com,Communication-Email
1월-12,1449426,2123572,802666,235290,3910870
2월-12,1318779,2098304,813850,154137,3663326
3월-12,1525272,1518737,709876,146847,3325002
4월-12,2156043,1862319,590882,155477,4070259
5월-12,1935512,1872582,494325,112506,3787863
6월-12,2204992,1936302,660309,103057,4197746
7월-12,2769000,2301563,588872,138339,5009222
8월-12,2753605,2135555,595546,85904,4807187
9월-12,2777274,1930862,633358,279269,4539054
10월-12,2805178,2136730,687766,322605,4863945


In [265]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()

mail_pcmau = pd.read_csv('mail_pcmau.csv', index_col='분 류', thousands=',')
mail_mmau = pd.read_csv('mail_mmau.csv', index_col='분 류', thousands=',')

data = pd.concat({'mobile': mail_mmau, 'PC': mail_pcmau}, axis=0, join='inner')

data

Unnamed: 0_level_0,Unnamed: 1_level_0,1월-12,2월-12,3월-12,4월-12,5월-12,6월-12,7월-12,8월-12,9월-12,10월-12,...,10월-14,11월-14,12월-14,1월-15,2월-15,3월-15,4월-15,5월-15,6월-15,7월-15
Unnamed: 0_level_1,분 류,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
PC,naver.com,17121839,16950102,17123838,17756425,17845347,17138097,17276779,17599890,16685681,16754910,...,17130578,16857636,16798943,16600515,15483087,16903873,16550268,16710362,16924774,16636861
PC,daum.net,16228608,15748261,15950497,15967961,16488893,15572506,16291615,16564825,15481871,16125494,...,13093149,12799077,12749609,12956563,11764914,12903009,12222087,12294511,12025579,12015783
PC,nate.com,12271546,11773522,11649994,11917413,11838747,10768330,9908393,9222417,8553294,8685570,...,5740949,5422418,5294178,5331004,4853505,5067788,4960576,4809377,4956443,4861113
PC,google.com,2031192,2023086,1891066,1989751,1953428,1921062,2065869,1878269,1830855,1851246,...,1377576,1473539,1434281,1544514,1390391,1781983,1645368,1761574,1925899,1938953
PC,communication-mail,26844061,26324191,26440470,27000678,27110680,26197903,26994676,26958452,26106865,26055860,...,24359396,24150259,23913457,23865345,22330964,23840936,23416619,23679246,23610765,23304235
mobile,naver.com,1449426,1318779,1525272,2156043,1935512,2204992,2769000,2753605,2777274,2805178,...,2836273,2691126,2528648,2702249,2512878,2722772,3077679,3046904,3021911,2772323
mobile,daum.net,2123572,2098304,1518737,1862319,1872582,1936302,2301563,2135555,1930862,2136730,...,1547458,1624158,1674045,1682265,1478710,1826173,1596559,1599962,1490001,1419051
mobile,nate.com,802666,813850,709876,590882,494325,660309,588872,595546,633358,687766,...,369452,352956,440285,324040,299486,317636,344752,241285,254219,311185
mobile,google.com,235290,154137,146847,155477,112506,103057,138339,85904,279269,322605,...,302822,384940,382337,357034,398593,530017,340017,316486,354260,300681
mobile,Communication-Email,3910870,3663326,3325002,4070259,3787863,4197746,5009222,4807187,4539054,4863945,...,4233829,4212221,4278095,4219187,3939242,4486005,4599696,4426058,4317891,4042735


In [267]:
#y = mmau['naver.com'].values
y = data['1월-12']
print(y)

#print("y:", y)
print(y.shape)

#X = mail_pcmau[:,1:]
X = data
#X

print(X.shape)
#mail_pcmau


#X
#data

#y = pd.DataFrame(data, columns=['daum.net']).values

#y = pd.DataFrame(data, columns=['daum.net', 'nate.com', 'google.com', 'Communication-Email']).values
#y

model = lr.fit(X, y)

print("model.coef_:", model.coef_)
model.predict(X)


        분 류                
PC      naver.com              17121839
        daum.net               16228608
        nate.com               12271546
        google.com              2031192
        communication-mail     26844061
mobile  naver.com               1449426
        daum.net                2123572
        nate.com                 802666
        google.com               235290
        Communication-Email     3910870
Name: 1월-12, dtype: int64
(10,)
(10, 43)
model.coef_: [  3.12570343e-01   2.76242319e-01   1.31119430e-01   1.36104476e-01
   7.43133506e-02   1.21563150e-01   7.60535827e-02  -6.43575460e-03
  -7.55555608e-03   3.08685047e-02  -9.01901647e-03  -3.71564142e-02
   3.84995281e-02  -2.85248815e-03   5.27330096e-02   4.50303085e-05
  -7.97518121e-02  -3.24205908e-03   5.72330747e-02   5.95520511e-03
  -4.95378868e-02  -1.16471651e-01   1.34740093e-02  -5.23294990e-02
  -3.38716175e-02  -1.46491225e-02  -3.27346377e-02   4.40859935e-02
  -3.34056392e-02  -4.39059949e-02 

array([ 17121839.00000014,  16228608.00000009,  12271545.99999999,
         2031191.99999991,  26844061.00000029,   1449425.99999993,
         2123571.99999991,    802665.99999989,    235289.99999989,
         3910869.99999995])

In [268]:
print("shape:", model.predict(X).shape)

# 모델의 정확도 평가. 1 이 perfect
print("모델의 정확도 평가: ", model.score(X,y))

# 모델의 정확도 평가. 분산값. 클수록 정확도가 나쁜 모델
print("모델의 정확도 평가. 분산값. 클수록 정확도가 나쁜 모델: ", (np.mean((model.predict(X) - y) ** 2)))

shape: (10,)
모델의 정확도 평가:  1.0
모델의 정확도 평가. 분산값. 클수록 정확도가 나쁜 모델:  1.6058314973e-14
