<div style="font-size:18pt; padding-top:20px; text-align:center">СЕМИНАР 6. <b>Корреляция и </b> <span style="font-weight:bold; color:green">NumPy/SciPy</span></div><hr>
<div style="text-align:right;">Папулин С.Ю. <span style="font-style: italic;font-weight: bold;">(papulin_hse@mail.ru)</span></div>

<a name="0"></a>
<div><span style="font-size:14pt; font-weight:bold">Содержание</span>
    <ol>
        <li><a href="#1">Корреляция Пирсона</a></li>
        <li><a href="#2">Примеры</a></li>
        <li><a href="#3">Источники</a></li>
    </ol>
</div>

<p><b>Подключение библиотек</b></p>

In [None]:
import pandas as pnd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
%matplotlib inline

<a name="1"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:14pt; font-weight:bold">1. Корреляция Пирсона</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">К содержанию</a></div>
    </div>
</div>

Коэффициент корреляции Пирсона характеризует степень линейной зависимости между двумя переменными:

$$r=\frac{cov(x,y)}{\sigma_x\sigma_y}=\frac{\sum_{k=1}^{N}(x_k - \mu_x)(y_k - \mu_y)}{\sqrt{\sum_{k=1}^{N}(x_k - \mu_x)^2\sum_{k=1}^{N}(y_k - \mu_y)^2}}$$

$$-1 \le r \le 1$$

Матричная запись

$$R_{i,j}=\frac{\sum_{k=1}^{N}(X_{k,i} - \mu_{x_i})(X_{k,j} - \mu_{x_j})}{\sqrt{\sum_{k=1}^{N}(X_{k,i} - \mu_{x_i})^2\sum_{k=1}^{N}(X_{k,j} - \mu_{x_j})^2}}$$

<a name="2"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:14pt; font-weight:bold">2. Примеры</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">К содержанию</a></div>
    </div>
</div>

<p>Загрузка данных</p>

In [None]:
# Load the oil price series from a ';'-separated CSV.
# The first line of the file is skipped and only CSV columns 2 and 7 are kept,
# renamed to Date / Price; Date becomes the index and is parsed as dates.
dfBrend = pnd.read_csv(
    "BZ_010201_160207.csv",
    sep=";",
    header=None,
    skiprows=1,
    usecols=[2, 7],
    names=["Date", "Price"],
    index_col=0,
    parse_dates=True,
)
# Transposed view: dates run across the columns.
dfBrend.T

In [None]:
# Load the GDP series from a ';'-separated, ISO-8859-1 encoded CSV.
# The first two CSV columns are kept and named Date / GDP; Date becomes the
# index and is parsed as dates.
dfGDP = pnd.read_csv(
    "GDP.csv",
    sep=";",
    encoding="iso-8859-1",
    header=None,
    usecols=[0, 1],
    names=["Date", "GDP"],
    index_col=0,
    parse_dates=True,
)
# Reduce each date to its year so the index is a plain integer year.
dfGDP.index = dfGDP.index.year
# Sort by year. Reassigning (rather than inplace=True) keeps the step chainable,
# and ascending order is already the default of sort_index.
dfGDP = dfGDP.sort_index()
# Transposed view: years run across the columns.
dfGDP.T

In [None]:
# Load the USD/RUB series from a ';'-separated CSV.
# The first line of the file is skipped and only CSV columns 2 and 7 are kept,
# renamed to Date / USDRUB; Date becomes the index and is parsed as dates.
dfUR = pnd.read_csv(
    "USDRUB_010201_160207.csv",
    sep=";",
    header=None,
    skiprows=1,
    usecols=[2, 7],
    names=["Date", "USDRUB"],
    index_col=0,
    parse_dates=True,
)
# Transposed view: dates run across the columns.
dfUR.T

In [None]:
# Load the EUR/RUB series from a ';'-separated CSV.
# The first line of the file is skipped and only CSV columns 2 and 7 are kept,
# renamed to Date / EURRUB; Date becomes the index and is parsed as dates.
dfER = pnd.read_csv(
    "EURRUB_010201_160208.csv",
    sep=";",
    header=None,
    skiprows=1,
    usecols=[2, 7],
    names=["Date", "EURRUB"],
    index_col=0,
    parse_dates=True,
)
# Transposed view: dates run across the columns.
dfER.T

In [None]:
# (title, frame) for every panel, listed in subplot order on the 2x2 grid.
loaded_panels = [
    ("Brend Oil Price by month", dfBrend),
    ("Gross Domestic Product by year", dfGDP),
    ("USDRUB by month", dfUR),
    ("EURRUB by month", dfER),
]

plt.figure(1, figsize=[14,8])

# Each frame is drawn against its own index on its own subplot.
for position, (panel_title, frame) in enumerate(loaded_panels, start=1):
    plt.subplot(2, 2, position)
    plt.title(panel_title)
    plt.plot(frame.index, frame)
    plt.grid(True)

plt.show()

<p>Вывод данных по году</p>

In [None]:
# Partial-string indexing on the date index: every row dated in 2001, all columns.
dfBrend.loc["2001", :]

<p>Среднее значение за год</p>

In [None]:
# Select the rows dated in 2008, then average each column over them.
prices_2008 = dfBrend.loc['2008']
prices_2008.mean()

<p>Средние значения с 2001 по 2016</p>

In [None]:
# Group the rows by the calendar year of their index and average each group.
brent_year_key = dfBrend.index.year
dfBrend_Year = dfBrend.groupby(brent_year_key).mean()
# Transposed view: years run across the columns.
dfBrend_Year.T

In [None]:
# Group the rows by the calendar year of their index and average each group.
usdrub_year_key = dfUR.index.year
dfUR_Year = dfUR.groupby(usdrub_year_key).mean()
# Transposed view: years run across the columns.
dfUR_Year.T

In [None]:
# Group the rows by the calendar year of their index and average each group.
eurrub_year_key = dfER.index.year
dfER_Year = dfER.groupby(eurrub_year_key).mean()
# Transposed view: years run across the columns.
dfER_Year.T

In [None]:
# (title, frame) for every panel, listed in subplot order on the 2x2 grid.
yearly_panels = [
    ("Brend Oil Price by year", dfBrend_Year),
    ("Gross Domestic Product by year", dfGDP),
    ("USDRUB by year", dfUR_Year),
    ("EURRUB by year", dfER_Year),
]

plt.figure(1, figsize=[14,8])

# Each frame is drawn against its own index, with a marker on every point.
for position, (panel_title, frame) in enumerate(yearly_panels, start=1):
    plt.subplot(2, 2, position)
    plt.title(panel_title)
    plt.plot(frame.index, frame, "o-")
    plt.grid(True)

plt.show()

<p>Выбор рассматриваемого диапазона</p>

In [None]:
# First and last year of the range under study, as plain integers.
st, end = (pnd.Timestamp(year_label).year for year_label in ("2002", "2012"))

In [None]:
# (output column, source frame, source column) for each series in the combined table.
series_sources = (
    ("Brend", dfBrend_Year, "Price"),
    ("GDP", dfGDP, "GDP"),
    ("USDRUB", dfUR_Year, "USDRUB"),
    ("EURRUB", dfER_Year, "EURRUB"),
)

# Take the st..end label slice of every source column.
d = {
    column: frame.loc[st:end, source_column]
    for column, frame, source_column in series_sources
}

# One table with a column per series, aligned on the shared index.
dfAll = pnd.DataFrame(d)
dfAll

<p>Корреляция</p>

In [None]:
# pandas: pairwise correlation matrix of the columns of dfAll.
# Accepted `method` values: "pearson", "kendall", "spearman".
corr_method = "pearson"
dfAll.corr(method=corr_method)

In [None]:
# NumPy: corrcoef treats rows as variables by default, so instead of
# transposing the frame we tell it that the variables are the columns.
np.corrcoef(dfAll, rowvar=False)

In [None]:
# SciPy: pearsonr yields the correlation coefficient together with its p-value.
gdp_vs_brend = stats.pearsonr(dfAll["GDP"], dfAll["Brend"])
pearson_coef, pvalue = gdp_vs_brend
pearson_coef

In [None]:
# (title, x, y, line format) for the three side-by-side panels:
# the two series over the index, then one series against the other.
brend_gdp_panels = [
    ("Brend Oil Price by year", dfAll.index, dfAll.Brend, "o-"),
    ("Gross Domestic Product by year", dfAll.index, dfAll.GDP, "o-"),
    ("Brend vs GDP", dfAll.Brend, dfAll.GDP, "o"),
]

plt.figure(1, figsize=[15,4])

for position, (panel_title, xs, ys, fmt) in enumerate(brend_gdp_panels, start=1):
    plt.subplot(1, 3, position)
    plt.title(panel_title)
    plt.plot(xs, ys, fmt)
    plt.grid(True)

plt.show()

In [None]:
# (title, x, y, line format) for the three side-by-side panels:
# the two series over the index, then one series against the other.
brend_eurrub_panels = [
    ("Brend Oil Price by year", dfAll.index, dfAll.Brend, "o-"),
    ("EURRUB by year", dfAll.index, dfAll.EURRUB, "o-"),
    ("Brend vs EURRUB", dfAll.Brend, dfAll.EURRUB, "o"),
]

plt.figure(1, figsize=[15,4])

for position, (panel_title, xs, ys, fmt) in enumerate(brend_eurrub_panels, start=1):
    plt.subplot(1, 3, position)
    plt.title(panel_title)
    plt.plot(xs, ys, fmt)
    plt.grid(True)

plt.show()

In [None]:
# (title, x, y, line format) for the three side-by-side panels:
# the two series over the index, then one series against the other.
brend_usdrub_panels = [
    ("Brend Oil Price by year", dfAll.index, dfAll.Brend, "o-"),
    ("USDRUB by year", dfAll.index, dfAll.USDRUB, "o-"),
    ("Brend vs USDRUB", dfAll.Brend, dfAll.USDRUB, "o"),
]

plt.figure(1, figsize=[15,4])

for position, (panel_title, xs, ys, fmt) in enumerate(brend_usdrub_panels, start=1):
    plt.subplot(1, 3, position)
    plt.title(panel_title)
    plt.plot(xs, ys, fmt)
    plt.grid(True)

plt.show()

In [None]:
# (title, x, y, line format) for the three side-by-side panels:
# the two series over the index, then one series against the other.
gdp_usdrub_panels = [
    ("Gross Domestic Product by year", dfAll.index, dfAll.GDP, "o-"),
    ("USDRUB by year", dfAll.index, dfAll.USDRUB, "o-"),
    ("GDP vs USDRUB", dfAll.GDP, dfAll.USDRUB, "o"),
]

plt.figure(1, figsize=[15,4])

for position, (panel_title, xs, ys, fmt) in enumerate(gdp_usdrub_panels, start=1):
    plt.subplot(1, 3, position)
    plt.title(panel_title)
    plt.plot(xs, ys, fmt)
    plt.grid(True)

# The last subplot is still the current axes here, so this rotates only
# the x tick labels of the "GDP vs USDRUB" panel.
plt.xticks(rotation='vertical')

plt.show()

In [None]:
# Draw on the current axes through its object interface.
ax = plt.gca()
ax.plot(dfGDP.index, dfGDP, "o-")
ax.set_title("GDP(Year) in Russia", size=18)
ax.set_xlabel("Year", size=14)
ax.set_ylabel("GDP, BN RUB", size=14)
ax.grid(True)
# One tick per index value, with the labels turned vertical.
plt.xticks(dfGDP.index, rotation='vertical')
plt.show()

<a name="3"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:14pt; font-weight:bold">3. Источники</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">К содержанию</a></div>
    </div>
</div>

<a href="http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.corr.html">pandas.DataFrame.corr</a><br>
<a href="http://docs.scipy.org/doc/numpy-1.10.1/reference/generated/numpy.corrcoef.html">numpy.corrcoef</a><br>
<a href="http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pearsonr.html">scipy.stats.pearsonr</a><br>

<a href="https://msdn.microsoft.com/ru-ru/library/azure/dn905819.aspx">Вычисления линейной корреляции</a><br>