 #### ДЗ
1. Разделить датасет на трейн и тест в отношении 50:50, 70:30 и 80:20 (с перемешиванием)
2. Обучать наши модели на трейне. Предсказывать и замерять метрику R^2 и на трейне, и на тесте
3. Проверить следующие модели, для каждого разделения:
    а) sales ~ log_tv + radio
    б) sales ~ TV + radio
    в) sales ~ TV + radio + newspaper

In [144]:
# Common imports
import pandas as pd
import numpy as np
import statsmodels.api as sm
#import statsmodels.formula.api as smf
import random
from sklearn.linear_model import LinearRegression
import math
from sklearn.preprocessing import StandardScaler, normalize, MinMaxScaler


import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
%matplotlib inline

# Notebook-wide matplotlib defaults: axis/tick label font sizes and the
# default figure size, applied in a single update call.
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'figure.figsize': (10, 5),
})

In [161]:
# Load the Advertising dataset straight from its URL. `usecols` skips CSV
# column 0 and keeps the four data columns (TV, radio, newspaper, sales).
advertising_url = 'http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv'
adv_df = pd.read_csv(advertising_url, usecols=[1, 2, 3, 4])
adv_df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


#### 1. У нас нет NaN-значений, и все столбцы имеют тип float64.  

In [162]:
# Print per-column non-null counts and dtypes to check for missing values.
adv_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
TV           200 non-null float64
radio        200 non-null float64
newspaper    200 non-null float64
sales        200 non-null float64
dtypes: float64(4)
memory usage: 6.3 KB


In [163]:
# Add the base-2 logarithm of the TV budget as an extra feature (used by the
# `sales ~ log_tv + radio` model). The vectorised np.log2 avoids a per-row
# Python call and the rounding error of math.log(x, 2), which is evaluated as
# log(x) / log(2).
# NOTE: assumes every TV value is strictly positive -- np.log2 returns
# -inf/NaN (with a RuntimeWarning) for zero or negative input.
adv_df['log_tv'] = np.log2(adv_df['TV'])

In [164]:
# Confirm that the new log_tv column is present.
adv_df.head()

Unnamed: 0,TV,radio,newspaper,sales,log_tv
0,230.1,37.8,69.2,22.1,7.846117
1,44.5,39.3,45.1,10.4,5.475733
2,17.2,45.9,69.3,9.3,4.104337
3,151.5,41.3,58.5,18.5,7.243174
4,180.8,10.8,58.4,12.9,7.498251


#### 2. Стандартизируем данные (StandardScaler: нулевое среднее, единичная дисперсия)

In [165]:
# Standardise every column (the features and the `sales` target alike).
# fit_transform returns a plain NumPy array, so the column labels are lost
# at this point and have to be restored afterwards.
# NOTE(review): the scaler is fitted on the full dataset before any
# train/test split. For LinearRegression with an intercept, R^2 is unchanged
# by per-column affine rescaling, so the scores reported later are not
# affected; for a scale-sensitive model this would leak test-set statistics,
# and the scaler should then be fitted on the training part only.
scaler = StandardScaler()
adv_df = scaler.fit_transform(adv_df)

In [166]:
# Wrap the scaled array back into a DataFrame, restoring the column labels in
# the order the columns had before scaling.
column_names = ['TV', 'radio', 'newspaper', 'sales', 'log_tv']
adv_df = pd.DataFrame(adv_df, columns=column_names).astype('float64')

In [167]:
# Inspect the standardised values.
adv_df.head()

Unnamed: 0,TV,radio,newspaper,sales,log_tv
0,0.969852,0.981522,1.778945,1.552053,0.764261
1,-1.197376,1.082808,0.669579,-0.696046,-0.874044
2,-1.516155,1.528463,1.783549,-0.907406,-1.821893
3,0.05205,1.217855,1.286405,0.86033,0.347533
4,0.394182,-0.841614,1.281802,-0.215683,0.523831


__________________
__________________
## Разделим датасет на трейн и тест в отношении 50:50

In [172]:
from sklearn.model_selection import train_test_split

# 50:50 shuffled split; the fixed random_state keeps it reproducible.
adv_train, adv_test = train_test_split(
    adv_df,
    test_size=0.5,
    shuffle=True,
    random_state=42,
)

# Report how many rows ended up in each part.
print("Total transactions in train dataset: ", adv_train.shape[0])
print("Total transactions in test dataset: ", adv_test.shape[0])

Total transactions in train dataset:  100
Total transactions in test dataset:  100


In [173]:
# Peek at the shuffled training rows (the original row indices are kept).
adv_train.head()

Unnamed: 0,TV,radio,newspaper,sales,log_tv
4,0.394182,-0.841614,1.281802,-0.215683,0.523831
32,-0.582005,-1.469583,-0.025502,-0.849762,-0.095007
142,0.857754,0.670914,0.33815,1.167763,0.721767
145,-0.078731,-1.442573,-0.99217,-0.715261,0.270951
109,1.265277,0.245516,-1.153281,1.110119,0.868278


### Определим фичи и таргет переменную для каждого варианта
1. sales ~ log_tv + radio 
2. sales ~ TV + radio 
3. sales ~ TV + radio + newspaper

In [174]:
# Feature matrices for the three candidate models, plus the shared target.
# Plain column indexing (the same form the test-set scoring cell uses) raises
# KeyError on a misspelled column, whereas DataFrame.filter(items=...) would
# silently drop it and fit a model on fewer features.
x1_train = adv_train[['log_tv', 'radio']]            # sales ~ log_tv + radio
x2_train = adv_train[['TV', 'radio']]                # sales ~ TV + radio
x3_train = adv_train[['TV', 'radio', 'newspaper']]   # sales ~ TV + radio + newspaper

y_train = adv_train['sales']

In [175]:
# Check that the first feature matrix holds only log_tv and radio.
x1_train.head()

Unnamed: 0,log_tv,radio
4,0.523831,-0.841614
32,-0.095007,-1.469583
142,0.721767,0.670914
145,0.270951,-1.442573
109,0.868278,0.245516


### Обучим модели с разными вариантами фичей

In [176]:
# Alternative statsmodels formulation, kept for reference (not executed):
#three_x_lm = sm.OLS.from_formula("sales ~ log_tv + radio", adv_train)
# metric calculation
# print("RSS:", np.sum(skm.resid ** 2))
# print("RSE:", np.sqrt(np.sum(skm.resid ** 2)) / (adv_train.shape[0] - 2 - 1))
# print("R^2:", skm.rsquared)


# Model 1 (50:50 split): sales ~ log_tv + radio.
# LinearRegression is imported by name at the top of the notebook; there is
# no `lm` alias in scope, so the class is referenced directly.
skm1 = LinearRegression()
# Estimate the intercept and coefficients on the training part.
skm1.fit(x1_train, y_train)
# Display the fitted intercept and coefficients.
skm1.intercept_, skm1.coef_


(0.027066724645702325, array([0.69463157, 0.59825458]))

In [177]:
# Model 2 (50:50 split): sales ~ TV + radio.
# LinearRegression is imported by name at the top of the notebook; there is
# no `lm` alias in scope, so the class is referenced directly.
skm2 = LinearRegression()
# Estimate the intercept and coefficients on the training part.
skm2.fit(x2_train, y_train)
# Display the fitted intercept and coefficients.
skm2.intercept_, skm2.coef_

(0.011302095880275864, array([0.72059786, 0.58657534]))

In [178]:
# Model 3 (50:50 split): sales ~ TV + radio + newspaper.
# LinearRegression is imported by name at the top of the notebook; there is
# no `lm` alias in scope, so the class is referenced directly.
skm3 = LinearRegression()
# Estimate the intercept and coefficients on the training part.
skm3.fit(x3_train, y_train)
# Display the fitted intercept and coefficients.
skm3.intercept_, skm3.coef_

(0.009782331467812877, array([0.71736366, 0.56706127, 0.06120647]))

### Посчитаем R^2 для train

In [179]:
# Training-set R^2 of each fitted model for the 50:50 split
# (the estimator's score() is called with its default sample weights).
train_fits = (
    ('Для sales ~ log_tv + radio, R^2 train = ', skm1, x1_train),
    ('Для sales ~ TV + radio, R^2 train = ', skm2, x2_train),
    ('Для sales ~ TV + radio + newspaper, R^2 train = ', skm3, x3_train),
)
for caption, model, features in train_fits:
    print(caption, model.score(features, y_train))

Для sales ~ log_tv + radio, R^2 train =  0.8997594187070234
Для sales ~ TV + radio, R^2 train =  0.9020506014720118
Для sales ~ TV + radio + newspaper, R^2 train =  0.9042613648908893


### Посчитаем R^2 для test

In [180]:
# Held-out (test) R^2 of each model for the 50:50 split. The captions name
# the test set and the full feature list of the model being scored.
print('Для sales ~ log_tv + radio, R^2 test {:.7f}'.format(
    skm1.score(adv_test[['log_tv', 'radio']], adv_test['sales'])))

print('Для sales ~ TV + radio, R^2 test {:.7f}'.format(
    skm2.score(adv_test[['TV', 'radio']], adv_test['sales'])))

print('Для sales ~ TV + radio + newspaper, R^2 test {:.7f}'.format(
    skm3.score(adv_test[['TV', 'radio', 'newspaper']], adv_test['sales'])))


Для sales ~ log_tv + radio, R^2 test 0.9017771
Для sales ~ TV + radio, R^2 test 0.8826436
Для sales ~ TV + radio + newspaper, R^2 test 0.8721005


__________________
__________________
## Проделаем то же самое для разделения датафрейма на трейн и тест в пропорции 70:30

In [181]:
from sklearn.model_selection import train_test_split

# 70:30 shuffled split; the fixed random_state keeps it reproducible.
adv_train7030, adv_test7030 = train_test_split(
    adv_df,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

# Report how many rows ended up in each part.
print("Total transactions in train dataset: ", adv_train7030.shape[0])
print("Total transactions in test dataset: ", adv_test7030.shape[0])

Total transactions in train dataset:  140
Total transactions in test dataset:  60


In [182]:
# Feature matrices and target for the 70:30 split.
# Plain column indexing (the same form the test-set scoring cell uses) raises
# KeyError on a misspelled column, whereas DataFrame.filter(items=...) would
# silently drop it and fit a model on fewer features.
x1_train_7030 = adv_train7030[['log_tv', 'radio']]            # sales ~ log_tv + radio
x2_train_7030 = adv_train7030[['TV', 'radio']]                # sales ~ TV + radio
x3_train_7030 = adv_train7030[['TV', 'radio', 'newspaper']]   # sales ~ TV + radio + newspaper

y_train_7030 = adv_train7030['sales']

### Обучим модели с разными вариантами фичей и посчитаем коэффициенты b0, b1...bn

In [183]:
# Model 1 (70:30 split): sales ~ log_tv + radio.
# LinearRegression is imported by name at the top of the notebook; there is
# no `lm` alias in scope, so the class is referenced directly.
skm17030 = LinearRegression()
# Estimate the intercept and coefficients on the training part.
skm17030.fit(x1_train_7030, y_train_7030)
# Display the fitted intercept and coefficients.
skm17030.intercept_, skm17030.coef_

(0.007859476128131038, array([0.73291821, 0.58962267]))

In [184]:
# Model 2 (70:30 split): sales ~ TV + radio.
# LinearRegression is imported by name at the top of the notebook; there is
# no `lm` alias in scope, so the class is referenced directly.
skm27030 = LinearRegression()
# Estimate the intercept and coefficients on the training part.
skm27030.fit(x2_train_7030, y_train_7030)
# Display the fitted intercept and coefficients.
skm27030.intercept_, skm27030.coef_

(0.001879392389370682, array([0.72530138, 0.57653524]))

In [185]:
# Model 3 (70:30 split): sales ~ TV + radio + newspaper.
# LinearRegression is imported by name at the top of the notebook; there is
# no `lm` alias in scope, so the class is referenced directly.
skm37030 = LinearRegression()
# Estimate the intercept and coefficients on the training part.
skm37030.fit(x3_train_7030, y_train_7030)
# Display the fitted intercept and coefficients.
skm37030.intercept_, skm37030.coef_

(0.00221864064072895, array([0.72500382, 0.56709307, 0.02872862]))

### Посчитаем R^2 для train

In [186]:
# Training-set R^2 of each fitted model for the 70:30 split
# (the estimator's score() is called with its default sample weights).
train_fits = (
    ('Для sales ~ log_tv + radio, R^2 train = ', skm17030, x1_train_7030),
    ('Для sales ~ TV + radio, R^2 train = ', skm27030, x2_train_7030),
    ('Для sales ~ TV + radio + newspaper, R^2 train = ', skm37030, x3_train_7030),
)
for caption, model, features in train_fits:
    print(caption, model.score(features, y_train_7030))

Для sales ~ log_tv + radio, R^2 train =  0.8993482542237392
Для sales ~ TV + radio, R^2 train =  0.9048377867980044
Для sales ~ TV + radio + newspaper, R^2 train =  0.9055159502227754


### Посчитаем R^2 для test

In [187]:
# Held-out (test) R^2 of each model for the 70:30 split. The captions name
# the test set and the full feature list of the model being scored.
print('Для sales ~ log_tv + radio, R^2 test {:.7f}'.format(
    skm17030.score(adv_test7030[['log_tv', 'radio']], adv_test7030['sales'])))

print('Для sales ~ TV + radio, R^2 test {:.7f}'.format(
    skm27030.score(adv_test7030[['TV', 'radio']], adv_test7030['sales'])))

print('Для sales ~ TV + radio + newspaper, R^2 test {:.7f}'.format(
    skm37030.score(adv_test7030[['TV', 'radio', 'newspaper']], adv_test7030['sales'])))


Для sales ~ log_tv + radio, R^2 test 0.9143971
Для sales ~ TV + radio, R^2 test 0.8656254
Для sales ~ TV + radio + newspaper, R^2 test 0.8609467


__________________
__________________
## Проделаем то же самое для разделения датафрейма на трейн и тест в пропорции 80:20

In [188]:
from sklearn.model_selection import train_test_split

# 80:20 shuffled split; the fixed random_state keeps it reproducible.
adv_train8020, adv_test8020 = train_test_split(
    adv_df,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Report how many rows ended up in each part.
print("Total transactions in train dataset: ", adv_train8020.shape[0])
print("Total transactions in test dataset: ", adv_test8020.shape[0])

Total transactions in train dataset:  160
Total transactions in test dataset:  40


In [189]:
# Feature matrices and target for the 80:20 split.
# Plain column indexing (the same form the test-set scoring cell uses) raises
# KeyError on a misspelled column, whereas DataFrame.filter(items=...) would
# silently drop it and fit a model on fewer features.
x1_train_8020 = adv_train8020[['log_tv', 'radio']]            # sales ~ log_tv + radio
x2_train_8020 = adv_train8020[['TV', 'radio']]                # sales ~ TV + radio
x3_train_8020 = adv_train8020[['TV', 'radio', 'newspaper']]   # sales ~ TV + radio + newspaper

y_train_8020 = adv_train8020['sales']

### Обучим модели с разными вариантами фичей и посчитаем коэффициенты b0, b1...bn

In [190]:
# Model 1 (80:20 split): sales ~ log_tv + radio.
# LinearRegression is imported by name at the top of the notebook; there is
# no `lm` alias in scope, so the class is referenced directly.
skm18020 = LinearRegression()
# Estimate the intercept and coefficients on the training part.
skm18020.fit(x1_train_8020, y_train_8020)
# Display the fitted intercept and coefficients.
skm18020.intercept_, skm18020.coef_

(-0.0013781425611843635, array([0.74632365, 0.58143472]))

In [191]:
# Model 2 (80:20 split): sales ~ TV + radio.
# LinearRegression is imported by name at the top of the notebook; there is
# no `lm` alias in scope, so the class is referenced directly.
skm28020 = LinearRegression()
# Estimate the intercept and coefficients on the training part.
skm28020.fit(x2_train_8020, y_train_8020)
# Display the fitted intercept and coefficients.
skm28020.intercept_, skm28020.coef_

(0.0035354405973982324, array([0.73601268, 0.54256331]))

In [192]:
# Model 3 (80:20 split): sales ~ TV + radio + newspaper.
# LinearRegression is imported by name at the top of the notebook; there is
# no `lm` alias in scope, so the class is referenced directly.
skm38020 = LinearRegression()
# Estimate the intercept and coefficients on the training part.
skm38020.fit(x3_train_8020, y_train_8020)
# Display the fitted intercept and coefficients.
skm38020.intercept_, skm38020.coef_

(0.003747646728203463, array([0.73603268, 0.53837398, 0.0115254 ]))

### Посчитаем R^2 для train

In [193]:
# Training-set R^2 of each fitted model for the 80:20 split
# (the estimator's score() is called with its default sample weights).
train_fits = (
    ('Для sales ~ log_tv + radio, R^2 train = ', skm18020, x1_train_8020),
    ('Для sales ~ TV + radio, R^2 train = ', skm28020, x2_train_8020),
    ('Для sales ~ TV + radio + newspaper, R^2 train = ', skm38020, x3_train_8020),
)
for caption, model, features in train_fits:
    print(caption, model.score(features, y_train_8020))

Для sales ~ log_tv + radio, R^2 train =  0.9011051213818659
Для sales ~ TV + radio, R^2 train =  0.8955982149747163
Для sales ~ TV + radio + newspaper, R^2 train =  0.8957008271017816


### Посчитаем R^2 для test

In [194]:
# Held-out (test) R^2 of each model for the 80:20 split. The captions name
# the test set and the full feature list of the model being scored.
print('Для sales ~ log_tv + radio, R^2 test {:.7f}'.format(
    skm18020.score(adv_test8020[['log_tv', 'radio']], adv_test8020['sales'])))

print('Для sales ~ TV + radio, R^2 test {:.7f}'.format(
    skm28020.score(adv_test8020[['TV', 'radio']], adv_test8020['sales'])))

print('Для sales ~ TV + radio + newspaper, R^2 test {:.7f}'.format(
    skm38020.score(adv_test8020[['TV', 'radio', 'newspaper']], adv_test8020['sales'])))


Для sales ~ log_tv + radio, R^2 test 0.9235658
Для sales ~ TV + radio, R^2 test 0.9005833
Для sales ~ TV + radio + newspaper, R^2 test 0.8994380


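### Вывод:
### Максимальную долю объяснённой дисперсии R^2 на тесте мы получили, разбив множество на трейн и тест в пропорции 80:20.
### Лучшая модель: sales ~ log_tv + radio — у неё наибольший R^2 на тесте при всех трёх разбиениях (0.9017771, 0.9143971, 0.9235658).
### R^2 (тест, 80:20) = 0.9235658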