In [1]:
import warnings
warnings.filterwarnings('ignore')

## Numpy

Numpy - библиотека python, позволяющая делать сложные вычисления и операции с массивами.

Главный объект Numpy - это однородный многомерный массив. 

Массив - упорядоченный набор однородных данных, доступных по индексу. Чаще всего это одномерная последовательность или двумерная таблица, заполненные элементами одного типа.

[Подробнее](https://pyprog.pro/introduction.html) о массивах

In [2]:
import numpy as np

In [4]:
a = np.array([1, 3, 2])

a

array([1, 3, 2])

In [5]:
a = np.array([[1.5, 2, 3], [4, 5, 6]], dtype=np.complex128) #В отличии от списков, в массиве все элементы обязательно принадлежат одному типу

a

array([[1.5+0.j, 2. +0.j, 3. +0.j],
       [4. +0.j, 5. +0.j, 6. +0.j]])

In [6]:
a = np.array([[1.5, 2, 3], [4, 5, 600000]], dtype=np.int16) #создаст одномерную матрицу размером 2 х 3 c типом данных int16 

a

#Массив будет преобразован в соответствии с данным типом и на выходе полчим следующее:
#Дробные числа переведены в целочисленный формат в соответствии с типом
#Число 600000 не умещается в тип int16, происходит "переполнение типа"


array([[    1,     2,     3],
       [    4,     5, 10176]], dtype=int16)

## Сложение / Вычитание матриц

In [7]:
### Матрица A размера 2x3
A = np.array([[3, 4, 5],
              [1, 1, 1]])

### Матрица B размера 2x3
B = np.array([[5, 5],
              [3, 1]])

### Так как размеры совпадают, можно вычитать и складывать!

In [8]:
A - B 

ValueError: operands could not be broadcast together with shapes (2,3) (2,2) 

## Умножение матриц

In [9]:
### Матрица A размера 2x3
A = np.array([[3, 4, 5],
              [1, 1, 1]])

### Матрица B размера 3x3
B = np.array([[5, 5, 5],
              [3, 1, 10],
              [4, 4, 12]])

### Так как кол-во строк матрицы A совпадает с кол-вом строк матрицы B,
### Можем умножить A * B

In [10]:
np.dot(B, A)

ValueError: shapes (3,3) and (2,3) not aligned: 3 (dim 1) != 2 (dim 0)

## Транспонирование матриц

In [11]:
A = np.array([[3, 4, 5],
              [1, 1, 1]])

In [12]:
A.T

array([[3, 1],
       [4, 1],
       [5, 1]])

## Единичная матрица

In [13]:
np.eye(4)

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

## Обращение матриц

In [14]:
### Матрица A размера 3x3 => Ее можно обращать
A = np.array([[5, 5, 5],
              [3, 2, 10],
              [4, 4, 12]])

In [15]:
np.linalg.inv(A)

array([[ 4.00000000e-01,  1.00000000e+00, -1.00000000e+00],
       [-1.00000000e-01, -1.00000000e+00,  8.75000000e-01],
       [-1.00000000e-01, -2.77555756e-17,  1.25000000e-01]])

## $\beta^* = (X^T \cdot X)^{-1}\cdot X^T \cdot Y$

In [16]:
X = np.array([[23, 0.5, 1],
              [35, 1, 1],
              [18, 0, 1]])

Y = np.array([55, 100, 45])


In [17]:
xxt = np.dot(X.T, X)
xxt_inv = np.linalg.inv(xxt)
xxt_inv_xxt = np.dot(xxt_inv, X.T)
final_betas = np.dot(xxt_inv_xxt, Y)

final_betas

array([  5., -30., -45.])

## Пример с первого практического занятия

In [46]:
import pandas as pd

X = pd.read_csv('X.csv').drop('Предсказание', axis=1)
Y = pd.read_csv('Y.csv')

In [47]:
X.head()

Unnamed: 0,Категория,Цель в долларах,Срок,Год публикации,Close_brent,CAD,CHF,DKK,EUR,GBP,...,Design,Fashion,Film & Video,Food,Journalism,Music,Photography,Publishing,Technology,Theater
0,6035.989239,1000.0,39,2009,34.41,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,3591.033473,80000.0,87,2009,34.41,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,3661.42455,20.0,8,2009,34.41,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4321.245721,99.0,79,2009,34.41,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,6035.989239,1900.0,28,2009,34.41,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [48]:
Y.head()

Unnamed: 0,таргет2
0,625.0
1,22.0
2,35.0
3,145.0
4,387.0


In [49]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X, Y)


for column, coef in zip(X.columns, model.coef_):
    print(column, coef)
    
print(model.intercept_)

Категория 0.9912323981302028
Цель в долларах 0.00046210545732411
Срок 84.7960380387574
Год публикации 789.623759578069
Close_brent 32.52363238751121
CAD 477.0442389583962
CHF 8787.42593334129
DKK 519.7524602299042
EUR 408.436698789295
GBP 2794.5137779940305
HKD -2352.9318103890287
JPY -8944.518338941747
MXN -4246.631441116689
NOK -1181.5007050675156
NZD -395.7721892123315
SEK 2085.2689875865517
SGD -1612.7741094312478
USD 7000.650258232897
Art -105.3700874282956
Comics -531.3510093663201
Crafts -608.7810292957257
Dance -379.446410488763
Design -219.11835083464044
Fashion -293.370137548206
Film & Video -49.87838409990245
Food -692.236001794821
Journalism -451.5810021298357
Music -303.9350860455762
Photography 222.96861493755802
Publishing -370.53746498403916
Technology -175.9808894795485
Theater 237.6083627399823
[-1600008.97574461]


In [50]:
X['constant'] = 1

In [51]:
xxt = np.dot(X.T, X)
xxt_inv = np.linalg.inv(xxt)
xxt_inv_xxt = np.dot(xxt_inv, X.T)
final_betas = np.dot(xxt_inv_xxt, Y)

final_betas

array([[ 9.91232399e-01],
       [ 4.62105457e-04],
       [ 8.47960380e+01],
       [ 7.89623760e+02],
       [ 3.25236324e+01],
       [ 4.77044239e+02],
       [ 8.78742593e+03],
       [ 5.19752460e+02],
       [ 4.08436699e+02],
       [ 2.79451378e+03],
       [-2.35293181e+03],
       [-8.94451834e+03],
       [-4.24663144e+03],
       [-1.18150071e+03],
       [-3.95772189e+02],
       [ 2.08526899e+03],
       [-1.61277411e+03],
       [ 7.00065026e+03],
       [-1.05370087e+02],
       [-5.31351009e+02],
       [-6.08781029e+02],
       [-3.79446410e+02],
       [-2.19118351e+02],
       [-2.93370138e+02],
       [-4.98783841e+01],
       [-6.92236002e+02],
       [-4.51581002e+02],
       [-3.03935086e+02],
       [ 2.22968615e+02],
       [-3.70537465e+02],
       [-1.75980889e+02],
       [ 2.37608363e+02],
       [-1.60000898e+06]])

In [52]:
for column, coef in zip(X.columns, final_betas):
    print(column, coef)

Категория [0.9912324]
Цель в долларах [0.00046211]
Срок [84.796038]
Год публикации [789.62375955]
Close_brent [32.52363239]
CAD [477.04423896]
CHF [8787.42593335]
DKK [519.75246024]
EUR [408.4366988]
GBP [2794.513778]
HKD [-2352.9318104]
JPY [-8944.51833898]
MXN [-4246.63144111]
NOK [-1181.50070505]
NZD [-395.77218921]
SEK [2085.2689876]
SGD [-1612.77410944]
USD [7000.65025821]
Art [-105.37008743]
Comics [-531.35100937]
Crafts [-608.78102928]
Dance [-379.4464105]
Design [-219.11835083]
Fashion [-293.37013754]
Film & Video [-49.87838412]
Food [-692.23600179]
Journalism [-451.58100212]
Music [-303.93508606]
Photography [222.96861493]
Publishing [-370.53746499]
Technology [-175.98088947]
Theater [237.60836272]
constant [-1600008.97568207]


## А что, если данных, в т.ч. признаков, очень много?

In [53]:
matrix = np.random.rand(50000, 50000)

In [None]:
matrixdot = np.dot(matrix.T, matrix)
matrix_inv = np.linalg.inv(matrixdot)

### Hometask

In [4]:
import pandas as pd
import numpy as np

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,store_and_fwd_flag,trip_duration,distance_km,prediction_1,prediction_2
0,id2875421,1,2016-03-14 17:24:55,930.399753,0,455.0,1.500479,578.156451,355.27071
1,id2377394,0,2016-06-12 00:43:35,930.399753,0,663.0,1.807119,962.657188,674.295781
2,id3858529,1,2016-01-19 11:35:24,930.399753,0,2124.0,6.39208,2546.180515,2422.132431


In [35]:
df = pd.read_csv('../data/taxi_dataset_with_target.csv', index_col='id')
df.head(3)

Unnamed: 0_level_0,vendor_id,pickup_datetime,passenger_count,store_and_fwd_flag,trip_duration,distance_km
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
id2875421,1,2016-03-14 17:24:55,930.399753,0,455.0,1.500479
id2377394,0,2016-06-12 00:43:35,930.399753,0,663.0,1.807119
id3858529,1,2016-01-19 11:35:24,930.399753,0,2124.0,6.39208


In [24]:
df['passenger_count'].value_counts()

930.399753     1033540
1005.458335     210318
1070.232174      78088
1028.236276      59896
1061.355223      48333
1053.529749      28404
1718.433333         60
19.666667            3
560.000000           1
104.000000           1
Name: passenger_count, dtype: int64

### Statsmodels usage

In [8]:
from statsmodels.formula.api import ols

In [46]:
model = ols('trip_duration ~ vendor_id + passenger_count + store_and_fwd_flag  + distance_km', data=df).fit()

In [47]:
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:          trip_duration   R-squared:                       0.009
Model:                            OLS   Adj. R-squared:                  0.009
Method:                 Least Squares   F-statistic:                     3453.
Date:                Fri, 07 Apr 2023   Prob (F-statistic):               0.00
Time:                        23:29:43   Log-Likelihood:            -1.4554e+07
No. Observations:             1458644   AIC:                         2.911e+07
Df Residuals:                 1458639   BIC:                         2.911e+07
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept            171.6566     88

In [48]:
coefs = model.params.values
[round(x, 3) for x in coefs]

[171.657, 198.463, 0.296, 56.469, 115.274]

In [51]:
model.params

Intercept             171.656581
vendor_id             198.463137
passenger_count         0.296313
store_and_fwd_flag     56.469122
distance_km           115.273538
dtype: float64

### Sklearn usage

In [29]:
factors = ['vendor_id', 'passenger_count', 'store_and_fwd_flag', 'distance_km']
Y = ['trip_duration']

In [30]:
df[Y]

Unnamed: 0,trip_duration
0,455.0
1,663.0
2,2124.0
3,429.0
4,435.0
...,...
1458639,778.0
1458640,655.0
1458641,764.0
1458642,373.0


In [31]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(df[factors], df[Y])

In [32]:
model.intercept_

array([171.65658146])

In [33]:
model.coef_

array([[198.46313674,   0.29631295,  56.46912165, 115.27353763]])

### Hometask - matrix

In [52]:
factors = ['vendor_id', 'passenger_count', 'store_and_fwd_flag', 'distance_km']
Y = ['trip_duration']

X = df[factors]
Y = df[Y]

In [55]:
X['constant'] = 1


In [58]:
xxt = np.dot(X.T, X)

In [59]:
xxt_inv = np.linalg.inv(xxt)

In [62]:
xxt_inv_xxt = np.dot(xxt_inv, X.T)
final_betas = np.dot(xxt_inv_xxt, Y)

In [63]:
final_betas

array([[198.46313674],
       [  0.29631295],
       [ 56.46912165],
       [115.27353763],
       [171.65658146]])

In [None]:

# xxt_inv = np.linalg.inv(xxt)
# xxt_inv_xxt = np.dot(xxt_inv, X.T)
# final_betas = np.dot(xxt_inv_xxt, Y)

# final_betas