1.1 Подключение библиотек

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files
from sklearn.model_selection import train_test_split

In [2]:
uploaded = files.upload()

Saving nvda_data.csv to nvda_data.csv


Индивидуальное задание

In [3]:
# Чтение данных
dataset = pd.read_csv('nvda_data.csv')
print("Данные:")
print(dataset.head())

Данные:
              Datetime        Open        High         Low       Close  \
0  2024-08-23 09:30:00  125.870003  125.970001  125.529999  125.605103   
1  2024-08-23 09:31:00  125.610001  125.730003  125.220001  125.620003   
2  2024-08-23 09:32:00  125.639999  125.790001  125.400002  125.519997   
3  2024-08-23 09:33:00  125.540001  125.730003  125.349998  125.459999   
4  2024-08-23 09:34:00  125.459999  125.849998  125.419998  125.705002   

    Adj Close   Volume  
0  125.605103  9116047  
1  125.620003  1619948  
2  125.519997  1213367  
3  125.459999  1388527  
4  125.705002  1201631  


In [4]:
# Удаляем пропущенные значения
dataset = dataset.dropna()

In [5]:
# Определяем матрицу признаков (X) и целевую переменную (y)
# Признаки: 'Open', 'High', 'Low', 'Volume'
# Целевая переменная: 'Close'
X = dataset[['Open', 'High', 'Low', 'Volume']].values
y = dataset['Close'].values

In [6]:
# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [7]:
from sklearn.linear_model import LinearRegression
# Обучение линейной регрессии
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [8]:
# Предсказание на тестовой выборке
y_pred = regressor.predict(X_test)
print("Предсказания:")
print(y_pred[:5])

Предсказания:
[109.04412606 115.19867121 119.0157642  116.37569776 106.16818356]


In [9]:
# Оптимизация модели с помощью backward elimination
# Добавляем столбец единиц для константы
X = np.append(arr=np.ones((X.shape[0], 1)).astype(int), values=X, axis=1)

In [10]:
import statsmodels.api as sm
# Начальная модель со всеми признаками
X_opt = X[:, [0, 1, 2, 3, 4]]  # Индексы признаков
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
print("Модель с полным набором признаков:")
regressor_OLS.summary()
# OLS (Ordinary Least Squares) используется для оценки важности признаков.
# sm.OLS создает модель, которая рассчитывает статистические параметры.
# endog=y — зависимая переменная, exog=X_opt — матрица независимых переменных.-

Модель с полным набором признаков:


0,1,2,3
Dep. Variable:,y,R-squared:,1.0
Model:,OLS,Adj. R-squared:,1.0
Method:,Least Squares,F-statistic:,31390000.0
Date:,"Thu, 05 Dec 2024",Prob (F-statistic):,0.0
Time:,06:13:28,Log-Likelihood:,13106.0
No. Observations:,9746,AIC:,-26200.0
Df Residuals:,9741,BIC:,-26170.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.0372,0.010,-3.542,0.000,-0.058,-0.017
x1,-0.5225,0.008,-67.314,0.000,-0.538,-0.507
x2,0.7770,0.006,121.969,0.000,0.765,0.789
x3,0.7458,0.006,125.983,0.000,0.734,0.757
x4,9.609e-10,6.92e-10,1.389,0.165,-3.95e-10,2.32e-09

0,1,2,3
Omnibus:,2072.982,Durbin-Watson:,1.974
Prob(Omnibus):,0.0,Jarque-Bera (JB):,52455.537
Skew:,-0.406,Prob(JB):,0.0
Kurtosis:,14.336,Cond. No.,20400000.0



**AIC/BIC: характеризуют баланс между сложностью и качеством модели (меньше = лучше).**

**Cond. No.: указывает на наличие мультиколлинеарности (высокое значение = плохо).**

**p-значение: оценивает значимость отдельного признака (p>0.05: признак незначим).**

**R^2 и Adj. R^2: оценивают, насколько хорошо модель описывает данные (ближе к 1 = лучше).**

In [11]:
# Удаляем признак с наибольшим p-значением и повторяем
X_opt = X[:, [0, 1, 2, 3]]  # Удален Volume
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
print("Модель после первого шага:")
regressor_OLS.summary()

Модель после первого шага:


0,1,2,3
Dep. Variable:,y,R-squared:,1.0
Model:,OLS,Adj. R-squared:,1.0
Method:,Least Squares,F-statistic:,41850000.0
Date:,"Thu, 05 Dec 2024",Prob (F-statistic):,0.0
Time:,06:13:30,Log-Likelihood:,13105.0
No. Observations:,9746,AIC:,-26200.0
Df Residuals:,9742,BIC:,-26170.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.0366,0.010,-3.487,0.000,-0.057,-0.016
x1,-0.5224,0.008,-67.300,0.000,-0.538,-0.507
x2,0.7792,0.006,126.405,0.000,0.767,0.791
x3,0.7434,0.006,131.070,0.000,0.732,0.755

0,1,2,3
Omnibus:,2074.019,Durbin-Watson:,1.973
Prob(Omnibus):,0.0,Jarque-Bera (JB):,52681.987
Skew:,-0.405,Prob(JB):,0.0
Kurtosis:,14.361,Cond. No.,3340.0


In [12]:
# Еще одно удаление
X_opt = X[:, [0, 1, 2, 4]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
print("Модель после второго шага:")
regressor_OLS.summary()

Модель после второго шага:


0,1,2,3
Dep. Variable:,y,R-squared:,1.0
Model:,OLS,Adj. R-squared:,1.0
Method:,Least Squares,F-statistic:,15920000.0
Date:,"Thu, 05 Dec 2024",Prob (F-statistic):,0.0
Time:,06:13:31,Log-Likelihood:,8394.7
No. Observations:,9746,AIC:,-16780.0
Df Residuals:,9742,BIC:,-16750.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.0621,0.017,-3.647,0.000,-0.095,-0.029
x1,0.0595,0.010,5.885,0.000,0.040,0.079
x2,0.9404,0.010,92.991,0.000,0.921,0.960
x3,-2.4e-08,1.07e-09,-22.331,0.000,-2.61e-08,-2.19e-08

0,1,2,3
Omnibus:,5726.876,Durbin-Watson:,1.65
Prob(Omnibus):,0.0,Jarque-Bera (JB):,81034.587
Skew:,-2.559,Prob(JB):,0.0
Kurtosis:,16.167,Cond. No.,20400000.0
