# Логистическая регрессия

In [1]:
import numpy as np
import pandas as pd

In [11]:
from sklearn.metrics import f1_score, classification_report

### Загружаем данные

In [2]:
# Load the preprocessed NYSE table; the first CSV column becomes the row index.
df = pd.read_csv(filepath_or_buffer='Processed_NYSE.csv', index_col=0)

In [3]:
# Preview the first five rows to check the columns and the leading NaNs.
df.head(n=5)

Unnamed: 0_level_0,Close,Volume,mom,mom1,mom2,mom3,ROC_5,ROC_10,ROC_15,ROC_20,...,NZD,silver-F,RUSSELL-F,S&P-F,CHF,Dollar index-F,Dollar index,wheat-F,XAG,XAU
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2009-12-31,7184.959961,,,,,,,,,,...,0.03,0.26,-1.08,-1.0,-0.11,-0.08,-0.06,-0.48,0.3,0.39
2010-01-04,7326.740234,0.921723,0.019733,,,,,,,,...,1.52,3.26,1.61,1.62,-0.57,-0.59,-0.42,3.12,3.91,2.1
2010-01-05,7354.870117,-0.375903,0.003839,0.019733,,,,,,,...,-0.07,1.96,-0.2,0.31,0.43,0.03,0.12,-0.9,1.42,-0.12
2010-01-06,7377.700195,0.996234,0.003104,0.003839,0.019733,,,,,,...,0.56,2.15,-0.02,0.07,-0.56,-0.24,-0.17,2.62,2.25,1.77
2010-01-07,7393.930176,0.059932,0.0022,0.003104,0.003839,0.019733,,,,,...,-0.72,0.94,0.5,0.4,0.58,0.58,0.54,-1.85,0.22,-0.58


Для задачи бинарной классификации Target-переменная получается из Close:

$$\mathrm{target}_t = 
\begin{cases}
  1, & \mathrm{close}_{t+1} > \mathrm{close}_{t}\\    
  0, & \mathrm{otherwise}   
\end{cases}$$

Целевая переменная:

In [4]:
# Label each day (from row 20 up to the second-to-last row) with 1 when the
# next day's Close is strictly higher, otherwise 0. np.diff on Close[20:]
# yields Close[t+1] - Close[t] for exactly those rows.
target = (np.diff(np.array(df[20:].Close)) > 0).astype(int)

In [5]:
# Keep only the rows that have a label (row 20 through the second-to-last),
# remove the raw 'Close' price and the 'Name' column, then attach the label.
# NOTE: this cell rebinds `df`, so re-running it alone trims the frame again.
df = df[20:-1].drop(columns=['Close', 'Name'])
df['target'] = target

Заполняем пропуски средним значением по каждому столбцу:

In [6]:
# Replace every remaining NaN with the mean of its own column.
# NOTE(review): the means are computed over all rows, i.e. before the
# train/test split further down — confirm that this is intended.
df = df.fillna(value=df.mean(axis=0))

---

### Обучение

In [7]:
# Feature matrix: every column except the label; label vector: 'target'.
X = df.drop(columns=['target'])
y = df['target']

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing, keeping the class balance of `y`
# in both parts; the fixed seed makes the split reproducible.
# NOTE(review): the split shuffles rows, so dates are mixed between train and
# test — confirm this is acceptable for a date-indexed series.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    stratify=y,
    test_size=0.33,
    random_state=42,
)

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, GridSearchCV

# Base estimator for the search. max_iter=1000 matches the iteration budget
# used when the selected configuration is refitted and evaluated, so the
# candidates are scored under the same conditions as the final model.
logreg_model = LogisticRegression(max_iter=1000)

# Search space: both penalties, two solvers that support them, and C swept
# from 0.1 up to (but not including) 10 in steps of 0.1.
params = {
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'C': np.arange(0.1, 10, 0.1),
}

# Three shuffled, class-stratified folds with a fixed seed.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Hand the splitter object itself to GridSearchCV instead of
# skf.split(X_train, y_train): that call returns a one-shot generator which is
# exhausted by the first fit, so fitting this `grid` object a second time would
# find no splits. With cv=skf the same folds are regenerated on every fit from
# the data passed to fit().
grid = GridSearchCV(
    logreg_model,
    param_grid=params,
    n_jobs=-1,
    cv=skf,
    scoring='f1_macro',
)

In [15]:
# Run the grid search on the training split (fit returns the fitted search
# object) and display the best-scoring parameter combination.
best_params = grid.fit(X_train, y_train).best_params_
best_params

{'C': 3.9000000000000004, 'penalty': 'l2', 'solver': 'liblinear'}

In [16]:
# Refit the configuration reported by the grid search (C rounded to 3.9) on the
# full training split, then evaluate it on the held-out test rows.
logreg = LogisticRegression(
    C=3.9,
    penalty='l2',
    solver='liblinear',
    max_iter=1000,
).fit(X_train, y_train)
y_pred = logreg.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.47      0.25      0.33       303
           1       0.53      0.75      0.62       345

    accuracy                           0.52       648
   macro avg       0.50      0.50      0.47       648
weighted avg       0.50      0.52      0.48       648



------

In [112]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, GridSearchCV

# Base estimator with library defaults; solver, penalty and C come from the grid.
logreg_model = LogisticRegression()

# Search space: L2 penalty only, two solvers, and a finer C sweep from 0.001
# up to (but not including) 10 in steps of 0.05.
params = {
    'penalty': ['l2'],
    'solver': ['liblinear', 'lbfgs'],
    'C': np.arange(0.001, 10, 0.05),
}

# Three shuffled, class-stratified folds with a fixed seed.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Hand the splitter object itself to GridSearchCV instead of
# skf.split(X_train, y_train): that call returns a one-shot generator which is
# exhausted by the first fit, so fitting this `grid` object a second time would
# find no splits. With cv=skf the same folds are regenerated on every fit from
# the data passed to fit().
grid = GridSearchCV(
    logreg_model,
    param_grid=params,
    n_jobs=-1,
    cv=skf,
    scoring='f1_macro',
)

In [113]:
# Run the grid search on the training split (fit returns the fitted search
# object) and display the best-scoring parameter combination.
best_params = grid.fit(X_train, y_train).best_params_
best_params

{'C': 8.251, 'penalty': 'l2', 'solver': 'liblinear'}

In [18]:
# Refit the configuration reported by the second grid search (C rounded to
# 8.25) on the full training split, then evaluate it on the held-out test rows.
logreg = LogisticRegression(
    C=8.25,
    solver='liblinear',
    penalty='l2',
).fit(X_train, y_train)
y_pred = logreg.predict(X_test)
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.45      0.20      0.28       303
           1       0.53      0.78      0.63       345

    accuracy                           0.51       648
   macro avg       0.49      0.49      0.45       648
weighted avg       0.49      0.51      0.47       648



--------

In [121]:

# Base estimator with a raised iteration cap; solver, penalty and C come from the grid.
logreg_model = LogisticRegression(max_iter=1000)

# Search space: L2 penalty only, two solvers, and C swept from 0.001 up to
# (but not including) 10 in steps of 0.05.
params = {
    'penalty': ['l2'],
    'solver': ['liblinear', 'lbfgs'],
    'C': np.arange(0.001, 10, 0.05),
}

# Three shuffled, class-stratified folds with a fixed seed.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Hand the splitter object itself to GridSearchCV instead of
# skf.split(X_train, y_train): that call returns a one-shot generator which is
# exhausted by the first fit, so fitting this `grid` object a second time would
# find no splits. With cv=skf the same folds are regenerated on every fit from
# the data passed to fit(). This search ranks candidates by plain accuracy.
grid = GridSearchCV(
    logreg_model,
    param_grid=params,
    n_jobs=-1,
    cv=skf,
    scoring='accuracy',
)

In [122]:
# Run the grid search on the training split (fit returns the fitted search
# object) and display the best-scoring parameter combination.
best_params = grid.fit(X_train, y_train).best_params_
best_params

{'C': 0.001, 'penalty': 'l2', 'solver': 'lbfgs'}

In [19]:
# Refit the configuration reported by the accuracy-scored grid search
# (lbfgs, L2, C=0.001) on the full training split, then evaluate it on the
# held-out test rows.
logreg = LogisticRegression(
    C=0.001,
    solver='lbfgs',
    penalty='l2',
    max_iter=1000,
).fit(X_train, y_train)
y_pred = logreg.predict(X_test)
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.49      0.13      0.21       303
           1       0.54      0.88      0.67       345

    accuracy                           0.53       648
   macro avg       0.51      0.51      0.44       648
weighted avg       0.51      0.53      0.45       648



In [17]:
# Same lbfgs / L2 model with a hand-picked C=0.5, fitted on the training split
# and evaluated on the held-out test rows.
logreg = LogisticRegression(
    C=0.5,
    penalty='l2',
    solver='lbfgs',
    max_iter=1000,
).fit(X_train, y_train)
y_pred = logreg.predict(X_test)
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.51      0.17      0.26       303
           1       0.54      0.86      0.66       345

    accuracy                           0.54       648
   macro avg       0.53      0.51      0.46       648
weighted avg       0.53      0.54      0.48       648



----