### Подготавливаем данные 

In [35]:
import numpy as np
import pandas as pd

data = pd.read_csv('Shanghai_HMT_2010.csv')
numeric_data = data.drop(['cbwd'], axis=1)
data = data.dropna()
data = (data - data.mean()) / data.std()
# Проверим данные
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21436 entries, 26304 to 52582
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   DEWP           21436 non-null  float64
 1   HUMI           21436 non-null  float64
 2   Iprec          21436 non-null  float64
 3   Iws            21436 non-null  float64
 4   No             21436 non-null  float64
 5   PM_Jingan      21436 non-null  float64
 6   PM_US Post     21436 non-null  float64
 7   PM_Xuhui       21436 non-null  float64
 8   PRES           21436 non-null  float64
 9   TEMP           21436 non-null  float64
 10  cbwd           0 non-null      object 
 11  day            21436 non-null  float64
 12  hour           21436 non-null  float64
 13  month          21436 non-null  float64
 14  precipitation  21436 non-null  float64
 15  season         21436 non-null  float64
 16  year           21436 non-null  float64
dtypes: float64(16), object(1)
memory usage: 2.9+ M

  data = (data - data.mean()) / data.std()


Находим медиану по давлению и обновляем соотв поле как категориальный признак

In [36]:
pres_median = data['PRES'].median()
data = data.dropna()
data['PRES'] = (data['PRES'] > pres_median).astype('int64')

### Используем sklearn

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, roc_auc_score

X = data.drop(['PRES'], axis=1)
y = data['PRES']

# установим random_state, чтобы результаты эксперимента были воспроизводимы
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

model = LogisticRegression(max_iter=7600)

model.fit(X_train, y_train)

predictions = model.predict(X_test)

err = sum(predictions == y_test) / len(y_test)  # accuracy

print("accuracy =", err)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

### Используем кастомный класс

In [39]:
import torch as T
from custom_lin_reg import CustomLogisticRegression

X = data.drop(['PRES'], axis=1)
y = data['PRES']

# установим random_state, чтобы результаты эксперимента были воспроизводимы
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

device = 'cpu'

X_train = X_train.to_numpy()
y_train = y_train.to_numpy()
X_test = X_test.to_numpy()
y_test = y_test.to_numpy()

X_train = T.tensor(X_train, dtype=T.float32).to(device)
y_train = T.tensor(y_train, dtype=T.long).to(device)
X_test = T.tensor(X_test, dtype=T.float32).to(device)
y_test = T.tensor(y_test, dtype=T.long).to(device)

num_of_features = X_train.size()[1]


lr = CustomLogisticRegression(device, num_of_features)
lrn_rate = 0.0001
indices = np.arange(len(X_train))

w, b = lr.fit(
    X_train,
    y_train,
    lrn_rate,
    indices,
    100,
    0)

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

In [None]:
predictions = lr.predict(X_test)
err = mean_squared_error(predictions, y_test)
print("mean_squared_error =", err)