In [46]:
import re
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import torch
import torch.optim as optim

from pytorch_tabnet.tab_model import TabNetClassifier #нейросеть для таблиц от 10к строк до 10кк
from warnings import simplefilter

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer #модель для автозаполнения пустых ячеек

simplefilter("ignore", category=UserWarning)
data_patch = '.\\'

In [6]:
def preprocessing_data(filename, dummies=False, polynomial=False, test=False):
    
    dataframe = pd.read_csv(filename).copy()
    if test:
        dataframe['PassengerId'].to_csv('gender_submission.csv', sep=',', index=False)
    else:
        survived = dataframe['Survived']
        
    dataframe = dataframe.drop(['PassengerId', 'Ticket', 'Cabin'], axis=1)
    #mask = dataframe['Age'].isna()
    #dataframe.loc[mask, 'Age'] = np.random.randint(10, 50, mask.sum())
    dataframe['Embarked'] = dataframe['Embarked'].fillna(dataframe['Embarked'].mode()[0])
    dataframe['Sex'] = dataframe['Sex'].map({'male':1,'female':0})
    dataframe['Embarked'] = dataframe['Embarked'].map({'S':0,'C':1, 'Q':2})
    dataframe['Name'] = dataframe['Name'].apply(lambda x: re.search(r', \w*', x)[0][2:])
    replacements = {
        'Master': 1,
        'Mrs': 2,
        'Mr': 4,
        'Miss': 3,
        'Dr': 4,
        'Rev': 4,
        'Major': 4,
        'Col': 4,
        'Capt': 4,
        'Sir': 4,
        'Don': 4,
        'Jonkheer': 4,
        'Mlle': 3,
        'the': 3,
        'Ms': 3,
        'Lady': 3,
        'Mme': 3
    }
    
    dataframe['Name'] = dataframe['Name'].replace(replacements)
    
    a = dataframe['Name'].unique()
    b = [1, 2, 3, 4]
    
    dataframe['Name'] = dataframe['Name'].replace(list(set(a).difference(b)), dataframe['Name'].mode()[0])
    
    imputer = IterativeImputer()
    imputer.fit(dataframe)
    dataframe = imputer.transform(dataframe)
    
    if test:
        dataframe = pd.DataFrame(data=dataframe, columns=['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'])
    else:
        dataframe = pd.DataFrame(data=dataframe, columns=['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'])
        
    dataframe[['Pclass', 'Sex', 'Embarked', 'SibSp', 'Parch']] = dataframe[['Pclass', 'Sex', 'Embarked', 'SibSp', 'Parch']].astype(int)
    
    features = dataframe[['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
    
    if dummies:
        features = pd.get_dummies(features, columns=['Pclass', 'SibSp', 'Parch', 'Embarked'], dtype=int)
    
    if polynomial:
        poly = PolynomialFeatures(degree=2, include_bias=False)
        features = poly.fit_transform(features)
                   
    if test==False:
        X_train, X_val, y_train, y_val = train_test_split(features, survived, test_size=0.2, random_state=2, stratify=survived)
        
        # rob = RobustScaler().fit(X_train)
        # X_train = rob.transform(X_train)
        # X_val = rob.transform(X_val)
        
        return X_train, X_val, y_train, y_val
    else:
        X_train = features
        #X_train = RobustScaler().fit_transform(features)
        
        return X_train

In [4]:
def tab_models_score(name_model, model, plot_score, all_data=False, test=False):
               
    if all_data:
        X_train, X_val, y_train, y_val = preprocessing_data(data_patch + 'train.csv', dummies=True, polynomial=True)
    else:
        X_train, X_val, y_train, y_val = preprocessing_data(data_patch + 'train.csv')
        
    model.fit(X_train, y_train)
    train_list = classification_report(y_train, model.predict(X_train))
    val_list = classification_report(y_val, model.predict(X_val))
    score_train = train_list.split(' ')[128]
    score_val = val_list.split(' ')[129]
    plot_score.loc[len(plot_score)] = [str(name_model), score_train, score_val]
    
    if all_data:
        print("Наилучшие значения параметров: {}".format(model.best_params_))
        print("Наилучшее значение кросс-валидац. правильности: {:.2f}".format(model.best_score_))

In [7]:
plot_score_1 = pd.DataFrame(columns=['model', 'train', 'val'])
plot_score_2 = pd.DataFrame(columns=['model', 'train', 'val'])

In [8]:
pd.read_csv(data_patch + 'train.csv').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [45]:
tabnet = TabNetClassifier(
    n_d=8,  # Dimension of the decision prediction layer
    n_a=8,  # Dimension of the attention layer
    n_steps=3,  # Number of decision steps
    gamma=1.3,  # Relaxation factor for feature re-use
    lambda_sparse=0.001,  # Sparsity regularisation coefficient
    optimizer_fn=optim.Adam,  # Optimisation algorithm
    optimizer_params=dict(lr=2e-2),  # Learning rate for the optimizer
    mask_type='sparsemax',  # Feature selection mask type (can also use 'entmax')
    device_name='cuda' if torch.cuda.is_available() else 'cpu'  # Use GPU if available, otherwise CPU
)

X_train, X_val, y_train, y_val = preprocessing_data(data_patch + 'train.csv')
X_learn, X_valid, y_learn, y_valid = train_test_split(X_train, y_train, random_state=2)
print(X_learn.values.shape)
print(np.array(y_learn).reshape(-1,1).shape)

tabnet.fit(X_learn.values, y_learn.values.reshape(-1,1), eval_set=[(X_valid.values, y_valid.values.reshape(-1,1))])

(534, 8)
(534, 1)


ValueError: Data must be 1-dimensional, got ndarray of shape (534, 1) instead

In [18]:
tabnet.fit(X_train.values, y_train, max_epochs=50)
train_list = classification_report(y_train, tabnet.predict(X_train))
val_list = classification_report(y_val, tabnet.predict(X_val))
score_train = train_list.split(' ')[128]
score_val = val_list.split(' ')[129]
plot_score.loc[len(plot_score)] = ['TabNetClassifier', score_train, score_val]

epoch 0  | loss: 0.0     |  0:00:00s
epoch 1  | loss: 0.0     |  0:00:00s
epoch 2  | loss: 0.0     |  0:00:00s
epoch 3  | loss: 0.0     |  0:00:00s
epoch 4  | loss: 0.0     |  0:00:00s
epoch 5  | loss: 0.0     |  0:00:00s
epoch 6  | loss: 0.0     |  0:00:00s
epoch 7  | loss: 0.0     |  0:00:00s
epoch 8  | loss: 0.0     |  0:00:00s
epoch 9  | loss: 0.0     |  0:00:00s
epoch 10 | loss: 0.0     |  0:00:00s
epoch 11 | loss: 0.0     |  0:00:00s
epoch 12 | loss: 0.0     |  0:00:00s
epoch 13 | loss: 0.0     |  0:00:00s
epoch 14 | loss: 0.0     |  0:00:00s
epoch 15 | loss: 0.0     |  0:00:00s
epoch 16 | loss: 0.0     |  0:00:00s
epoch 17 | loss: 0.0     |  0:00:00s
epoch 18 | loss: 0.0     |  0:00:00s
epoch 19 | loss: 0.0     |  0:00:00s
epoch 20 | loss: 0.0     |  0:00:00s
epoch 21 | loss: 0.0     |  0:00:00s
epoch 22 | loss: 0.0     |  0:00:00s
epoch 23 | loss: 0.0     |  0:00:00s
epoch 24 | loss: 0.0     |  0:00:00s
epoch 25 | loss: 0.0     |  0:00:00s
epoch 26 | loss: 0.0     |  0:00:00s
e

KeyError: 0

In [20]:
y_pred = tabnet.predict(X_test)
print(accuracy_score(y_test, y_pred))

NameError: name 'X_test' is not defined