In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE

In [28]:
df_train = pd.read_csv('/Users/kirill/Downloads/train 2.csv')
df_test = pd.read_csv('/Users/kirill/Downloads/test.csv')
sample_submission = pd.read_csv('/Users/kirill/Downloads/sample_submission.csv')

# ВАЖНО! дря корректной обработки признаков объединяем трейн и тест в один датасет
df_train['sample'] = 1 # помечаем где у нас трейн
df_test['sample'] = 0  # помечаем где у нас тест
df_test['default'] = 0 # в тесте у нас нет значения default, мы его должны предсказать, по этому пока просто заполняем нулями

data = df_test.append(df_train, sort=False).reset_index(drop=True) # объединяем

data.education.fillna('SCH', inplace=True)

num_cols = ['age', 'score_bki', 'decline_app_cnt', 'score_bki', 'bki_request_cnt', 'income']
cat_cols = ['education', 'first_time', 'sna', 'work_address', 'home_address', 'region_rating']
bin_cols = ['sex', 'car', 'car_type', 'good_work', 'foreign_passport']

'''# значимость числовых признаков
from sklearn.feature_selection import f_classif
imp_num = pd.Series(f_classif(data[num_cols], data['default'])[0], index = num_cols)
imp_num.sort_values(inplace = True)
imp_num.plot(kind = 'barh')

# значимость категориальных признаков
from sklearn.feature_selection import mutual_info_classif
imp_cat = pd.Series(mutual_info_classif(data[bin_cols + cat_cols], data['default'],
                                     discrete_features =True), index = bin_cols + cat_cols)
imp_cat.sort_values(inplace = True)
imp_cat.plot(kind = 'barh')'''

# преобразование категориальных признаков
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
for column in bin_cols:
    data[column] = le.fit_transform(data[column])   
for column in cat_cols:
    data[column] = le.fit_transform(data[column])    
for column in ['education', 'region_rating', 'home_address', 'sna']:
    data = pd.get_dummies(data, columns = [column])

# из коррелирующих признаков удаляем work_address и first_time, а так же удаляем ненужные client_id и app_date
data.drop(['client_id','app_date', 'work_address', 'first_time'], axis = 1, inplace=True)

# логарифмируем признак age
data['age'] = np.log(data['age'] + 1)

# логарифмируем признак decline_app_cnt
data['decline_app_cnt'] = np.log(data['decline_app_cnt'] + 1)

# логарифмируем признак income
data['income'] = np.log(data['income'] + 1)

# логарифмируем признак bki_request_cnt
data['bki_request_cnt'] = np.log(data.bki_request_cnt + 1)

In [38]:
# Теперь выделим тестовую часть
train_data = data.query('sample == 1').drop(['sample'], axis=1)
test_data = data.query('sample == 0').drop(['sample', 'default'], axis=1)

y = train_data['default'].values  # наш таргет
X = train_data.drop(['default'], axis=1)

#Устраняем дисбаланс классов, что заметно улучшает метрику
SM = SMOTE(sampling_strategy=1, random_state=42)
X_balanced, y_balanced = SM.fit_resample(X, y)

X_train, X_valid, y_train, y_valid = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

RS = RobustScaler()

# Импортируем необходимые библиотеки:
from sklearn.linear_model import LogisticRegression # инструмент для создания и обучения модели
from sklearn import metrics # инструменты для оценки точности модели

logreg = LogisticRegression(
C=1.0
,class_weight='balanced'
,dual=False
,fit_intercept=True
,intercept_scaling=1
,l1_ratio=None
,max_iter=1000
,multi_class='auto'
,n_jobs=None
,penalty='l2'
,random_state=None
,solver='sag'
,tol=0.001
,verbose=0
,warm_start=False)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_valid)

from sklearn.metrics import classification_report
classification_report = classification_report(y_valid, y_pred)
print(classification_report)
print('ROC-AUC=',roc_auc_score(y_valid, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.69      0.69     12877
           1       0.69      0.70      0.70     12894

    accuracy                           0.70     25771
   macro avg       0.70      0.70      0.70     25771
weighted avg       0.70      0.70      0.70     25771

ROC= 0.6954702930215668


In [37]:
#подбираем гиперпараметры. Однако метрика не растет.
from sklearn.model_selection import GridSearchCV

iter_ = 1000
epsilon_stop = 1e-3

param_grid = [
    {'penalty': ['l1'], 
     'solver': ['liblinear', 'lbfgs'], 
     'class_weight':['none', 'balanced'], 
     'multi_class': ['auto','ovr'], 
     'max_iter':[iter_],
     'tol':[epsilon_stop]},
    {'penalty': ['l2'], 
     'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 
     'class_weight':['none', 'balanced'], 
     'multi_class': ['auto','ovr'], 
     'max_iter':[iter_],
     'tol':[epsilon_stop]},
    {'penalty': ['none'], 
     'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'], 
     'class_weight':['none', 'balanced'], 
     'multi_class': ['auto','ovr'], 
     'max_iter':[iter_],
     'tol':[epsilon_stop]},
]

#model ваша модель логистической регрессии
gridsearch = GridSearchCV(logreg, param_grid, scoring='f1', n_jobs=-1, cv=5)
gridsearch.fit(X_train_balanced, y_train_balanced)
model = gridsearch.best_estimator_

#печатаем параметры
best_parameters = model.get_params()
for param_name in sorted(best_parameters.keys()):
        print('\t%s: %r' % (param_name, best_parameters[param_name]))

40 fits failed out of a total of 220.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/kirill/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/kirill/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1528, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
  File "/Users/kirill/opt/anaconda3/lib/python3.8/site-packages/sklearn/svm/_base.py", line 1150, in _fit_liblinear
    class_weight_ = compute_class_weight(class_weight, classes=classes_, y=y)
  File "/Users/kirill/opt/anaco

	C: 1.0
	class_weight: 'balanced'
	dual: False
	fit_intercept: True
	intercept_scaling: 1
	l1_ratio: None
	max_iter: 1000
	multi_class: 'auto'
	n_jobs: None
	penalty: 'l2'
	random_state: None
	solver: 'sag'
	tol: 0.001
	verbose: 0
	warm_start: False


In [None]:
#Немного не в том порядке в этой ячейке визуализируем данные. Видим, что нужно прологарифмировать числовые признаки.
from pandas_profiling import ProfileReport
ProfileReport(data)

In [43]:
#теперь заполним тестовую выборку
sample_submission['default'] = logreg.predict(test_data)

In [45]:
sample_submission.to_csv("sample_submission_result.csv", index=False)