In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.inspection import permutation_importance

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
import pickle
from preprocessing import drop_columns, custom_mapping


In [3]:
# Load the training data from the local CSV file into a single DataFrame.
df = pd.read_csv('train.csv')

In [4]:
# Summarize row count, column dtypes and memory footprint of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11504798 entries, 0 to 11504797
Data columns (total 12 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   Gender                object 
 2   Age                   int64  
 3   Driving_License       int64  
 4   Region_Code           float64
 5   Previously_Insured    int64  
 6   Vehicle_Age           object 
 7   Vehicle_Damage        object 
 8   Annual_Premium        float64
 9   Policy_Sales_Channel  float64
 10  Vintage               int64  
 11  Response              int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 1.0+ GB


- Response: целевая переменная

Столбцы:
- Gender
- Age
- Previously_Insured: была ли оформлена страховка ранее
- Vehicle_Age
- Vehicle_Damage: повреждалось ли транспортное средство (ТС)
- Annual_Premium: стоимость страховки, предлагаемая клиенту

In [5]:
# Preview the first rows to see what the raw values look like.
df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,0,Male,21,1,35.0,0,1-2 Year,Yes,65101.0,124.0,187,0
1,1,Male,43,1,28.0,0,> 2 Years,Yes,58911.0,26.0,288,1
2,2,Female,25,1,14.0,1,< 1 Year,No,38043.0,152.0,254,0
3,3,Female,35,1,1.0,0,1-2 Year,Yes,2630.0,156.0,76,0
4,4,Female,36,1,15.0,1,1-2 Year,No,31951.0,152.0,294,0


In [6]:
# Commented-out inline helper definitions. The live `drop_columns` and
# `custom_mapping` are imported from the local `preprocessing` module;
# NOTE(review): presumably these are earlier drafts of those helpers — confirm
# against preprocessing.py before relying on them.
# def custom_mapping(X):
#     X = X.copy()

#     X['Gender'] = X['Gender'].map({'Male': 1, 'Female': 0})
#     X['Vehicle_Age'] = X['Vehicle_Age'].map({'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2})
#     X['Vehicle_Damage'] = X['Vehicle_Damage'].map({'Yes': 1, 'No': 0})

#     return X

# def custom_policy_grouped(X):
#     X = X.copy()

#     if "Policy_Sales_Channel" not in X.columns:
#         return X

#     # Create a new column in which all rare values are replaced by the mean
#     top_channels = X["Policy_Sales_Channel"].value_counts().nlargest(3)

#     rare_mask = ~X["Policy_Sales_Channel"].isin(top_channels)
#     rare_mean = int(X.loc[rare_mask, "Policy_Sales_Channel"].mean())

#     top_channels_list = top_channels.index.tolist()
#     X["Policy_Sales_Channel_Grouped"] = X["Policy_Sales_Channel"].apply(lambda x: x if x in top_channels_list else rare_mean)

#     X.drop(columns=['Policy_Sales_Channel'])

#     return X

# def drop_columns(X):
#     return X.drop(columns=['id', 'Region'], errors='ignore')

In [7]:
# Columns that get standardized before modelling.
numerical_features = ["Age", "Annual_Premium"]

# Wrap the helpers imported from the local `preprocessing` module so they can
# be used as pipeline steps.
mapper = FunctionTransformer(custom_mapping)
dropper = FunctionTransformer(drop_columns)

# NOTE(review): `remainder` is left at its scikit-learn default ('drop'), so
# only the scaled `numerical_features` come out of this transformer — confirm
# that discarding every other column is intended.
preprocessor = ColumnTransformer(
    transformers=[("num", StandardScaler(), numerical_features)],
    # remainder='passthrough'
)

Чтобы на выходе `preprocessor` остались все признаки (а не только масштабированные числовые), нужно раскомментировать `remainder='passthrough'`.

In [8]:
# Commented-out manual run of the preprocessing steps for inspection.
# NOTE(review): `policy_groupper` below is not defined in any visible cell —
# define it (or remove that line) before re-enabling this cell.
# df_test = dropper.transform(df)
# df_test = mapper.transform(df_test)
# df_test = policy_groupper.transform(df_test)
# df_test = preprocessor.fit_transform(df_test)
# # Convert the result to a DataFrame
# df_test = pd.DataFrame(df_test, columns=preprocessor.get_feature_names_out())

# df_test.head()

In [9]:
# # Compute the correlation matrix
# corr_matrix = df_test.corr()

# # Build the annotated heatmap
# fig = ff.create_annotated_heatmap(
#     z=corr_matrix.values,
#     x=list(corr_matrix.columns),
#     y=list(corr_matrix.index),
#     colorscale='Viridis',  # Color scheme
#     annotation_text=corr_matrix.round(2).values,  # Round to 2 decimal places
#     showscale=True
# )
# # Show the figure
# fig.show()

In [10]:
# Work on a copy so the raw frame `df` stays untouched by later steps.
data = df.copy()

In [11]:
# Remove rows that are exact duplicates across all columns.
# NOTE(review): `id` is one of the compared columns, so rows that differ only
# by `id` are kept — confirm whether `id` should be excluded from the check.
data = data.drop_duplicates()

In [12]:
# Separate the target column from the feature matrix.
y = data['Response']
X = data.drop(columns='Response')

# The first `max_train_size` rows (in file order) form the training partition;
# everything after them is the hold-out set.
# NOTE(review): with shuffle=False the split is positional, so `random_state`
# appears to have no effect here — confirm and drop it if so.
max_train_size = 5_000_000
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=max_train_size,
    shuffle=False,
    random_state=42,
)

In [13]:
# Sanity check: the training partition should contain `max_train_size` rows.
len(X_train)

5000000

In [14]:
# Classifier with explicitly fixed hyperparameters.
xgb_classifier = XGBClassifier(
    objective='binary:logistic',
    eval_metric='aucpr',
    scale_pos_weight=3,
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=3,
    subsample=0.95,
    random_state=42,
    n_jobs=-1,
)

# End-to-end pipeline: the steps run in the order listed, so raw rows can be
# passed straight to `fit` / `predict`.
model_pipeline = Pipeline(
    steps=[
        ('dropper', dropper),
        ('mapper', mapper),
        ('preprocessor', preprocessor),
        ('classifier', xgb_classifier),
    ]
)

In [15]:
# Fit the preprocessing steps and the classifier on the training partition.
model_pipeline.fit(X_train, y_train)

In [16]:
# Persist the whole fitted pipeline (preprocessing + classifier) to disk.
with open('model_XGB.pkl', 'wb') as model_file:
    pickle.dump(model_pipeline, model_file)

# Test API

In [17]:

# Re-read the raw file and split it by position so this section has an
# `X_test` to draw sample rows from.
data = pd.read_csv('train.csv')

y = data['Response']
X = data.drop(columns='Response')

max_train_size = 5_000_000
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=max_train_size,
    shuffle=False,
    random_state=42,
)

In [43]:
import requests


def predict_model(data, url='http://127.0.0.1:5000/predict_model', timeout=30):
    """Send one feature record to the prediction API and return its reply.

    Args:
        data: JSON-serializable mapping of feature names to values.
        url: Endpoint to POST to; defaults to the locally running service.
        timeout: Seconds to wait for the server before giving up, so an
            unreachable or stalled service cannot hang the notebook.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise a dict with a single "error" key describing the failure.
    """
    print(data)

    # The record is sent as a JSON body (`json=`), not as form data.
    try:
        response = requests.post(url, json=data, timeout=timeout)
    except requests.RequestException as exc:
        # Connection refused, timeout, etc. — report it in the same shape as
        # an HTTP-level failure instead of raising.
        return {"error": f"Request failed: {exc}"}

    if response.status_code == 200:
        return response.json()
    return {"error": f"Request failed with status code {response.status_code}"}


# Send one held-out row to the API and show what it answers.
sample_payload = X_test.iloc[24].to_dict()
prediction = predict_model(sample_payload)
print(prediction)
# for i in range(len(X_test)):
#     if int(predict_model(X_test.iloc[i].to_dict())['prediction'][-1]) == 1:
#         print(i)

{'id': 5000024, 'Gender': 'Male', 'Age': 61, 'Driving_License': 1, 'Region_Code': 36.0, 'Previously_Insured': 0, 'Vehicle_Age': '1-2 Year', 'Vehicle_Damage': 'Yes', 'Annual_Premium': 29063.0, 'Policy_Sales_Channel': 26.0, 'Vintage': 267}
{'prediction': 'Response = 0'}


In [None]:
import json

# Show the exact JSON that would be sent for the first held-out row.
data = X_test.iloc[0].to_dict()
payload_text = json.dumps(data, indent=2)
print("Отправляемые данные:", payload_text)

Отправляемые данные: {
  "id": 5000000,
  "Gender": "Female",
  "Age": 44,
  "Driving_License": 1,
  "Region_Code": 36.0,
  "Previously_Insured": 1,
  "Vehicle_Age": "1-2 Year",
  "Vehicle_Damage": "No",
  "Annual_Premium": 2630.0,
  "Policy_Sales_Channel": 152.0,
  "Vintage": 182
}


In [None]:
# Load the fitted pipeline back from the pickle file written earlier in this
# notebook. Unpickling executes arbitrary code, so only load files you trust.
with open('model_XGB.pkl', 'rb') as model_file:
    model = pickle.load(model_file)