In [41]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score
from sklearn.metrics import classification_report

In [42]:
data = pd.read_csv("./Data/audit_data.csv")

In [43]:
data.head(3)

Unnamed: 0,Sector_score,LOCATION_ID,PARA_A,Score_A,Risk_A,PARA_B,Score_B,Risk_B,TOTAL,numbers,...,RiSk_E,History,Prob,Risk_F,Score,Inherent_Risk,CONTROL_RISK,Detection_Risk,Audit_Risk,Risk
0,3.89,23,4.18,0.6,2.508,2.5,0.2,0.5,6.68,5.0,...,0.4,0,0.2,0.0,2.4,8.574,0.4,0.5,1.7148,1
1,3.89,6,0.0,0.2,0.0,4.83,0.2,0.966,4.83,5.0,...,0.4,0,0.2,0.0,2.0,2.554,0.4,0.5,0.5108,0
2,3.89,6,0.51,0.2,0.102,0.23,0.2,0.046,0.74,5.0,...,0.4,0,0.2,0.0,2.0,1.548,0.4,0.5,0.3096,0


In [44]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 776 entries, 0 to 775
Data columns (total 27 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sector_score    776 non-null    float64
 1   LOCATION_ID     776 non-null    object 
 2   PARA_A          776 non-null    float64
 3   Score_A         776 non-null    float64
 4   Risk_A          776 non-null    float64
 5   PARA_B          776 non-null    float64
 6   Score_B         776 non-null    float64
 7   Risk_B          776 non-null    float64
 8   TOTAL           776 non-null    float64
 9   numbers         776 non-null    float64
 10  Score_B.1       776 non-null    float64
 11  Risk_C          776 non-null    float64
 12  Money_Value     775 non-null    float64
 13  Score_MV        776 non-null    float64
 14  Risk_D          776 non-null    float64
 15  District_Loss   776 non-null    int64  
 16  PROB            776 non-null    float64
 17  RiSk_E          776 non-null    flo

Всего есть 25 признаков и 1 целевая переменная (бинарная) - нужно определить является ли организация мошеннической или нет

In [45]:
print(data.shape)

(776, 27)


###### Всего 776 организаций

### Обработка данных

In [46]:
# Посмотрим на соотношение классов
data.iloc[:, -1].value_counts()

0    471
1    305
Name: Risk, dtype: int64

In [47]:
# Помотрим на признаки
data.columns.tolist()

['Sector_score',
 'LOCATION_ID',
 'PARA_A',
 'Score_A',
 'Risk_A',
 'PARA_B',
 'Score_B',
 'Risk_B',
 'TOTAL',
 'numbers',
 'Score_B.1',
 'Risk_C',
 'Money_Value',
 'Score_MV',
 'Risk_D',
 'District_Loss',
 'PROB',
 'RiSk_E',
 'History',
 'Prob',
 'Risk_F',
 'Score',
 'Inherent_Risk',
 'CONTROL_RISK',
 'Detection_Risk',
 'Audit_Risk',
 'Risk']

In [48]:
# Проверим наличие пропусков в данных
display(data.isna().sum())

Sector_score      0
LOCATION_ID       0
PARA_A            0
Score_A           0
Risk_A            0
PARA_B            0
Score_B           0
Risk_B            0
TOTAL             0
numbers           0
Score_B.1         0
Risk_C            0
Money_Value       1
Score_MV          0
Risk_D            0
District_Loss     0
PROB              0
RiSk_E            0
History           0
Prob              0
Risk_F            0
Score             0
Inherent_Risk     0
CONTROL_RISK      0
Detection_Risk    0
Audit_Risk        0
Risk              0
dtype: int64

In [49]:
data.corr(method='pearson').style.format("{:.2}").background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)

  smin = np.nanmin(gmap) if vmin is None else vmin
  smax = np.nanmax(gmap) if vmax is None else vmax


Unnamed: 0,Sector_score,PARA_A,Score_A,Risk_A,PARA_B,Score_B,Risk_B,TOTAL,numbers,Score_B.1,Risk_C,Money_Value,Score_MV,Risk_D,District_Loss,PROB,RiSk_E,History,Prob,Risk_F,Score,Inherent_Risk,CONTROL_RISK,Detection_Risk,Audit_Risk,Risk
Sector_score,1.0,-0.22,-0.43,-0.22,-0.13,-0.22,-0.13,-0.15,-0.15,-0.17,-0.17,-0.12,-0.32,-0.12,-0.11,-0.087,-0.13,-0.11,-0.14,-0.1,-0.34,-0.17,-0.15,,-0.092,-0.39
PARA_A,-0.22,1.0,0.5,1.0,0.16,0.36,0.16,0.27,0.13,0.14,0.14,0.45,0.29,0.45,0.13,0.044,0.12,0.12,0.17,0.1,0.43,0.48,0.15,,0.22,0.38
Score_A,-0.43,0.5,1.0,0.5,0.25,0.57,0.25,0.3,0.24,0.27,0.27,0.21,0.48,0.2,0.089,0.094,0.1,0.18,0.27,0.15,0.72,0.32,0.17,,0.2,0.62
Risk_A,-0.22,1.0,0.5,1.0,0.17,0.36,0.17,0.27,0.14,0.14,0.14,0.45,0.29,0.45,0.13,0.044,0.12,0.12,0.18,0.11,0.43,0.48,0.15,,0.22,0.39
PARA_B,-0.13,0.16,0.25,0.17,1.0,0.35,1.0,0.99,0.21,0.23,0.22,0.13,0.31,0.12,0.083,0.043,0.079,0.2,0.32,0.2,0.4,0.65,0.19,,0.89,0.26
Score_B,-0.22,0.36,0.57,0.36,0.35,1.0,0.35,0.38,0.28,0.31,0.3,0.21,0.57,0.2,-0.0047,0.093,0.015,0.2,0.31,0.17,0.9,0.37,0.13,,0.21,0.64
Risk_B,-0.13,0.16,0.25,0.17,1.0,0.35,1.0,0.99,0.21,0.23,0.22,0.13,0.31,0.12,0.083,0.043,0.08,0.2,0.32,0.2,0.4,0.65,0.19,,0.89,0.26
TOTAL,-0.15,0.27,0.3,0.27,0.99,0.38,0.99,1.0,0.22,0.24,0.23,0.17,0.34,0.17,0.093,0.046,0.089,0.21,0.33,0.2,0.43,0.69,0.2,,0.89,0.29
numbers,-0.15,0.13,0.24,0.14,0.21,0.28,0.21,0.22,1.0,0.91,0.96,0.19,0.45,0.19,0.13,0.036,0.14,0.2,0.21,0.2,0.5,0.27,0.23,,0.22,0.31
Score_B.1,-0.17,0.14,0.27,0.14,0.23,0.31,0.23,0.24,0.91,1.0,0.99,0.22,0.51,0.22,0.15,0.037,0.16,0.23,0.25,0.22,0.57,0.31,0.26,,0.26,0.35


In [50]:
# Выделим целевую пересенную
# Переменные TOTAL , LOCATION_ID и Detection_Risk исключаем из списка, т.к. TOTAL есть сумма признаков PARA_A и 
# PARA_B, а LOCATION_ID и Detection_Risk не оказывают сильного влияния на прдсказание целевой переменной. 
# Risk_A и Risk_B похоже на PARA_A','PARA_B -убираем
# District_Loss - убираем, после нормализации у него большие значения

feature_names = ['Sector_score', 'Score_A', 'PARA_A', 'Score_B', 'PARA_B', 'numbers',
                 'Score_B.1','Risk_C', 'Money_Value', 'Score_MV', 'PROB', 'RiSk_E', 'Risk_D',
                 'History', 'Prob', 'Risk_F', 'Score', 'Inherent_Risk', 'CONTROL_RISK', 'Audit_Risk']

target_name = 'Risk'

In [51]:
data = data[feature_names + [target_name]]
data.head(2)

Unnamed: 0,Sector_score,Score_A,PARA_A,Score_B,PARA_B,numbers,Score_B.1,Risk_C,Money_Value,Score_MV,...,RiSk_E,Risk_D,History,Prob,Risk_F,Score,Inherent_Risk,CONTROL_RISK,Audit_Risk,Risk
0,3.89,0.6,4.18,0.2,2.5,5.0,0.2,1.0,3.38,0.2,...,0.4,0.676,0,0.2,0.0,2.4,8.574,0.4,1.7148,1
1,3.89,0.2,0.0,0.2,4.83,5.0,0.2,1.0,0.94,0.2,...,0.4,0.188,0,0.2,0.0,2.0,2.554,0.4,0.5108,0


In [52]:
def preprocesing(df):
    
    # Уберём пропуски в данных
    df['Money_Value'] = df['Money_Value'].fillna(df['Money_Value'].mean()) 
    
    # Стандартизируем признаки
    feature_names_for_stand = df[feature_names].select_dtypes(include=['float64']).columns.tolist()

    scaler = StandardScaler()
    stand_features = scaler.fit_transform(df[feature_names_for_stand])

    df[feature_names_for_stand] = pd.DataFrame(stand_features, columns=feature_names_for_stand)
    
    # Разбиваем данные
    X_data = df.iloc[:,:-1]
    y_data = df.iloc[:,-1]

    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.30, random_state=42)
    
    return X_train, X_test, y_train, y_test

In [53]:
X_train, X_test, y_train, y_test = preprocesing(data)

In [54]:
X_train.head(3)

Unnamed: 0,Sector_score,Score_A,PARA_A,Score_B,PARA_B,numbers,Score_B.1,Risk_C,Money_Value,Score_MV,PROB,RiSk_E,Risk_D,History,Prob,Risk_F,Score,Inherent_Risk,CONTROL_RISK,Audit_Risk
335,-0.733008,-0.869761,-0.431736,-0.666752,-0.215778,-0.255998,-0.295285,-0.284812,-0.21253,-0.569895,-0.16502,-0.410417,-0.20692,0,-0.246568,-0.175398,-0.818503,-0.297608,-0.388662,-0.178253
110,-0.670465,1.429846,0.031683,-0.666752,-0.164831,-0.255998,-0.295285,-0.284812,-0.21253,-0.569895,-0.16502,-0.410417,-0.20692,0,-0.246568,-0.175398,-0.352503,-0.259439,-0.388662,-0.167446
82,-0.670465,-0.869761,-0.431736,-0.666752,-0.215778,-0.255998,-0.295285,-0.284812,-0.21253,-0.569895,-0.16502,-0.410417,-0.20692,0,-0.246568,-0.175398,-0.818503,-0.297608,-0.388662,-0.178253


In [55]:
y_train.head(3)

335    0
110    0
82     0
Name: Risk, dtype: int64

### Обучаем несколько моделей

In [56]:
models = {    
    "Gradient Boosting": GradientBoostingClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier(),   
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier()    
}

In [57]:
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

Gradient Boosting trained.
K-Nearest Neighbors trained.
Random Forest trained.
Decision Tree trained.


In [58]:
for name, model in models.items():
    print(name + ": {:.2f}%".format(model.score(X_test, y_test) * 100))

Gradient Boosting: 100.00%
K-Nearest Neighbors: 95.28%
Random Forest: 100.00%
Decision Tree: 100.00%


In [59]:
for name, model in models.items():    
    print('Classification results:', name)
    y_predict = model.predict(X_test)   
    accuracy = accuracy_score(y_test, y_predict)
    print("accuracy: %.2f%%" % (accuracy * 100.0))
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0))    
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0))
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0))   

Classification results: Gradient Boosting
accuracy: 100.00%
f1: 100.00%
recall: 100.00%
precision: 100.00%
Classification results: K-Nearest Neighbors
accuracy: 95.28%
f1: 93.71%
recall: 89.13%
precision: 98.80%
Classification results: Random Forest
accuracy: 100.00%
f1: 100.00%
recall: 100.00%
precision: 100.00%
Classification results: Decision Tree
accuracy: 100.00%
f1: 100.00%
recall: 100.00%
precision: 100.00%


In [60]:
for name, model in models.items():    
    report = classification_report(y_test, model.predict(X_test), target_names=['Fraud', 'No Fraud'])
    print('Model:', name)    
    print(report)

Model: Gradient Boosting
              precision    recall  f1-score   support

       Fraud       1.00      1.00      1.00       141
    No Fraud       1.00      1.00      1.00        92

    accuracy                           1.00       233
   macro avg       1.00      1.00      1.00       233
weighted avg       1.00      1.00      1.00       233

Model: K-Nearest Neighbors
              precision    recall  f1-score   support

       Fraud       0.93      0.99      0.96       141
    No Fraud       0.99      0.89      0.94        92

    accuracy                           0.95       233
   macro avg       0.96      0.94      0.95       233
weighted avg       0.95      0.95      0.95       233

Model: Random Forest
              precision    recall  f1-score   support

       Fraud       1.00      1.00      1.00       141
    No Fraud       1.00      1.00      1.00        92

    accuracy                           1.00       233
   macro avg       1.00      1.00      1.00       233
w