In [1]:
import pandas
import requests
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import BorderlineSMOTE
from imblearn import pipeline
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
#   Чтение датасета

df = pandas.read_csv("fraud_dataset.csv", sep="|")
df_for_site = df[['step', 'zipcodeOri', 'zipMerchant']].copy()
df = df.drop(columns=["Unnamed: 0", 'step', 'zipcodeOri', 'zipMerchant'])

In [3]:
#   Подготовка датасета к работе 

df = df.drop_duplicates()
df_for_predict = df.iloc[-50000:]
df = df[~df.isin(df_for_predict)].dropna()
for column in ["customer", 'amount'] + ["age", "gender", 'category', 'merchant']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])

In [4]:
#   Подготовка данных к работе

X = df.drop("fraud", axis=1)
y = df["fraud"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_test_1 = X_test.copy()
X_test_2 = X_test.copy()
X_test_3 = X_test.copy()

In [5]:
#   Модель №1
resampling = BorderlineSMOTE()
model = RandomForestClassifier(n_estimators=20)
pip_line = pipeline.Pipeline([('SMOTE', resampling), ('Logistic Regression1', model)])
pip_line.fit(X_train, y_train) 

In [6]:
#   Модель №2
model = AdaBoostClassifier(n_estimators=50, random_state=42)
model.fit(X_train, y_train)

In [7]:
#   Модель №3
new_boost = XGBClassifier()
new_boost.fit(X_train, y_train)

In [8]:
#   Предсказания
y_pred = pip_line.predict(X_test)
y_pred_ada = model.predict(X_test)
y_pred_xgbc = new_boost.predict(X_test)

In [9]:
# Evaluate the model
X_test_1['y_pred'] = y_pred
print(len(y_pred[y_pred == 1]), len(y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print('Confusion matrix:\n', confusion_matrix(y_true=y_test, y_pred=y_pred))

# Evaluate the model
X_test_2['y_pred'] = y_pred_ada
print(len(y_pred_ada[y_pred_ada == 1]), len(y_pred_ada))
print("Accuracy:", accuracy_score(y_test, y_pred_ada))
print(classification_report(y_test, y_pred_ada))
print('Confusion matrix:\n', confusion_matrix(y_true=y_test, y_pred=y_pred_ada))

# Evaluate the model
X_test_3['y_pred'] = y_pred_xgbc
print(len(y_pred_xgbc[y_pred_xgbc == 1]), len(y_pred_xgbc))
print("Accuracy:", accuracy_score(y_test, y_pred_xgbc))
print(classification_report(y_test, y_pred_xgbc))
print('Confusion matrix:\n', confusion_matrix(y_true=y_test, y_pred=y_pred_xgbc))


1513 108123
Accuracy: 0.9937571099581033
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    106739
         1.0       0.73      0.80      0.77      1384

    accuracy                           0.99    108123
   macro avg       0.87      0.90      0.88    108123
weighted avg       0.99      0.99      0.99    108123

Confusion matrix:
 [[106337    402]
 [   273   1111]]
1050 108123
Accuracy: 0.9947652210907948
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    106739
         1.0       0.89      0.67      0.77      1384

    accuracy                           0.99    108123
   macro avg       0.94      0.84      0.88    108123
weighted avg       0.99      0.99      0.99    108123

Confusion matrix:
 [[106623    116]
 [   450    934]]
1189 108123
Accuracy: 0.996050794003126
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    106739
       

In [10]:
first = X_test_1[X_test_1['y_pred'] == 1]
second = X_test_2[X_test_2['y_pred'] == 1]
third = X_test_3[X_test_3['y_pred'] == 1]
print(first)
indexes = []
for x in first.index: 
    if x not in indexes: 
        indexes.append(x)

for x in second.index: 
    if x not in indexes: 
        indexes.append(x)

for x in third.index: 
    if x not in indexes: 
        indexes.append(x)

print(len(indexes))
xxx = X_test.copy()
xxx['fraud'] = y_test
a = xxx.loc[indexes]
print(a[a['fraud'] == 1])
print(xxx)

        customer  age  gender  merchant  category  amount  y_pred
205109      1966    3       2        34         4   21477     1.0
141887      3926    4       1        28         5   20002     1.0
458049       837    1       1        34         4   19879     1.0
345633      1922    4       1        28         5   22191     1.0
268722      1686    2       1        34         4   17320     1.0
...          ...  ...     ...       ...       ...     ...     ...
232457      3425    3       1        34         4   18693     1.0
65684       1922    4       1        29         6   15233     1.0
193300      2187    2       1        22         9   18752     1.0
518400       762    3       2        34         4   20607     1.0
114324       394    4       2        34         4   18523     1.0

[1513 rows x 7 columns]
1622
        customer  age  gender  merchant  category  amount  fraud
205109      1966    3       2        34         4   21477    1.0
141887      3926    4       1        28         

In [11]:
#   Имитация работы с настроящими данными

for column in ["customer", 'amount', "age", "gender", 'category', 'merchant']:
    le = LabelEncoder()
    df_for_predict[column] = le.fit_transform(df_for_predict[column])
    
print(df_for_predict)

X = df_for_predict.drop("fraud", axis=1)

y_pred = pip_line.predict(X)
y_pred_ada = model.predict(X)
y_pred_xgbc = new_boost.predict(X)

        customer  age  gender  merchant  category  amount  fraud
544063       968    3       1         4         1    2961      0
544064      2746    2       1        17        12      37      0
544065      3627    2       2         0         4    8026      0
544066      3627    2       2        22         4      17      0
544067      1219    3       1        29        12    3654      0
...          ...  ...     ...       ...       ...     ...    ...
594638      1600    3       1        17        12    2053      0
594639      3285    4       1        17        12    5063      0
594640       514    2       1        30         2    2244      0
594641      1059    5       2        17        12    1446      0
594642      3221    4       1        17        12    2693      0

[50000 rows x 7 columns]


In [12]:
#   Подготовка данных для отправки на сайт
X['y_pred'] = y_pred
X['y_pred_ada'] = y_pred_ada
X['y_pred_xgbc'] = y_pred_xgbc
pred_columns = ['y_pred', 'y_pred_ada', 'y_pred_xgbc']
useless_columns = ['step', 'zipcodeOri', 'zipMerchant']
X['fraud'] = X[pred_columns].apply(lambda x: any(x), axis=1)
X['fraud'] = X['fraud'].apply(lambda x: 1 if x else 0)
X = X.drop(columns=pred_columns)
X[useless_columns] = df_for_site[df_for_site.index.isin(X.index)][useless_columns]

In [17]:
for i, row in X.iterrows(): 
    data = {
        'step': row.step,
        'customer': row.customer,
        'age': row.age, 
        'gender': row.gender,
        'zipcodeOri': row.zipcodeOri,
        'merchant': row.merchant,
        'zipMerchant': row.zipMerchant,
        'category': row.category,
        'amount': row.amount,
        'fraud': 'suspect' if row.fraud else 'approved'
    }
    requests.post(url="http://127.0.0.1:8000/api/v1/feedback/", data=data)
    time.sleep(1)

ConnectionError: HTTPConnectionPool(host='127.0.0.1', port=8000): Max retries exceeded with url: /api/v1/feedback/ (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000002530ADE1590>: Failed to establish a new connection: [WinError 10061] Подключение не установлено, т.к. конечный компьютер отверг запрос на подключение'))