# Задание к уроку 9

In [1]:
import gc # сборщик мусора

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import classification_report, f1_score, precision_score, recall_score


import xgboost as xgb, lightgbm as lgbm, catboost as catb
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score, ShuffleSplit, learning_curve
from sklearn import svm

import pickle
import random
import json


In [2]:
# !pip install flask_ngrok

In [3]:
from flask_ngrok import run_with_ngrok
from flask import Flask, request, jsonify
import pandas as pd

In [4]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    """Print classification quality for the train and test predictions.

    Output, in order: the sklearn classification report for the train pair,
    the same report for the test pair, the F1-score of the test pair, and a
    cross-tabulation of test labels against test predictions.

    Args:
        y_train_true: true labels of the training sample.
        y_train_pred: predicted labels for the training sample.
        y_test_true: true labels of the test sample.
        y_test_pred: predicted labels for the test sample.

    Returns:
        None. Everything is written to stdout.
    """
    sections = (
        ('TRAIN\n\n', y_train_true, y_train_pred),
        ('TEST\n\n', y_test_true, y_test_pred),
    )
    for heading, actual, predicted in sections:
        print(heading + classification_report(actual, predicted))

    test_f1 = f1_score(y_test_true, y_test_pred)
    print(f'F1-score = {test_f1}\n')

    print('CONFUSION MATRIX\n')
    confusion = pd.crosstab(y_test_true, y_test_pred)
    print(confusion)

In [5]:
# Training data file, resolved relative to the notebook's working directory.
DATASET_PATH = 'course_project_train.csv'
df = pd.read_csv(DATASET_PATH)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,debt consolidation,Short Term,99999999.0,47386.0,7914.0,749.0,0
1,Own Home,1025487.0,10+ years,0.0,15.0,15.3,1181730.0,0.0,,0.0,debt consolidation,Long Term,264968.0,394972.0,18373.0,737.0,1
2,Home Mortgage,751412.0,8 years,0.0,11.0,35.0,1182434.0,0.0,,0.0,debt consolidation,Short Term,99999999.0,308389.0,13651.0,742.0,0
3,Own Home,805068.0,6 years,0.0,8.0,22.5,147400.0,1.0,,1.0,debt consolidation,Short Term,121396.0,95855.0,11338.0,694.0,0
4,Rent,776264.0,8 years,0.0,13.0,13.6,385836.0,1.0,,0.0,debt consolidation,Short Term,125840.0,93309.0,7180.0,719.0,0


**Описание датасета**

* **Home Ownership** - домовладение
* **Annual Income** - годовой доход
* **Years in current job** - количество лет на текущем месте работы
* **Tax Liens** - налоговые обременения
* **Number of Open Accounts** - количество открытых счетов
* **Years of Credit History** - количество лет кредитной истории
* **Maximum Open Credit** - наибольший открытый кредит
* **Number of Credit Problems** - количество проблем с кредитом
* **Months since last delinquent** - количество месяцев с последней просрочки платежа
* **Bankruptcies** - банкротства
* **Purpose** - цель кредита
* **Term** - срок кредита
* **Current Loan Amount** - текущая сумма кредита
* **Current Credit Balance** - текущий кредитный баланс
* **Monthly Debt** - ежемесячный долг
* **Credit Score** - кредитный рейтинг (скоринговый балл)
* **Credit Default** - факт невыполнения кредитных обязательств (0 - погашен вовремя, 1 - просрочка)

In [6]:
# Name of the target column (0 - repaid on time, 1 - default, per the dataset description).
TARGET_NAME = 'Credit Default'

### Обработка выбросов и пропусков

In [7]:
def transform(df):
    """Clean outliers, fill gaps and merge rare categories, modifying ``df`` in place.

    Steps, applied in this order:

    1. ``Credit Score`` values above 850 are divided by 10 (treated as
       having one extra trailing digit).
    2. Missing ``Months since last delinquent`` becomes 0.
    3. Rows whose ``Credit Score`` is missing:
       * if ``Years in current job`` is present and the delinquency value is
         below 1: ``Annual Income`` = ``Monthly Debt`` * 18 and
         ``Credit Score`` = 729.0;
       * every remaining such row: ``Annual Income`` = 0 and
         ``Credit Score`` = 585.0.
    4. Missing ``Years in current job`` becomes ``'< 1 year'``.
    5. ``Current Loan Amount`` above 1 000 000 is replaced by 100 000.
    6. Job-tenure, ``Purpose`` and ``Home Ownership`` categories are merged
       into coarser groups.

    Args:
        df: pandas DataFrame containing the columns named above.

    Returns:
        The same DataFrame object that was passed in, after modification.
    """
    score = 'Credit Score'
    income = 'Annual Income'
    tenure = 'Years in current job'
    delinquency = 'Months since last delinquent'
    loan = 'Current Loan Amount'

    # Scores above the 850 ceiling: drop the extra digit.
    inflated_score = df[score] > 850
    df.loc[inflated_score, score] = df[score] / 10

    # A missing delinquency value is treated as "no delinquency recorded".
    no_delinquency_value = df[delinquency].isnull()
    df.loc[no_delinquency_value, delinquency] = 0

    # Missing score, known tenure, no delinquency: estimate the income from
    # the monthly debt and assign the 729.0 score.
    clean_history = df[score].isnull() & df[tenure].notnull() & (df[delinquency] < 1)
    df.loc[clean_history, income] = df['Monthly Debt'] * 18
    df.loc[clean_history, score] = 729.0

    # Whatever is still without a score gets the pessimistic defaults.
    still_unscored = df[score].isnull()
    df.loc[still_unscored, [income, score]] = [0, 585.0]

    # Unknown tenure falls into the smallest bucket.
    df.loc[df[tenure].isnull(), tenure] = '< 1 year'

    # Loan amounts above one million (e.g. the 99999999.0 placeholder) are reset.
    df.loc[df[loan] > 1000000, loan] = 100000

    # Categorical features: collapse neighbouring tenure buckets.
    tenure_merges = (
        (['1 year', '4 years', '3 years'], '2 years'),
        (['6 years', '7 years', '8 years', '9 years'], '5 years'),
    )
    for merged_values, replacement in tenure_merges:
        df.loc[df[tenure].isin(merged_values), tenure] = replacement

    # Rare loan purposes are pooled into a single 'other' category.
    rare_purposes = [
        'take a trip', 'buy a car', 'small business', 'business loan',
        'wedding', 'educational expenses', 'buy house', 'medical bills',
        'moving', 'major purchase', 'vacation', 'renewable energy',
    ]
    df.loc[df['Purpose'].isin(rare_purposes), 'Purpose'] = 'other'

    # 'Have Mortgage' is folded into 'Home Mortgage'.
    has_mortgage = df['Home Ownership'] == 'Have Mortgage'
    df.loc[has_mortgage, 'Home Ownership'] = 'Home Mortgage'
    return df

In [8]:
# Apply the cleaning rules to the training frame (transform edits it in place and returns it).
df = transform(df)


-------------

# Обучение модели на Credit Default

In [9]:
# Columns fed to the model, in the order the model is trained on.
SEL_FEATURES = [
    'Home Ownership',
    'Annual Income',
    'Years in current job',
    'Number of Open Accounts',
    'Years of Credit History',
    'Maximum Open Credit',
    'Months since last delinquent',
    'Purpose',
    'Term',
    'Current Loan Amount',
    'Current Credit Balance',
    'Monthly Debt',
    'Credit Score',
]

In [10]:
# df.describe()

In [11]:
# Feature matrix and target vector taken from the cleaned training frame.
Xc = df[SEL_FEATURES]
yc = df[TARGET_NAME]
# Every object-dtype feature column is passed to the model as categorical.
CAT_FEATURES = Xc.select_dtypes(include='object').columns.tolist()
# Xtest = df1[SEL_FEATURES]

In [12]:
# CatBoost classifier; class_weights=[1, 3.0] weights class 1 three times heavier than class 0.
model_catb_c = catb.CatBoostClassifier(class_weights=[1, 3.0], silent=True, n_estimators=1500,  
                                  max_depth=3, random_state=42)
# Third positional argument is the list of categorical feature names.
model_catb_c.fit(Xc, yc, CAT_FEATURES)

# Predictions on the same data the model was fitted on (no hold-out split in this cell).
y_train_pred = model_catb_c.predict(Xc)

y_train_pred_proba = model_catb_c.predict_proba(Xc)
# Disabled experiment: custom probability threshold instead of the default class decision.
# threshold_prob = 0.53
# predict_train = np.array([1 if x>=threshold_prob else 0 for x in y_train_pred_proba[:,1]])

# get_classification_report(yc, y_train_pred, yc, predict_train)

In [13]:
# Persist the fitted model to model.pkl in the working directory.
with open('model.pkl', 'wb') as file:
    pickle.dump(model_catb_c, file)

In [14]:
# Reload the model saved above; the context manager guarantees the file
# handle is closed once unpickling finishes.
# NOTE: pickle.load can execute arbitrary code - only load files produced by
# this notebook or another trusted source.
with open('model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

In [15]:
# Sample request payload with every model feature present
# ("Annual Income" 0.0 and "Credit Score" 585.0 are given explicitly).
jn = {'ID':23,"Home Ownership":"Rent","Annual Income":0.0,"Years in current job":"2 years","Tax Liens":0.0,
      "Number of Open Accounts":9.0,"Years of Credit History":12.5,"Maximum Open Credit":220968.0,
      "Number of Credit Problems":0.0,"Months since last delinquent":70.0,"Purpose":"debt consolidation","Term":"Short Term",
      "Current Loan Amount":162470.0,"Current Credit Balance":105906.0,"Monthly Debt":6813.0,"Credit Score":585.0}

In [16]:
# Sample request payload without "Annual Income" and "Credit Score" keys,
# so both values have to be filled in by transform().
jn1 = {'ID':2,"Home Ownership":"Home Mortgage","Years in current job":"10+ years","Tax Liens":0.0,
       "Number of Open Accounts":16.0,"Years of Credit History":17.0,"Maximum Open Credit":456302.0,
       "Number of Credit Problems":0.0,"Months since last delinquent":70.0,"Purpose":"debt consolidation",
       "Term":"Short Term","Current Loan Amount":217382.0,"Current Credit Balance":213199.0,"Monthly Debt":27559.0}

In [17]:
# Sample request payload whose "Credit Score" (7260.0) is above 850,
# so transform() divides it by 10.
jn2 = {'ID':3,"Home Ownership":"Home Mortgage","Annual Income":1152540.0,"Years in current job":"2 years","Tax Liens":0.0,
       "Number of Open Accounts":10.0,"Years of Credit History":13.7,"Maximum Open Credit":204600.0,
       "Number of Credit Problems":0.0,"Months since last delinquent":0.0,"Purpose":"debt consolidation","Term":"Short Term",
       "Current Loan Amount":200178.0,"Current Credit Balance":146490.0,"Monthly Debt":18729.0,"Credit Score":7260.0}

In [18]:
# Empty frame with exactly the model's feature columns.
ddf = pd.DataFrame(columns = SEL_FEATURES)
# Add the payload as one row indexed by its ID; as the output below shows,
# only the SEL_FEATURES columns end up in the frame.
ddf.loc[jn['ID']] = jn
# ddf.loc[jn1['ID']] = jn1
# ddf.loc[jn2['ID']] = jn2
ddf

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Months since last delinquent,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score
23,Rent,0.0,2 years,9.0,12.5,220968.0,70.0,debt consolidation,Short Term,162470.0,105906.0,6813.0,585.0


In [19]:
# Run the same cleaning used for training on the single-row frame, then display it.
ddf = transform(ddf)
ddf

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Months since last delinquent,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score
23,Rent,0.0,2 years,9.0,12.5,220968.0,70.0,debt consolidation,Short Term,162470.0,105906.0,6813.0,585.0


In [20]:
# Class label and per-class probabilities from the reloaded model for the prepared row.
y_pred = loaded_model.predict(ddf)
y_pred_proba = loaded_model.predict_proba(ddf)
# Show both as the cell output.
y_pred, y_pred_proba

(array([1], dtype=int64), array([[0.31882937, 0.68117063]]))

-----

# Запуск Flask

In [21]:
# app = Flask(__name__)
# run_with_ngrok(app)  # Start ngrok when app is run

# @app.route("/a")
# def hello():
#     print('Testttttttt')
#     return "Hello World!"

# if __name__ == '__main__':
#     app.run()

In [26]:
# Request handlers and Flask start-up
app = Flask(__name__)
run_with_ngrok(app)  # Start ngrok when app is run

def transform(df):
    """Prepare a feature frame for the model, editing ``df`` in place.

    This definition shadows the ``transform`` declared earlier in the
    notebook and applies the same rules:

    * ``Credit Score`` above 850 is divided by 10;
    * missing ``Months since last delinquent`` is set to 0;
    * a missing ``Credit Score`` is filled with 729.0 (and ``Annual Income``
      with ``Monthly Debt`` * 18) when ``Years in current job`` is present
      and the delinquency value is below 1, otherwise with 585.0 (and
      ``Annual Income`` with 0);
    * missing ``Years in current job`` becomes ``'< 1 year'``;
    * ``Current Loan Amount`` above 1 000 000 becomes 100 000;
    * job-tenure, ``Purpose`` and ``Home Ownership`` categories are merged.

    Args:
        df: pandas DataFrame holding the columns referenced above.

    Returns:
        The very same DataFrame object, modified.
    """
    credit_score = 'Credit Score'
    months_delinquent = 'Months since last delinquent'
    job_years = 'Years in current job'

    # Outlier scores: an extra trailing digit is assumed, so scale down.
    df.loc[df[credit_score] > 850, credit_score] = df[credit_score] / 10

    # No delinquency value entered -> treat as zero months.
    df.loc[df[months_delinquent].isnull(), months_delinquent] = 0

    # Score missing, tenure known, no delinquency: derive income, use 729.0.
    imputable = (
        df[credit_score].isnull()
        & df[job_years].notnull()
        & (df[months_delinquent] < 1)
    )
    df.loc[imputable, 'Annual Income'] = df['Monthly Debt'] * 18
    df.loc[imputable, credit_score] = 729.0

    # Remaining rows without a score get the fallback pair.
    df.loc[df[credit_score].isnull(), ['Annual Income', credit_score]] = [0, 585.0]

    # Unknown tenure goes to the lowest bucket.
    df.loc[df[job_years].isnull(), job_years] = '< 1 year'

    # Loan amounts above one million (e.g. 99999999.0) are reset to 100 000.
    df.loc[df['Current Loan Amount'] > 1000000, 'Current Loan Amount'] = 100000

    # Category merges, applied in order: (column, values to replace, new value).
    category_merges = [
        (job_years, ['1 year', '4 years', '3 years'], '2 years'),
        (job_years, ['6 years', '7 years', '8 years', '9 years'], '5 years'),
        ('Purpose',
         ['take a trip', 'buy a car', 'small business', 'business loan', 'wedding',
          'educational expenses', 'buy house', 'medical bills', 'moving',
          'major purchase', 'vacation', 'renewable energy'],
         'other'),
    ]
    for column, old_values, new_value in category_merges:
        df.loc[df[column].isin(old_values), column] = new_value

    df.loc[df['Home Ownership'] == 'Have Mortgage', 'Home Ownership'] = 'Home Mortgage'
    return df

# Feature columns expected by the model; repeated here so the serving cell
# carries its own copy of the list.
SEL_FEATURES = [
    'Home Ownership',
    'Annual Income',
    'Years in current job',
    'Number of Open Accounts',
    'Years of Credit History',
    'Maximum Open Credit',
    'Months since last delinquent',
    'Purpose',
    'Term',
    'Current Loan Amount',
    'Current Credit Balance',
    'Monthly Debt',
    'Credit Score',
]

@app.route('/predict', methods=['GET', 'POST'])
def predict3():
    """Score a single loan application supplied as a JSON object.

    The request body must be a JSON object with an ``"ID"`` key and feature
    fields named as in ``SEL_FEATURES``. The payload becomes a one-row frame
    indexed by ``ID``, is cleaned with ``transform`` and scored with
    ``loaded_model``.

    Returns:
        * HTTP 200 and JSON ``{"ID", "Credit Default",
          "Credit Default_probability_1"}`` on success;
        * HTTP 400 and the text ``"Error"`` when the body is not a JSON
          object or has no ``"ID"`` key;
        * HTTP 500 and the text ``"Error"`` when preprocessing or the model
          fails (the traceback is written to the application log).
    """
    # silent=True yields None instead of raising on a missing or malformed
    # JSON body, so bad requests are rejected explicitly below.
    json_input = request.get_json(silent=True)
    if not isinstance(json_input, dict) or "ID" not in json_input:
        return "Error", 400

    print(json_input)
    ID = json_input["ID"]

    try:
        # Empty frame with the model's columns; assigning the dict adds one
        # row labelled by the request ID.
        ddf = pd.DataFrame(columns=SEL_FEATURES)
        ddf.loc[ID] = json_input
        ddf = transform(ddf)

        y_pred = loaded_model.predict(ddf)
        y_pred_proba = loaded_model.predict_proba(ddf)
        print(y_pred, y_pred_proba)

        return jsonify({'ID': ID, 'Credit Default': int(y_pred[0]) , 'Credit Default_probability_1': float(y_pred_proba[0,1])})
    except Exception:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt/SystemExit are not swallowed; keep the traceback
        # and report the failure with a server-error status.
        app.logger.exception("Prediction failed for request ID %r", ID)
        return "Error", 500



# Start the Flask server when the cell is executed as the main module;
# this call blocks until the server is stopped.
if __name__ == '__main__':
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://4479cb7e4f72.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


127.0.0.1 - - [29/Nov/2020 22:01:39] "POST /predict HTTP/1.1" 200 -


{'ID': 23, 'Home Ownership': 'Rent', 'Annual Income': 0.0, 'Years in current job': '2 years', 'Tax Liens': 0.0, 'Number of Open Accounts': 9.0, 'Years of Credit History': 12.5, 'Maximum Open Credit': 220968.0, 'Number of Credit Problems': 0.0, 'Months since last delinquent': 70.0, 'Bankruptcies': 0.0, 'Purpose': 'debt consolidation', 'Term': 'Short Term', 'Current Loan Amount': 162470.0, 'Current Credit Balance': 105906.0, 'Monthly Debt': 6813.0, 'Credit Score': 585.0}
[1] [[0.31882937 0.68117063]]


127.0.0.1 - - [29/Nov/2020 22:01:41] "POST /predict HTTP/1.1" 200 -


{'ID': 2, 'Home Ownership': 'Home Mortgage', 'Years in current job': '10+ years', 'Tax Liens': 0.0, 'Number of Open Accounts': 16.0, 'Years of Credit History': 17.0, 'Maximum Open Credit': 456302.0, 'Number of Credit Problems': 0.0, 'Months since last delinquent': 70.0, 'Purpose': 'debt consolidation', 'Term': 'Short Term', 'Current Loan Amount': 217382.0, 'Current Credit Balance': 213199.0, 'Monthly Debt': 27559.0}
[1] [[0.44816659 0.55183341]]


127.0.0.1 - - [29/Nov/2020 22:01:42] "POST /predict HTTP/1.1" 200 -


{'ID': 3, 'Home Ownership': 'Home Mortgage', 'Annual Income': 1152540.0, 'Years in current job': '2 years', 'Tax Liens': 0.0, 'Number of Open Accounts': 10.0, 'Years of Credit History': 13.7, 'Maximum Open Credit': 204600.0, 'Number of Credit Problems': 0.0, 'Months since last delinquent': 0.0, 'Bankruptcies': 0.0, 'Purpose': 'debt consolidation', 'Term': 'Short Term', 'Current Loan Amount': 200178.0, 'Current Credit Balance': 146490.0, 'Monthly Debt': 18729.0, 'Credit Score': 7260.0}
[0] [[0.50262446 0.49737554]]
