In [2]:
import pandas as pd
import dill
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
from sklearn.metrics import f1_score

#working with text
from sklearn.feature_extraction.text import TfidfVectorizer

#normalizing data
from sklearn.preprocessing import StandardScaler

#pipeline
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import precision_score,recall_score

#imputer
from sklearn.impute import SimpleImputer

import sklearn.datasets

In [3]:
df = pd.read_csv("churn_data_lesson_9.csv")
df.head(5)

Unnamed: 0,customerID,tenure,PhoneService,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,1,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,34,Yes,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,2,Yes,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,45,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,2,Yes,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df_2 = pd.read_csv("customer_data_les_9.csv")
df_2.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents
0,7590-VHVEG,Female,0,Yes,No
1,5575-GNVDE,Male,0,No,No
2,3668-QPYBK,Male,0,No,No
3,7795-CFOCW,Male,0,No,No
4,9237-HQITU,Female,0,No,No


In [5]:
df_3 = pd.read_csv("internet_data_les_9.csv")
df_3.head(5)

Unnamed: 0,customerID,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,7590-VHVEG,No phone service,DSL,No,Yes,No,No,No,No
1,5575-GNVDE,No,DSL,Yes,No,Yes,No,No,No
2,3668-QPYBK,No,DSL,Yes,Yes,No,No,No,No
3,7795-CFOCW,No phone service,DSL,Yes,No,Yes,Yes,No,No
4,9237-HQITU,No,Fiber optic,No,No,No,No,No,No


In [6]:
data=df.merge(df_2,how='inner',on='customerID').merge(df_3,how='inner',on='customerID').drop('customerID',axis=1)
data['Churn']=data['Churn'].replace({'No': 0, 'Yes': 1})
data.head(5)

Unnamed: 0,tenure,PhoneService,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,gender,SeniorCitizen,Partner,Dependents,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,1,No,Month-to-month,Yes,Electronic check,29.85,29.85,0,Female,0,Yes,No,No phone service,DSL,No,Yes,No,No,No,No
1,34,Yes,One year,No,Mailed check,56.95,1889.5,0,Male,0,No,No,No,DSL,Yes,No,Yes,No,No,No
2,2,Yes,Month-to-month,Yes,Mailed check,53.85,108.15,1,Male,0,No,No,No,DSL,Yes,Yes,No,No,No,No
3,45,No,One year,No,Bank transfer (automatic),42.3,1840.75,0,Male,0,No,No,No phone service,DSL,Yes,No,Yes,Yes,No,No
4,2,Yes,Month-to-month,Yes,Electronic check,70.7,151.65,1,Female,0,No,No,No,Fiber optic,No,No,No,No,No,No


In [7]:
X_train, X_test, y_train, y_test = train_test_split(data, data['Churn'],
                                                    test_size=0.33, random_state=42)
# save test
X_test.to_csv("X_test.csv", index=None)
y_test.to_csv("y_test.csv", index=None)

# save train
X_train.to_csv("X_train.csv", index=None)
y_train.to_csv("y_train.csv", index=None)

In [8]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    

class TextImputer(BaseEstimator, TransformerMixin):
    def __init__(self, key, value):
        self.key = key
        self.value = value
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X[self.key] = X[self.key].fillna(self.value)
        return X  

In [9]:
features = ['PaymentMethod', 'Contract', 'TotalCharges','gender','TechSupport']
target = 'Churn'

In [10]:
# combine
paymentMethod = Pipeline([
                ('imputer', TextImputer('PaymentMethod', '')),
                ('selector', ColumnSelector(key='PaymentMethod')),
                ('tfidf', TfidfVectorizer())
            ])

contract = Pipeline([
                ('imputer', TextImputer('Contract', '')),
                ('selector', ColumnSelector(key='Contract')),
                ('tfidf', TfidfVectorizer())
            ])

gender = Pipeline([
                ('imputer', TextImputer('gender', '')),
                ('selector', ColumnSelector(key='gender')),
                ('tfidf', TfidfVectorizer())
            ])

techSupport = Pipeline([
                ('imputer', TextImputer('TechSupport', '')),
                ('selector', ColumnSelector(key='TechSupport')),
                ('tfidf', TfidfVectorizer())
            ])

totalCharges = Pipeline([
                ('selector', ColumnSelector(key='TotalCharges')),
                ('tfidf', TfidfVectorizer())
            ])


feats = FeatureUnion([('paymentMethod', paymentMethod),
                      ('contract', contract),
                      ('gender', gender),
                      ('techSupport', techSupport),
                      ('totalCharges', totalCharges)])

In [11]:
pipeline = Pipeline([
    ('features', feats),
    ('classifier', LogisticRegression()),
])

pipeline.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('paymentMethod',
                                                 Pipeline(steps=[('imputer',
                                                                  TextImputer(key='PaymentMethod',
                                                                              value='')),
                                                                 ('selector',
                                                                  ColumnSelector(key='PaymentMethod')),
                                                                 ('tfidf',
                                                                  TfidfVectorizer())])),
                                                ('contract',
                                                 Pipeline(steps=[('imputer',
                                                                  TextImputer(key='Contract',
                                                             

In [12]:
with open("logreg_pipeline.dill", "wb") as f:
    dill.dump(pipeline, f)

In [13]:
X_test = pd.read_csv("X_test.csv")
y_test = pd.read_csv("y_test.csv")

In [14]:
with open('logreg_pipeline.dill', 'rb') as in_strm:
    pipeline = dill.load(in_strm)

In [15]:
preds = pipeline.predict_proba(X_test)[:, 1]

pred_df = pd.DataFrame({'preds': preds})
pred_df.to_csv("test_predictions.csv", index=None)

In [16]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.3051510460334217, F-Score=0.616, Precision=0.500, Recall=0.804


In [17]:
from flask import Flask, request, jsonify

In [18]:
import dill

with open('logreg_pipeline.dill', 'rb') as in_strm:
    model = dill.load(in_strm)

In [19]:
#Сервер
# Обработчики и запуск Flask
app = Flask(__name__)
#run_with_ngrok(app)  # Start ngrok when app is run


@app.route("/", methods=["GET"])
def general():
    return "Welcome to prediction process"

@app.route('/predict', methods=['POST'])
def predict():
    data = {"success": False}

    # ensure an image was properly uploaded to our endpoint
    paymentMethod, contract, totalCharges,gender,techSupport = "", "","","", ""
    request_json = request.get_json()
    
    if request_json["paymentMethod"]:
        description = request_json['paymentMethod']
    
    if request_json["contract"]:
        company_profile = request_json['contract']
                
    if request_json["totalCharges"]:
        benefits = request_json['totalCharges']

    if request_json["gender"]:
        benefits = request_json['gender']    

    if request_json["techSupport"]:
        benefits = request_json['techSupport']  
    
    print(contract)  
    preds = model.predict_proba(pd.DataFrame({"PaymentMethod": [paymentMethod],
                                              "Contract": [contract],
                                              "TotalCharges": [totalCharges],
                                              "gender": [gender],
                                              "TechSupport": [techSupport]
                                              }))
    data["predictions"] = preds[:, 1][0]
        # indicate that the request was a success
    data["success"] = True
    print('OK')
    # data["result"]="what ever i want"

        # return the data dictionary as a JSON response
    return jsonify(data)


if __name__ == '__main__':
    app.run()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit


In [20]:
#сторона клиента

paymentMethod, contract, totalCharges,gender,techSupport = ("Electronic check", "Month-to-month","29.85","Female", "No")

body = {
        'paymentMethod': paymentMethod, 
        'contract': contract,
        'totalCharges': totalCharges,
        'gender': gender,
        'techSupport': techSupport
        }

In [21]:
with app.test_client() as t:
    response = t.post('/predict', json=body)
    json_data = response.get_json()

json_data


OK


{'predictions': 0.10144233045054486, 'success': True}