In [16]:
!pip install -U xgboost -q



In [17]:
!pip install --upgrade twisted -q



In [18]:
!pip install attrs==19.2.0 -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com -q



In [19]:
import pandas as pd

In [20]:
pd.set_option('max_columns', None)

In [21]:
df = pd.read_csv('/datasets/telco-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')

target = ['Churn']
categorical_cols = ['gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod']
continuous_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

df.drop(df[df['TotalCharges'].str.strip() == ''].index, inplace=True)
df.reset_index(inplace=True, drop=True)
orig = df.copy()

df = pd.get_dummies(df.drop(['customerID'], axis=1), columns = categorical_cols, drop_first=True)
df = pd.get_dummies(df, columns = target, drop_first=True)
df.rename(columns={'Churn_Yes': 'Churn'}, inplace=True)

def min_max_normalize(col):
    return col / col.abs().max()

for col in continuous_cols:
    df[col] = df[col].astype('float64')
    df[col] = min_max_normalize(df[col])

In [33]:
temp = pd.DataFrame(orig.iloc[0, :]).T

In [34]:
temp.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No


In [36]:
temp.to_json(orient='records')

'[{"customerID":"7590-VHVEG","gender":"Female","SeniorCitizen":0,"Partner":"Yes","Dependents":"No","tenure":1,"PhoneService":"No","MultipleLines":"No phone service","InternetService":"DSL","OnlineSecurity":"No","OnlineBackup":"Yes","DeviceProtection":"No","TechSupport":"No","StreamingTV":"No","StreamingMovies":"No","Contract":"Month-to-month","PaperlessBilling":"Yes","PaymentMethod":"Electronic check","MonthlyCharges":29.85,"TotalCharges":"29.85","Churn":"No"}]'

## Train-Test Split

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.drop(['Churn'],axis=1), df['Churn'], test_size=0.25, random_state=17)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=17)

## XGBoost

In [19]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

xgb_cl = xgb.XGBClassifier(use_label_encoder=False)

# Fit
xgb_cl.fit(X_train, y_train)

# Predict
preds = xgb_cl.predict(X_test)
print(accuracy_score(y_test, preds))

0.7764505119453925


In [20]:
train_orig = orig[orig.index.isin(X_train.index)]
train_orig.reset_index(drop=True, inplace=True)

train_preds = xgb_cl.predict(X_train)

train_pred = pd.DataFrame(train_preds, columns=['Prediction'])
train_pred['Predicted_Churn'] = 'No'
train_pred.loc[train_pred['Prediction'] == 1, 'Predicted_Churn'] = 'Yes'
train_pred.drop(['Prediction'], axis=1, inplace=True)

combined_train_df = pd.concat([train_orig.reset_index(drop=True), train_pred], axis=1)

In [21]:
test_orig = orig[orig.index.isin(X_test.index)]
test_orig.reset_index(drop=True, inplace=True)

test_preds = xgb_cl.predict(X_test)

test_pred = pd.DataFrame(test_preds, columns=['Prediction'])
test_pred['Predicted_Churn'] = 'No'
test_pred.loc[test_pred['Prediction'] == 1, 'Predicted_Churn'] = 'Yes'
test_pred.drop(['Prediction'], axis=1, inplace=True)

combined_test_df = pd.concat([test_orig.reset_index(drop=True), test_pred], axis=1)
combined_test_df.drop(['Churn'], axis=1, inplace=True)

## Arize

In [22]:
!pip install arize -q



In [23]:
from arize.pandas.logger import Client, Schema
from arize.utils.types import Environments, ModelTypes

SPACE_KEY = "be431ff"
API_KEY = "3b6a922ea5ac74536e7"

arize_client = Client(space_key=SPACE_KEY, api_key=API_KEY)

model_id = (
    "telco-churn-demo-model"  # This is the model name that will show up in Arize
)
model_version = "v1.0"  # Version of model - can be any string

if SPACE_KEY == "SPACE_KEY" or API_KEY == "API_KEY":
    raise ValueError("❌ NEED TO CHANGE SPACE AND/OR API_KEY")
else:
    print("✅ Arize setup complete!")

✅ Arize setup complete!


In [24]:
feature_cols = combined_train_df.drop(['customerID', 'Churn', 'Predicted_Churn'], axis=1).columns

In [73]:
# Define a Schema() object for Arize to pick up data from the correct columns for logging
training_schema = Schema(
    prediction_id_column_name="customerID",
    prediction_label_column_name="Predicted_Churn",
    actual_label_column_name="Churn",
    feature_column_names=feature_cols,
)

# Logging Training DataFrame
training_response = arize_client.log(
    dataframe=combined_train_df,
    model_id=model_id,
    model_version=model_version,
    model_type=ModelTypes.SCORE_CATEGORICAL,
    environment=Environments.TRAINING,
    schema=training_schema,
)

# If successful, the server will return a status_code of 200
if training_response.status_code != 200:
    print(
        f"logging failed with response code {training_response.status_code}, {training_response.text}"
    )
else:
    print(f"✅ You have successfully logged training set to Arize")

Success! Check out your data at https://app.arize.com/organizations/QWNjb3VudE9yZ2FuaXphdGlvbjo0Mjg=/spaces/U3BhY2U6NDUx/models/modelName/telco-churn-demo-model?selectedTab=dataIngestion


✅ You have successfully logged training set to Arize


In [25]:
single_pred = pd.DataFrame(combined_test_df.iloc[0, :]).T

In [26]:
## Test a single Prod prediction

# Define a Schema() object for Arize to pick up data from the correct columns for logging
## Add Dates in here somewhere!
prod_schema = Schema(
    prediction_id_column_name="customerID",
    prediction_label_column_name="Predicted_Churn",
    # actual_label_column_name="Churn",
    feature_column_names=feature_cols,
)

# Logging Prod DataFrame
prod_response = arize_client.log(
    dataframe=single_pred,
    model_id=model_id,
    model_version=model_version,
    model_type=ModelTypes.SCORE_CATEGORICAL,
    environment=Environments.PRODUCTION,
    schema=prod_schema,
)

# If successful, the server will return a status_code of 200
if prod_response.status_code != 200:
    print(
        f"logging failed with response code {prod_response.status_code}, {prod_response.text}"
    )
else:
    print(f"✅ You have successfully logged training set to Arize")

Success! Check out your data at https://app.arize.com/organizations/QWNjb3VudE9yZ2FuaXphdGlvbjo0Mjg=/spaces/U3BhY2U6NDUx/models/modelName/telco-churn-demo-model?selectedTab=dataIngestion


✅ You have successfully logged training set to Arize
