In [89]:
import pandas as pd
import pickle as pickle

import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder

from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


In [90]:
# get training data
train = pd.read_csv("./data/training_data.csv")
# drop customer ID: not a feature for training


# getting validation data
val = pd.read_csv("./data/validation_data.csv")

category_columns = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 
                'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 
                'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
                'PaperlessBilling', 'PaymentMethod', 'Churn']

In [91]:
####### ORIGINAL TRAINING ENCODING #######
column_mapper = {}   
def encode_training(train_data, category_columns=category_columns):
    # drop customer ID: not a feature for training
    train_data.drop("customerID", axis=1, inplace=True)

    # deal witht the space in TotalCharges
    train_data['TotalCharges'].loc[train_data['TotalCharges'] == " "] = '0.0'
    train_data['TotalCharges'] = pd.to_numeric(train_data['TotalCharges'], errors='coerce')
    
    # create a OneHotEncoder object
    encoded_columns = []
    for col in category_columns:
        ohe = OneHotEncoder(drop='if_binary')
        encoded_column = ohe.fit_transform(train_data[[col]])
        column_mapper.update({col: ohe})
        encoded_columns.append(pd.DataFrame(encoded_column.toarray(), columns=ohe.get_feature_names_out([col])))

    # combine the encoded columns with the original dataframe
    encoded_df = pd.concat([train_data.drop(category_columns, axis=1)] + encoded_columns, axis=1)

    return encoded_df, column_mapper



In [92]:
train_encoded, column_mapper = encode_training(train, category_columns=category_columns)

with open('encoding.pkl', 'wb') as f:
    pickle.dump(column_mapper, f)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['TotalCharges'].loc[train_data['TotalCharges'] == " "] = '0.0'


In [93]:
####### LATER DATA ENCODING BASED ON TRAINING #######
def preprocessing(data, encoder_dict):
    # drop customer ID: not a feature for training
    data.drop("customerID", axis=1, inplace=True)

    # deal witht the space in TotalCharges
    data['TotalCharges'].loc[data['TotalCharges'] == " "] = '0.0'
    data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
 
    # apply the column mapper
    for col in category_columns:
        ohe = encoder_dict[col]
        encoded_column = ohe.transform(data[[col]])
        encoded_df = pd.DataFrame(encoded_column.toarray(), columns=ohe.get_feature_names_out([col]))
        data = pd.concat([data.drop(col, axis=1), encoded_df], axis=1)

    return data

val_encoded = preprocessing(val, column_mapper)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['TotalCharges'].loc[data['TotalCharges'] == " "] = '0.0'


In [94]:
train_encoded

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No,MultipleLines_No phone service,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn_Yes
0,0,5,75.15,392.65,1.0,0.0,0.0,1.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0,66,63.85,4264.60,1.0,1.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0,42,73.15,3088.25,1.0,1.0,1.0,1.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0,19,69.60,1394.55,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0,59,20.20,1192.30,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5277,0,1,20.20,20.20,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5278,0,2,76.40,151.80,1.0,1.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
5279,0,58,68.40,3972.25,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
5280,0,1,75.70,75.70,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0


In [95]:
####### MODEL TRAINING  #######    
def train(train_encoded, val_encoded):
    #split the data into x and y for training and testing
    X_train = train_encoded.drop(['Churn_Yes'], axis=1)
    y_train = train_encoded['Churn_Yes']
    X_test = val_encoded.drop(['Churn_Yes'], axis=1)
    y_test = val_encoded['Churn_Yes']
    
    #initialize and fit the decisiontreeclassifier
    dtc = tree.DecisionTreeClassifier(max_depth=5,random_state=42,criterion='gini')
    dtc.fit(X_train,y_train)

    y_pred = dtc.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    with open('output.txt', 'w') as f:
    # Redirect print statements to the file
        f.write(X_train.to_string(index=False, col_space=10))

    return dtc, accuracy


In [96]:
dtc, accuracy = train(train_encoded, val_encoded)
print('DecisionTreeClassifier accuracy score: {}'.format(accuracy))




DecisionTreeClassifier accuracy score: 0.8304862023653088


In [97]:
# save the model to pickle
with open('model.pkl', 'wb') as f:
    pickle.dump(dtc, f)

In [98]:
def predict(dtc, test_data):
    # Make predictions here
    y_pred = dtc.predict(test_data)
    return  y_pred

In [99]:
def generate_predictions(data):
    # encode the data 
    col_mapper = pickle.load(open('encoding.pkl','rb'))

    #data_encoded = encode(data)
    data_encoded = preprocessing(data, col_mapper)
    data_encoded = data_encoded.drop(['Churn_Yes'], axis=1)

    # Loading model to compare the results
    model = pickle.load(open('model.pkl','rb'))
    prediction = model.predict(data_encoded)
    #prediction = make_predictions(data_encoded, model)
    
    return prediction

In [100]:
customer_data = pd.read_csv("./data/single_row_to_check.csv")
pred = generate_predictions(customer_data)
if bool(pred):
    print("Customer will churn!")
else:
    print("Customer not predicted to churn")

Customer not predicted to churn


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['TotalCharges'].loc[data['TotalCharges'] == " "] = '0.0'
