In [68]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ML_utils import load_data,load_data_test_set, total_day_eve_night_grouping
df =pd.read_csv('train.csv')

### UNCOMMENT THIS PART TO GROUP FEATURES
feature_engineering = True
df=total_day_eve_night_grouping(df,grouping=feature_engineering)
exclude_list = [feature_name for feature_name in df.columns if feature_name not in['number_vmail_messages','number_customer_service_calls','total_charges','churn']]
one_hot_on = True
normalize_on = True
oversample_on=False


###################
#### LOAD DATA ####

X_train,y_train,X_val,y_val,X_test,y_test,columns = load_data(df,exclude=exclude_list,one_hot=one_hot_on,normalize_=normalize_on,oversample=oversample_on)



X_train.shape

(2975, 3)

In [69]:
from sklearn.tree import DecisionTreeRegressor
def predict(X,model,lr):
        y_pred =model[0]
        for i in range(1,len(model)):
            weak_learner_prediction = model[i].predict(X).reshape(-1,1)
            y_pred+=lr*weak_learner_prediction
        return y_pred
    
    
def train_boosted_tree(X_train,y_train,depth=10,lr=0.01,n_estimators=100):
    depth=10
    lr=0.01
    n_estimators=100

    clf = DecisionTreeRegressor(max_depth=depth)
    initial_predict=np.mean(y_train)
    model=[initial_predict]

    prediction=initial_predict*np.ones((X_train.shape[0],1))

    for i in range(n_estimators):
        pseudo_residuals = y_train - prediction
        clf.fit(X_train, pseudo_residuals)
        model.append(clf)
        prediction = predict(X_train,model,lr)
        print(prediction.shape)
        print('Iteration:',i)
        print('MSE:',np.mean((y_train-prediction)**2))
        print('----------------------------------------')
        
    return model
        


In [70]:
model = train_boosted_tree(X_train,y_train,depth=10,lr=0.01,n_estimators=100)
y_pred = predict(X_val,model,0.01)

(2975, 1)
Iteration: 0
MSE: 0.12324520109292363
----------------------------------------
(2975, 1)
Iteration: 1
MSE: 0.12169913395250463
----------------------------------------
(2975, 1)
Iteration: 2
MSE: 0.12019856713828439
----------------------------------------
(2975, 1)
Iteration: 3
MSE: 0.11874172013655454
----------------------------------------
(2975, 1)
Iteration: 4
MSE: 0.1173268992638551
----------------------------------------
(2975, 1)
Iteration: 5
MSE: 0.11595249260402513
----------------------------------------
(2975, 1)
Iteration: 6
MSE: 0.1146172791000083
----------------------------------------
(2975, 1)
Iteration: 7
MSE: 0.11322032951007564
----------------------------------------
(2975, 1)
Iteration: 8
MSE: 0.11205145568134034
----------------------------------------
(2975, 1)
Iteration: 9
MSE: 0.11083017972115722
----------------------------------------
(2975, 1)
Iteration: 10
MSE: 0.10949994548741966
----------------------------------------
(2975, 1)
Iteration: 1

In [71]:
    

print('Validation MSE:',np.mean((y_val-y_pred)**2))

from sklearn.metrics import accuracy_score

print('Validation Accuracy:',accuracy_score(y_val,np.round(y_pred)))
        
    
    
    
    




Validation MSE: 0.06402708980822913
Validation Accuracy: 0.9513343799058085


In [73]:
### USE THE MODEL ON THE TEST SET
df_test=pd.read_csv('test.csv')

df_test=total_day_eve_night_grouping(df_test,grouping=feature_engineering)


X_test_output=load_data_test_set(df_test,exclude=exclude_list+['id'],one_hot=one_hot_on,normalize_=normalize_on)

y_pred_output = predict(X_test_output,model,0.01)

y_pred_output=np.where(y_pred_output>0.5,'yes', 'no')
y_pred_output=y_pred_output.reshape(-1,)
id_column = np.arange(1, y_pred_output.shape[0] + 1)


# Create a DataFrame
df_output = pd.DataFrame({
    'id': id_column,
    'churn': y_pred_output
})


# Save the DataFrame as a CSV file
df_output.to_csv('output_gboost.csv', index=False)