In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ML_utils import load_data,load_data_test_set, total_day_eve_night_grouping
# Raw training table.
df = pd.read_csv('train.csv')

# Switch handed to total_day_eve_night_grouping as `grouping`.
# NOTE(review): presumably False leaves the day/eve/night columns ungrouped —
# confirm against ML_utils.
feature_engineering = True
df = total_day_eve_night_grouping(df, grouping=feature_engineering)

# Columns forwarded to load_data as `exclude`.
exclude_list = ['state', 'area_code']

# Alternative exclude list that keeps only a hand-picked subset of columns:
# exclude_list = [feature_name for feature_name in df.columns if feature_name not in['number_vmail_messages','number_customer_service_calls','total_charges','churn']]

# Preprocessing switches forwarded to load_data (and reused for the test set).
one_hot_on = True
normalize_on = True
oversample_on = True

###################
#### LOAD DATA ####

(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    columns,
) = load_data(
    df,
    exclude=exclude_list,
    one_hot=one_hot_on,
    normalize_=normalize_on,
    oversample=oversample_on,
)

# Displayed as the cell output: shape of the training feature matrix.
X_train.shape



(2975, 11)

In [6]:
## Implementing AdaBoost
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier

from sklearn.metrics import accuracy_score
import numpy as np

def predict(X,epoch,weak_learners,importances):
    """Weighted-vote prediction from the first ``epoch`` weak learners.

    Parameters
    ----------
    X : array-like with a ``shape`` attribute
        Samples to classify; ``X.shape[0]`` predictions are produced.
    epoch : int
        Number of leading entries of ``weak_learners`` / ``importances`` to use.
    weak_learners : sequence
        Fitted models exposing ``predict(X)``.
    importances : sequence of float
        Vote weight of each learner, aligned with ``weak_learners``.

    Returns
    -------
    numpy.ndarray
        Float array holding 1 where the weighted vote is >= 0 and 0 where it
        is negative (a tie therefore counts as class 1).
    """
    weighted_vote = np.zeros(X.shape[0])

    for round_idx in range(epoch):
        votes = weak_learners[round_idx].predict(X)
        # Map the learner's output onto {-1, +1}, in place, before weighting.
        is_positive = votes > 0.5
        is_negative = votes <= 0.5
        votes[is_positive] = 1
        votes[is_negative] = -1
        weighted_vote += importances[round_idx] * votes

    # Collapse the signed score to a 0/1 label.
    non_negative = weighted_vote >= 0
    negative = weighted_vote < 0
    weighted_vote[non_negative] = 1
    weighted_vote[negative] = 0
    return weighted_vote


In [7]:

# Ensemble sizes to try.
epoch_list = [10, 20, 50, 100, 400]

# Number of training rows (first dimension of X_train).
train_examples_length = X_train.shape[0]
def ada_boosting(X_train,y_train,epoch_list,weak_learners=None,importances=None,best_acc=0,best_epoch=0):
    """Train a resampling-style AdaBoost chain of depth-2 trees and pick the best size.

    Each round fits a ``DecisionTreeClassifier(max_depth=2)`` on the current
    training sample, derives its vote weight from the misclassification rate
    under uniform weights, re-weights the samples, and then draws a new training
    sample (with replacement) according to those weights. After enough rounds
    exist for an entry of ``epoch_list``, the ensemble made of the first
    ``epoch`` learners is scored on the validation set.

    The validation data and the scorer are taken from module scope: this
    function reads the names ``X_val``, ``y_val``, ``predict`` and
    ``accuracy_score``.

    Parameters
    ----------
    X_train : 2-D array
        Training features; indexed as ``X_train[indices, :]``.
    y_train : array-like
        Training labels, either 1-D or a single column. Labels equal to 0 are
        treated as the negative class (-1) when computing errors and weights.
    epoch_list : iterable of int
        Ensemble sizes to evaluate.
    weak_learners, importances : list, optional
        Lists to append the fitted trees / their vote weights to. A fresh list
        is created per call when omitted.
    best_acc, best_epoch : number, optional
        Starting values for the best validation accuracy and its ensemble size.

    Returns
    -------
    dict
        Keys ``'weak_learners'``, ``'importances'``, ``'best_acc'`` and
        ``'best_epoch'``. The first ``best_epoch`` learners form the selected
        ensemble.
    """
    # Fresh containers per call: a mutable default ([]) is created once and
    # would keep accumulating learners across calls / cell re-runs.
    if weak_learners is None:
        weak_learners = []
    if importances is None:
        importances = []

    # Use the size of the data actually passed in rather than a notebook global.
    n_samples = X_train.shape[0]

    # Flatten so both (n,) and (n, 1) label arrays work; never mutate the caller's array.
    y_train = np.asarray(y_train).ravel()
    y_signed = np.where(y_train == 0, -1, y_train)

    # Keeps the error away from exactly 0 or 1 so the log ratio stays finite.
    eps = 1e-10

    for epoch in epoch_list:
        # Only the first `epoch` learners are ever used for scoring, so extend
        # the existing chain just far enough instead of training `epoch` extra
        # rounds for every entry of epoch_list.
        while len(weak_learners) < epoch:
            # Weights restart uniform each round; the sample distribution is
            # carried by the resampled training set instead.
            W = np.full(n_samples, 1.0 / n_samples)

            tree = DecisionTreeClassifier(max_depth=2)
            tree.fit(X_train, y_train)
            y_pred = np.asarray(tree.predict(X_train)).ravel()
            y_pred = np.where(y_pred == 0, -1, y_pred)

            # Weighted misclassification rate of this round's tree.
            error = float(np.sum(W[y_pred != y_signed]))
            error = min(max(error, eps), 1.0 - eps)
            importance = 0.5 * np.log((1.0 - error) / error)

            importances.append(importance)
            weak_learners.append(tree)

            # Up-weight misclassified samples, down-weight correct ones.
            W = W * np.exp(-importance * y_signed * y_pred)
            W = W / np.sum(W)

            # Draw the next round's training sample according to the new weights.
            indices = np.random.choice(n_samples, size=n_samples, p=W)
            X_train = X_train[indices, :]
            y_train = y_train[indices]
            y_signed = y_signed[indices]

        acc_valid = accuracy_score(y_val, predict(X_val, epoch, weak_learners, importances))
        if acc_valid > best_acc:
            best_acc = acc_valid
            best_epoch = epoch

    return {'weak_learners':weak_learners,'importances':importances,'best_acc':best_acc,'best_epoch':best_epoch}
    
# Train and evaluate every ensemble size listed in epoch_list.
results = ada_boosting(X_train, y_train, epoch_list=epoch_list)

# Print the best validation accuracy, then the ensemble size that reached it.
for result_key in ('best_acc', 'best_epoch'):
    print(results[result_key])



0.9167974882260597
10


In [8]:
### USE THE MODEL ON THE TEST SET
# Read the held-out table and apply the same grouping switch as for training.
df_test = pd.read_csv('test.csv')
df_test = total_day_eve_night_grouping(df_test, grouping=feature_engineering)

# Same exclusions as training, plus the 'id' column.
X_test_output = load_data_test_set(
    df_test,
    exclude=exclude_list + ['id'],
    one_hot=one_hot_on,
    normalize_=normalize_on,
)

# Predict with the first `best_epoch` learners, then map 1/0 to 'yes'/'no'.
y_pred_output = predict(
    X_test_output,
    results['best_epoch'],
    results['weak_learners'],
    results['importances'],
)
y_pred_output = np.where(y_pred_output > 0.5, 'yes', 'no').reshape(-1,)

# Ids are generated as 1..N rather than read from the file.
# NOTE(review): assumes test.csv rows are ordered with ids 1..N — confirm.
id_column = np.arange(1, y_pred_output.shape[0] + 1)

# Assemble the two-column frame and write it out as CSV.
df_output = pd.DataFrame({'id': id_column, 'churn': y_pred_output})
df_output.to_csv('output_ada_boost.csv', index=False)