In [23]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, train_test_split


In [24]:
# Feature groups; each list is routed to its own preprocessing branch.

# Numeric features.
numerical_cols = [
    'tenure',
    'MonthlyCharges',
    'TotalCharges',
]

# Categorical features that get one-hot encoded.
one_hot_cols = [
    'PaymentMethod',
    'InternetService',
    'MultipleLines',
]

# Categorical features that get ordinal (integer) encoded.
ordinal_cols = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'PhoneService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'PaperlessBilling',
    'Contract',
]

In [25]:
# Read the raw churn table from a path relative to the working directory.
df = pd.read_csv(filepath_or_buffer='csvs/churn.csv')

In [26]:
# Fold the "No internet service" level into plain "No" for each internet
# add-on column.
for col in (
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
):
    df[col] = df[col].replace({'No internet service': 'No'})

# Parse TotalCharges as numbers: entries that cannot be parsed become NaN
# (errors='coerce') and are then filled with 0.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)

In [27]:
# Feature matrix: numeric, one-hot and ordinal columns, in that order.
X = df[[*numerical_cols, *one_hot_cols, *ordinal_cols]]

# Target column.
y = df['Churn']

In [28]:
# Hold out a validation split. Stratifying on y keeps the class balance the
# same in both parts; the fixed seed makes the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    random_state=1,
    stratify=y,
)

In [29]:
# Imputer for the numeric columns; 'mean' is SimpleImputer's default
# strategy, spelled out here for readability.
imputer = SimpleImputer(strategy='mean')

In [30]:
# Branch for the one-hot encoded columns: fill missing entries with the most
# frequent category, then one-hot encode.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# an all-zero row at transform time instead of raising an error, so the
# fitted pipeline stays usable on validation or new data.
hot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('hot_encoder', OneHotEncoder(handle_unknown='ignore')),
])

In [31]:
# Branch for the ordinal encoded columns: fill missing entries with the most
# frequent category, then map each category to an integer code.
# A category that was not seen during fit is encoded as -1 at transform time
# instead of raising an error, so the fitted pipeline stays usable on
# validation or new data.
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value',
                                       unknown_value=-1)),
])

In [32]:
# Route each column group to its own preprocessing branch; the outputs are
# concatenated in the order listed here.
preprocessor = ColumnTransformer(
    transformers=[
        ('imputer', imputer, numerical_cols),
        ('hot_encoder', hot_transformer, one_hot_cols),
        ('ordinal_encoder', ordinal_transformer, ordinal_cols),
    ]
)

In [34]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformer to the validation split.
# NOTE(review): both names are rebound to the transformed output, so running
# this cell a second time would pass already-transformed data back into a
# preprocessor that selects columns by name -- presumably that fails; re-run
# the train/validation split cell first.
X_train=preprocessor.fit_transform(X_train)
X_valid=preprocessor.transform(X_valid)

In [None]:
# Gradient-boosted classifier: a large tree budget combined with early
# stopping and a fixed seed.
# NOTE(review): with early_stopping_rounds set here, fitting presumably needs
# an eval_set -- confirm against the installed xgboost version.
model = XGBClassifier(
    n_estimators=2000,
    learning_rate=0.05,
    max_depth=5,
    early_stopping_rounds=50,
    random_state=1,
    n_jobs=-1,
)

In [None]:
# Bare attribute reference: this only displays the bound method, it does not
# call it. No fit call appears in the visible cells before this point.
model.predict_proba

In [35]:
# Display the (rows, columns) dimensions of the loaded frame.
df.shape

(7043, 21)