<a href="https://colab.research.google.com/github/innocentmatutu/Machine-learning/blob/main/customer_churn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
#from sklearn.linear_model import LogisticRegression
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score


df=pd.read_csv('/content/archive (5).zip')

features=['tenure','MonthlyCharges','TotalCharges','SeniorCitizen','Partner','Dependents','PhoneService','MultipleLines','InternetService','OnlineSecurity','DeviceProtection','TechSupport','StreamingTV','StreamingMovies','Contract','PaperlessBilling','PaymentMethod']
X=df[features]
y=df['Churn']
df.drop(['Churn'],axis=1,inplace=True)

#Encode the target variable
le = LabelEncoder()
y = le.fit_transform(y)

#Break of validation set from training set
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=1)

#Select categorical columns with relatively low cardinality
categorical_cols = [cname for cname in X_train.columns if
                    X_train[cname].nunique()<10 and
                    X_train[cname].dtype=='object']

#select numerical columns
numerical_cols = [cname for cname in X_train.columns if
                  X_train[cname].dtype in ['int64','float64']]

#keep selected columns
my_cols = categorical_cols + numerical_cols
X_train = X_train[my_cols].copy()
X_valid = X_valid[my_cols].copy()

#preprocessing of numerical data
numerical_transformer = SimpleImputer(strategy='constant')

#preprocessing of categorical data
categorical_transformer = Pipeline(steps=[
          ('imputer',SimpleImputer(strategy='most_frequent')),
          ('onehot',OneHotEncoder(handle_unknown='ignore'))
])

#Bundle preprocessing for numerical data and categorical data
preprocesser = ColumnTransformer(
        transformers = [
            ('num',numerical_transformer,numerical_cols),
            ('cat',categorical_transformer,categorical_cols)
        ]
)

#modle selection
#model = LogisticRegression(max_iter=1000, solver='lbfgs',random_state=1)
model = xgb.XGBClassifier(random_state=1)

#Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[
    ('preprocessor',preprocesser),
    ('model',model)

])

#model fit
my_pipeline.fit(X_train,y_train)

#model predictions
preds= my_pipeline.predict(X_valid)

#model evaluation
print(f'Accuracy Score: {accuracy_score(y_valid,preds)}')
print(f'Recall Score: {recall_score(y_valid,preds,average="weighted")}')
print(f'Precision Score: {precision_score(y_valid, preds, average="weighted")}')
print(f'F1_score: {f1_score(y_valid, preds, average="weighted")}')


#print(df.head())

Accuracy Score: 0.7963094393186657
Recall Score: 0.7963094393186657
Precision Score: 0.7961131933810797
F1_score: 0.7962108716881399


In [23]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score


df=pd.read_csv('/content/archive (5).zip')

features=['tenure','MonthlyCharges','TotalCharges','SeniorCitizen','Partner','Dependents','PhoneService','MultipleLines','InternetService','OnlineSecurity','DeviceProtection','TechSupport','StreamingTV','StreamingMovies','Contract','PaperlessBilling','PaymentMethod']
X=df[features]
y=df['Churn']
df.drop(['Churn'], axis=1 ,inplace=True)

#Encode the target column
le = LabelEncoder()
y = le.fit_transform(y)

#Split training and testing data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

#Select categorical columns with relatively low cardinality
categorical_cols = [cname for cname in X_train.columns if
                    X_train[cname].nunique()<10 and
                    X_train[cname].dtype=='object']

#Select numerical columns
numerical_cols = [cname for cname in X_train.columns if
                  X_train[cname].dtype in ['float64','int64']]

#Put together the selected columns
my_cols = categorical_cols + numerical_cols
X_train = X_train[my_cols].copy()
X_valid = X_valid[my_cols].copy()

#Preprocess numerical columns
numerical_transformer = SimpleImputer(strategy='constant')

#Preprocess categorical columns
categorical_transformer = Pipeline(steps=[
    ('impute',SimpleImputer(strategy='most_frequent')),
    ('ohe',OneHotEncoder(handle_unknown='ignore'))
])

#Bundle preprocessing of numerical and categorical columns into a pipeline
preprocesser = ColumnTransformer(
    transformers=[
        ('num',numerical_transformer,numerical_cols),
        ('cat',categorical_transformer,categorical_cols)
    ]
)

#Model selection
model = KNeighborsClassifier()

#Bundle preprocessing and model code into pipeline
my_pipeline = Pipeline(steps=[
    ('preprocesser',preprocesser),
    ('model',model)
])

#Model fit
my_pipeline.fit(X_train,y_train)

#Model predictions
preds = my_pipeline.predict(X_valid)

#Model evalutions
print(f'Accuracy score: {accuracy_score(y_valid,preds)}')
print(f'Precision score: {precision_score(y_valid,preds,average="weighted")}')
print(f'F1 score: {f1_score(y_valid,preds,average="weighted")}')
print(f'Recall score: {recall_score(y_valid,preds,average="weighted")}')

Accuracy score: 0.7650816181689141
Precision score: 0.7590034934041696
F1 score: 0.7616399995734032
Recall score: 0.7650816181689141
