In [83]:
import pandas as pd
import joblib
import logging
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from google.cloud import storage

In [84]:
# Configure logging
logging.basicConfig(level=logging.INFO)

# 1. Load Data
# pandas reads gs:// URIs directly (via 'gcsfs', which the original note says is
# listed in requirements.txt), so neither !gsutil nor an explicit
# google.cloud.storage client is needed for the download.
bucket_name = "telco-customer-dataset"
file_name = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
# Build the object URI once so the log line and the read cannot disagree.
data_uri = f"gs://{bucket_name}/{file_name}"

logging.info(f"Downloading data from {data_uri}")
df = pd.read_csv(data_uri)

In [85]:
# Preview the loaded frame. Only the last expression of a cell is rendered,
# so the frame itself is the sole statement here.
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [86]:
# 2. Preprocessing
# Remove the customerID column before any encoding or modelling takes place.
df = df.drop(columns=["customerID"])

In [87]:
# Parse TotalCharges as numbers; values that cannot be parsed become NaN.
total_charges_numeric = pd.to_numeric(df["TotalCharges"], errors="coerce")
# Swap in the numeric column, then discard every row that has any missing value.
df = df.assign(TotalCharges=total_charges_numeric).dropna()

In [88]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


# Encode categorical columns
# Every object-dtype column is replaced by integer codes. Each fitted encoder is
# kept, keyed by column name, so the category <-> integer mapping can be
# reapplied to new data or inverted later instead of being thrown away.
label_encoders = {}
for col in df.select_dtypes(include="object").columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Split
# Features are every column except the target; "Churn" is the label.
X = df.drop("Churn", axis=1)
y = df["Churn"]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [89]:
# 3. Train Model with Scaling (to fix ConvergenceWarning)
logging.info("Starting model training...")

# Standardize the features first, then fit a logistic regression on the result.
scaler_step = ('scaler', StandardScaler())
classifier_step = ('model', LogisticRegression(max_iter=1000))
pipeline = Pipeline(steps=[scaler_step, classifier_step])

# Left as the final expression so the fitted pipeline is rendered as the cell output.
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('scaler', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [90]:
# 4. Evaluate
# Score the fitted pipeline on the held-out split.
preds = pipeline.predict(X_test)
acc = accuracy_score(y_test, preds)
logging.info(f"Accuracy: {acc}")
# Also print the score so it appears in the cell output regardless of log configuration.
print("Accuracy: ", acc)

Acuuracay:  0.7853589196872779


In [91]:
# 5. Save and Export
# Serialize the fitted pipeline (scaler + classifier) to a file in the working directory.
model_filename = "model.joblib"
joblib.dump(value=pipeline, filename=model_filename)
logging.info(f"Model saved locally as {model_filename}")

In [92]:
import logging
from google.cloud import storage
# Upload the locally saved model to the bucket that holds the dataset.
# The destination URI and the local path are derived from `bucket_name` and
# `model_filename` (set in the load and save cells) so the upload cannot drift
# from the file that was actually written to disk.
storage_path = f"gs://{bucket_name}/{model_filename}"
blob = storage.blob.Blob.from_string(storage_path, client=storage.Client())
blob.upload_from_filename(model_filename)
logging.info("model exported to : {}".format(storage_path))