In [None]:
import sklearn as sk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch as pt

from pyspark.sql.functions import when, col, lit
from pyspark.sql.types import IntegerType
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when, isnan
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score,classification_report
from sklearn import linear_model
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

# Obtain the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("OHE Example")
    .getOrCreate()
)


In [None]:
# Read the bronze-layer Delta table and print its schema for inspection.
df = (
    spark.read
    .format("delta")
    .load("/Volumes/workspace/bronze/bronzevolume/data/")
)
df.printSchema()

In [None]:


# Categorical columns that are candidates for one-hot encoding.
catog = [
    "multiplelines",
    "internetservice",
    "onlinesecurity",
    "onlinebackup",
    "deviceprotection",
    "techsupport",
    "streamingtv",
    "streamingmovies",
    "contract",
    "paymentmethod",
]

# Keep only the candidates that are actually present in df (exact,
# case-sensitive name match), in the order listed above.
existing_catog = list(filter(lambda name: name in df.columns, catog))

# One-hot encode every column in existing_catog.
#
# The 0/1 indicator expressions are collected first and attached to df with a
# single select(). Chaining one withColumn() per indicator inside nested loops
# adds a projection to the query plan on every call, which the PySpark docs
# warn can cause performance problems or a StackOverflowException.
#
# A dict keyed by output name is used so that two values which clean to the
# same name resolve the same way repeated withColumn() calls did: the first
# occurrence fixes the position, the last expression wins.
# Assumes no generated "<column>_<value>" name collides with a column already
# in df (withColumn would have replaced it; select would duplicate it).
dummy_exprs = {}
for cat_col in existing_catog:
    print(f"Processing column: {cat_col}")

    # Distinct non-null values of this column, collected to the driver
    unique_vals = [row[0] for row in df.select(cat_col).distinct().collect() if row[0] is not None]
    print(f"Unique values in {cat_col}: {unique_vals}")

    for val in unique_vals:
        # Make the value usable inside a column name
        clean_val = str(val).replace(" ", "_").replace("-", "_").replace("(", "").replace(")", "")
        dummy_col = f"{cat_col}_{clean_val}"

        # 1 where the row holds this value, 0 otherwise (nulls compare as not-equal -> 0)
        dummy_exprs[dummy_col] = when(col(cat_col) == val, 1).otherwise(0).cast(IntegerType())

# Original columns followed by all indicator columns, in creation order
df_encoded = df.select(
    "*",
    *[expr.alias(name) for name, expr in dummy_exprs.items()],
)

In [None]:
# Inspect the result of the one-hot encoding: first the column names as plain
# text, then the DataFrame itself. display() is not imported in this file; it
# is presumably provided by the notebook environment.
print(df_encoded.columns)
display(df_encoded)

In [None]:
# Source column names and their replacements, paired by position.
cols = [
    "gender",
    "seniorcitizen",
    "partner",
    "dependents",
    "phoneservice",
    "paperlessbilling",
]

renames = [
    "is_female",
    "is_senior",
    "has_partner",
    "has_dependent",
    "is_phoneservice",
    "is_paperlessbilling",
]

# Rename one column at a time; withColumnRenamed is a no-op for a name that
# does not exist in the DataFrame.
for old_name, new_name in zip(cols, renames):
    df_encoded = df_encoded.withColumnRenamed(old_name, new_name)



In [None]:
# Columns holding "Yes"/"No" text that is recoded to the strings "1"/"0".
# Any other value (including null) becomes null.
lis = [
    "is_paperlessbilling",
    "has_partner",
    "has_dependent",
    "is_phoneservice",
    "churn",
]

for flag_name in lis:
    source = col(flag_name)
    recoded = (
        when(source == "Yes", "1")
        .when(source == "No", "0")
        .otherwise(None)
    )
    df_encoded = df_encoded.withColumn(flag_name, recoded)

# Same recoding for the gender column: "Female" -> "1", "Male" -> "0",
# anything else -> null.
gender = col("is_female")
df_encoded = df_encoded.withColumn(
    "is_female",
    when(gender == "Female", "1").when(gender == "Male", "0").otherwise(None),
)


In [None]:
# List the current column names; the hard-coded selection lists in the next
# cell have to match these names.
display(df_encoded.columns)

In [None]:
# Columns that must end up as integers: the binary flags, tenure, the target
# and every one-hot indicator. The indicator names are derived from the data
# values seen by the encoding cell, so this list has to be kept in sync with it.
subset2 = [
    'is_female', 'is_senior', 'has_partner', 'has_dependent', 'tenure',
    'is_phoneservice', 'is_paperlessbilling', 'churn',
    'multiplelines_No_phone_service', 'multiplelines_Yes', 'multiplelines_No',
    'internetservice_DSL', 'internetservice_No', 'internetservice_Fiber_optic',
    'onlinesecurity_No_internet_service', 'onlinesecurity_Yes', 'onlinesecurity_No',
    'onlinebackup_No_internet_service', 'onlinebackup_Yes', 'onlinebackup_No',
    'deviceprotection_No_internet_service', 'deviceprotection_Yes', 'deviceprotection_No',
    'techsupport_No_internet_service', 'techsupport_Yes', 'techsupport_No',
    'streamingtv_No_internet_service', 'streamingtv_Yes', 'streamingtv_No',
    'streamingmovies_No_internet_service', 'streamingmovies_Yes', 'streamingmovies_No',
    'contract_One_year', 'contract_Month_to_month', 'contract_Two_year',
    'paymentmethod_Electronic_check', 'paymentmethod_Mailed_check',
    'paymentmethod_Credit_card_automatic', 'paymentmethod_Bank_transfer_automatic',
]

# Full projection: the integer columns followed by the two charge columns.
# Built from subset2 so the two lists cannot drift apart.
subset = subset2 + ["monthlycharges", "totalcharges"]

# Project and cast in a single select() rather than one withColumn() per
# column; the resulting columns, order and types are the same, but the query
# plan stays flat.
df_encoded = df_encoded.select(
    *[col(c).cast("int").alias(c) for c in subset2],
    "monthlycharges",
    "totalcharges",
)

display(df_encoded)

In [None]:
# Normalise every column name to lower case (values are untouched).
lowercased = [col(name).alias(name.lower()) for name in df_encoded.columns]
df_encoded = df_encoded.select(lowercased)
display(df_encoded)

In [None]:
# Charge columns that are converted to double.
lst2 = ["monthlycharges", "totalcharges"]

for charge_name in lst2:
    raw = col(charge_name)
    # A value that is exactly one space is turned into null before the cast;
    # everything else is cast as-is.
    blank_as_null = when(raw == ' ', None).otherwise(raw)
    df_encoded = df_encoded.withColumn(charge_name, blank_as_null.cast("double"))

In [None]:
# Drop identifier / ingestion-metadata columns. DataFrame.drop ignores names
# that are not present, so this is safe even though the earlier
# select(subset) projection does not include either of these columns.
lst = ["customerid","_rescued_data"]
df_encoded = df_encoded.drop(*lst)

In [None]:
# Convert Spark DataFrame -> Pandas
#
# Rows whose target ("churn") is null are removed first: a blanket fillna(0)
# would otherwise relabel every unknown outcome as 0 (no churn) and feed
# invented labels to the model. Remaining nulls in the feature columns are
# then replaced with 0.
df_encoded = df_encoded.dropna(subset=["churn"]).fillna(0)

# toPandas() collects the whole DataFrame onto the driver.
pdf = df_encoded.toPandas()

In [None]:
# Features are every column except the target; the target is "churn".
X = pdf.drop(columns="churn")
y = pdf["churn"]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
# NOTE(review): the split is not stratified on y and the features are not
# scaled before fitting - confirm that this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic regression; the iteration cap is raised above the default
# to give the solver room to converge.
regr = LogisticRegression(max_iter=2500).fit(X_train, y_train)

# Predicted class labels for the held-out rows
y_pred = regr.predict(X_test)


In [None]:
# Signed logistic-regression coefficients, one per feature column of X.
# NOTE(review): no feature scaling is visible before the fit, so coefficient
# magnitudes of columns on different scales (e.g. tenure or the charge
# columns vs. 0/1 indicators) are not directly comparable.
feat_importances = pd.Series(regr.coef_[0],index=X.columns)

plt.figure(figsize=(10, 12))  # (width, height) in inches

# nlargest(40) keeps the 40 highest signed coefficients; with more than 40
# features the most negative ones would be left out of the chart.
ax = feat_importances.nlargest(40).plot(kind='barh')

# Label the figure so it can be read on its own
ax.set_title("Logistic regression coefficients (target: churn)")
ax.set_xlabel("Coefficient (log-odds per unit of the feature)")
ax.set_ylabel("Feature")
plt.show()

In [None]:
# Confusion-matrix counts for the held-out rows.
# labels=[0, 1] pins the matrix to 2x2 with 0 as the negative class. Without
# it, scikit-learn sizes the matrix from the labels that actually occur, so a
# split or prediction set containing a single class would yield a 1x1 matrix
# and the four-way unpacking below would raise.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
print(f"TN: {tn}, FP: {fp} \nFN: {fn}, TP: {tp}")

In [None]:
# Share of held-out rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

# F1 for the positive class (scikit-learn's default binary averaging)
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print(f"F1 Score: {f1}")

# Per-class precision / recall / F1 table
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)