**1. Setup Awal**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix, classification_report, precision_score

# Reaching this line means every import above succeeded.
# The leading character was mojibake (UTF-8 bytes decoded with the wrong codec);
# it is restored to the intended check-mark emoji.
print("✅ Setup berhasil! Library siap digunakan.")

**2. Load & Eksplorasi Data**

In [None]:
# Load the Telco customer-churn CSV directly from the IBM GitHub repository.
url = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"
df = pd.read_csv(url)

# Print the (rows, columns) shape, then end the cell on df.head() so the
# notebook renders the first rows as a table.
# The emoji in both labels were mojibake and are restored here.
print("🔍 Shape Dataset:", df.shape)
print("\n📋 Sample Data:")
df.head()

In [None]:
# Explore the loaded frame: column dtypes / non-null counts, missing values per
# column, and summary statistics for the three billing-related columns.
print("ℹ️ Info Dataset:")
df.info()

# A notebook only renders the LAST expression of a cell, so the missing-value
# counts must be printed explicitly; as a bare expression they were discarded.
print("\n🧹 Missing Values:")
print(df.isnull().sum())

# describe() summarises only numeric columns when a frame has mixed dtypes, so
# a text-typed TotalCharges would silently vanish from the table. TotalCharges
# may still be stored as text at this point (blank strings in the raw file), so
# coerce it on a temporary copy: blanks become NaN and are excluded from the
# statistics. df itself is not modified here.
print("\n📊 Statistik Deskriptif:")
df[['tenure', 'MonthlyCharges', 'TotalCharges']].assign(
    TotalCharges=lambda billing: pd.to_numeric(billing['TotalCharges'], errors='coerce')
).describe()

**3. Data Cleaning & Preprocessing**

In [None]:
# Clean the loaded frame: make TotalCharges numeric, drop incomplete rows and
# encode the Churn target as 0/1. The cell is written to be safe to re-run.
print("Sebelum cleaning:", df.shape)

# TotalCharges contains blank strings; coercion turns anything non-numeric
# into NaN so those rows are removed by dropna() below.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Drop every row with a missing value. The explicit copy makes df an
# independent frame, so the column assignment below never writes into a view
# of the original data.
df = df.dropna().copy()
print("Setelah cleaning:", df.shape)

# Encode the target variable. The identity entries (1 -> 1, 0 -> 0) keep the
# cell idempotent: without them, re-running the cell would map the already
# encoded values to NaN and wipe out the target column.
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0, 1: 1, 0: 0})

# Fail loudly on any label outside Yes/No instead of passing NaN to the models.
assert df['Churn'].notna().all(), "Kolom Churn berisi nilai selain 'Yes'/'No'"
df['Churn'] = df['Churn'].astype(int)

print("✅ Data cleaning selesai!")

**4. Visualisasi Data Eksploratif**

In [None]:
# Class balance of the target: one bar per Churn value.
fig_churn, ax_churn = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='Churn', ax=ax_churn)
ax_churn.set_title('Distribusi Churn Pelanggan')
plt.show()

# tenure against MonthlyCharges, with points coloured by the Churn value;
# alpha < 1 keeps overlapping points distinguishable.
fig_scatter, ax_scatter = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='tenure',
    y='MonthlyCharges',
    hue='Churn',
    alpha=0.6,
    ax=ax_scatter,
)
ax_scatter.set_title('Hubungan Tenure vs Monthly Charges')
plt.show()

**5. MODEL 1 - Regresi Linear**

In [None]:
# MODEL 1: linear regression that predicts tenure from MonthlyCharges alone.
print("--- MODEL 1: REGRESI LINEAR ---")

# Single-feature design matrix (kept 2-D with double brackets) and the target.
X_lin = df[['MonthlyCharges']]
y_lin = df['tenure']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train_lin, X_test_lin, y_train_lin, y_test_lin = train_test_split(
    X_lin, y_lin, test_size=0.2, random_state=42
)

# Fit on the training rows only.
lin_reg = LinearRegression()
lin_reg.fit(X_train_lin, y_train_lin)

# Predict tenure for the held-out rows.
y_pred_lin = lin_reg.predict(X_test_lin)

# Score the held-out predictions: mean squared error and R².
mse = mean_squared_error(y_test_lin, y_pred_lin)
r2 = r2_score(y_test_lin, y_pred_lin)

print(f"Koefisien: {lin_reg.coef_[0]:.2f}")
print(f"Intercept: {lin_reg.intercept_:.2f}")
print(f"MSE: {mse:.2f}")
# The superscript two was mojibake in this label; restored to "R²".
print(f"R²: {r2:.2f}")

**6. Visualisasi Regresi Linear**

In [None]:
# Two side-by-side diagnostics for the linear model on the held-out rows.
fig_lin, (ax_fit, ax_resid) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: actual test points with the model's predictions drawn over them.
ax_fit.scatter(X_test_lin, y_test_lin, alpha=0.7, label='Data Aktual')
ax_fit.plot(X_test_lin, y_pred_lin, color='red', linewidth=2, label='Garis Regresi')
ax_fit.set(
    xlabel='Monthly Charges',
    ylabel='Tenure (bulan)',
    title='Regresi Linear: Monthly Charges vs Tenure',
)
ax_fit.legend()
ax_fit.grid(True)

# Right panel: residuals (actual minus predicted) against the predictions,
# with a dashed zero line as the reference for a perfect fit.
residuals = y_test_lin - y_pred_lin
ax_resid.scatter(y_pred_lin, residuals, alpha=0.7)
ax_resid.axhline(y=0, color='red', linestyle='--')
ax_resid.set(
    xlabel='Predicted Values',
    ylabel='Residuals',
    title='Plot Residual',
)
ax_resid.grid(True)

fig_lin.tight_layout()
plt.show()

**7. MODEL 2 - Logistic Regression**

In [None]:
# MODEL 2: logistic regression that classifies Churn from two numeric features.
print("\n--- MODEL 2: LOGISTIC REGRESSION ---")

# Predictors (tenure, MonthlyCharges) and the Churn target.
X_log = df[['tenure', 'MonthlyCharges']]
y_log = df['Churn']

# 80/20 split; stratifying on the target keeps the class proportions the same
# in the training and test parts.
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(
    X_log,
    y_log,
    test_size=0.2,
    random_state=42,
    stratify=y_log,
)

# fit() returns the estimator itself, so construction and training are chained.
log_reg = LogisticRegression(random_state=42).fit(X_train_log, y_train_log)

# Hard class predictions, plus the probability of the second class
# (column index 1 of predict_proba).
y_pred_log = log_reg.predict(X_test_log)
y_pred_proba = log_reg.predict_proba(X_test_log)[:, 1]

# Held-out accuracy and confusion matrix.
accuracy = accuracy_score(y_test_log, y_pred_log)
cm = confusion_matrix(y_test_log, y_pred_log)

print(f"Akurasi: {accuracy:.3f}")
print(f"Confusion Matrix:\n{cm}")

**8. Evaluasi Logistic Regression**

In [None]:
# Detailed evaluation of the logistic model on the held-out rows:
# per-class precision / recall / F1 as a text report.
print("Classification Report:")
print(classification_report(y_test_log, y_pred_log))

# Heatmap of the confusion matrix computed in the training cell
# (rows = actual class, columns = predicted class).
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['No Churn', 'Churn'],
            yticklabels=['No Churn', 'Churn'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Logistic Regression')
plt.show()

# Print one coefficient per feature. The names are taken from X_log.columns,
# the frame the model was fitted on, instead of a hand-maintained list, so the
# labels cannot drift out of sync with the coefficient order if the feature
# set changes. coef_[0] is the single coefficient row of the binary model.
print("\nKoefisien Model:")
for feature, coef in zip(X_log.columns, log_reg.coef_[0]):
    print(f"{feature}: {coef:.3f}")

**9. MODEL 3 - Decision Tree**

In [None]:
# MODEL 3: decision tree trained on numeric plus one-hot encoded features.
print("\n--- MODEL 3: DECISION TREE ---")

# One-hot encode the three categorical columns; drop_first removes the first
# level of each so only the remaining levels become indicator columns.
categorical_columns = ['InternetService', 'Contract', 'PaymentMethod']
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Feature list = numeric columns followed by the indicator columns, in the
# order the model (and the later plots) will see them.
numeric_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']
indicator_columns = [
    'InternetService_Fiber optic',
    'InternetService_No',
    'Contract_One year',
    'Contract_Two year',
    'PaymentMethod_Credit card (automatic)',
    'PaymentMethod_Electronic check',
    'PaymentMethod_Mailed check',
]
feature_columns = numeric_columns + indicator_columns

X_tree = df_encoded[feature_columns]
y_tree = df_encoded['Churn']

# Stratified 80/20 split with a fixed seed.
X_train_tree, X_test_tree, y_train_tree, y_test_tree = train_test_split(
    X_tree,
    y_tree,
    test_size=0.2,
    random_state=42,
    stratify=y_tree,
)

# max_depth=3 caps the tree depth to limit overfitting.
tree_clf = DecisionTreeClassifier(max_depth=3, random_state=42)
tree_clf.fit(X_train_tree, y_train_tree)

# Predict on the held-out rows and report accuracy.
y_pred_tree = tree_clf.predict(X_test_tree)
accuracy_tree = accuracy_score(y_test_tree, y_pred_tree)
print(f"Akurasi Decision Tree: {accuracy_tree:.3f}")

**10. Visualisasi Decision Tree**

In [None]:
# Draw the fitted tree on a wide canvas so the node labels stay readable.
fig_tree, ax_tree = plt.subplots(figsize=(20, 10))
plot_tree(
    tree_clf,
    feature_names=feature_columns,
    class_names=['No Churn', 'Churn'],
    filled=True,
    rounded=True,
    fontsize=10,
    ax=ax_tree,
)
ax_tree.set_title('Decision Tree untuk Prediksi Churn Pelanggan', fontsize=16)
plt.show()

# Pair each feature name with its importance from the fitted tree, then order
# the table from most to least important.
importance_table = pd.DataFrame({
    'feature': feature_columns,
    'importance': tree_clf.feature_importances_
})
feature_importance = importance_table.sort_values('importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)

**11. Perbandingan Hasil Model**

In [None]:
# Summary of the three models, using the metrics computed in the cells above.
print("=== PERBANDINGAN HASIL MODEL ===\n")

print("1. REGRESI LINEAR:")
# The superscript two was mojibake in this label; restored to "R²".
print(f"   R² Score: {r2:.3f}")
print(f"   MSE: {mse:.2f}")

print("\n2. LOGISTIC REGRESSION:")
print(f"   Accuracy: {accuracy:.3f}")
# Precision for the positive (churn = 1) class on the held-out rows.
print(f"   Precision (Churn): {precision_score(y_test_log, y_pred_log, pos_label=1):.3f}")

print("\n3. DECISION TREE:")
print(f"   Accuracy: {accuracy_tree:.3f}")
# feature_importance is sorted descending, so row 0 is the top feature.
print(f"   Fitur Paling Penting: {feature_importance.iloc[0]['feature']}")

# Bar chart comparing the accuracy of the two classifiers. The regression
# model is left out because accuracy is not defined for it.
models = ['Logistic Regression', 'Decision Tree']
accuracies = [accuracy, accuracy_tree]

plt.figure(figsize=(8, 5))
sns.barplot(x=models, y=accuracies)
plt.ylim(0, 1)  # accuracy is a fraction, so fix the axis to the full 0-1 range
plt.title('Perbandingan Akurasi Model Klasifikasi')
plt.ylabel('Accuracy')
# Write each accuracy value just above its bar.
for i, v in enumerate(accuracies):
    plt.text(i, v + 0.01, f'{v:.3f}', ha='center')
plt.show()

# *Interpretasi Hasil & Insight Bisnis*
## Key Findings:

1. Regresi Linear:
   * Hubungan negatif antara MonthlyCharges dan Tenure
   * Pelanggan dengan biaya tinggi cenderung lebih pendek masa langganannya
2. Logistic Regression:
   * Tenure memiliki pengaruh negatif terhadap churn
   * MonthlyCharges memiliki pengaruh positif terhadap churn
3. Decision Tree:
   * Tenure adalah fitur paling penting
   * Contract type sangat mempengaruhi keputusan churn

# *Rekomendasi Bisnis*
## Strategic Recommendations:
1. Program Loyalty untuk pelanggan dengan tenure rendah
2. Review Pricing Strategy untuk layanan dengan monthly charges tinggi
3. Promosikan Long-term Contract untuk mengurangi churn
4. Early Warning System berdasarkan aturan decision tree
5. Personalized Retention Campaign untuk segmen berisiko tinggi

