In [1]:
import re

import lightgbm as lgb
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.metrics import (average_precision_score, classification_report,
                             confusion_matrix, fbeta_score)
from sklearn.model_selection import train_test_split

In [2]:
def find_threshold(y_pred_final, target_rate=0.01, tolerance=0.00001, max_iter=1000):    
    left = float(y_pred_final.min())
    right = float(y_pred_final.max())
    
    iteration = 0
    best_threshold = None
    best_diff = float('inf')
    
    while iteration < max_iter:
        mid = (left + right) / 2
        current_rate = (y_pred_final > mid).sum() / len(y_pred_final)
        diff = abs(current_rate - target_rate)
        
        if diff < best_diff:
            best_threshold = mid
            best_diff = diff
        
        if diff <= tolerance:
            return mid
        
        # Binary search
        if current_rate > target_rate:
            left = mid
        else:
            right = mid
            
        iteration += 1
        
        if abs(right - left) < tolerance:
            return best_threshold
    
    return best_threshold

In [3]:
df = pd.read_csv("../Data/capstone_final.csv")

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4422491 entries, 0 to 4422490
Data columns (total 59 columns):
 #   Column                            Dtype  
---  ------                            -----  
 0   Unnamed: 0                        int64  
 1   id                                object 
 2   age                               int64  
 3   tenure                            float64
 4   service_type                      object 
 5   avg_call_duration                 float64
 6   data_usage                        float64
 7   roaming_usage                     float64
 8   monthly_charge                    float64
 9   overdue_payments                  int64  
 10  auto_payment                      int64  
 11  avg_top_up_count                  int64  
 12  call_drops                        float64
 13  customer_support_calls            int64  
 14  satisfaction_score                float64
 15  churn                             int64  
 16  İzleGo                            in

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,age,tenure,service_type,avg_call_duration,data_usage,roaming_usage,monthly_charge,overdue_payments,...,107-188,189-300,301-500,501-inf,18-24,25-34,35-44,45-54,55-64,65+
0,0,bf7ff6b6-5f13-48e6-8622-f0e3d68df951,32,162.0,Broadband,0.0,162.0,0.0,281.63,3,...,True,False,False,False,False,True,False,False,False,False
1,4,9c68ea18-5381-464c-9641-a28b9e965494,51,159.0,Postpaid,60.627371,74.92,58.32,788.59,1,...,True,False,False,False,False,False,False,True,False,False
2,6,e8c012b8-b4fa-4235-8984-7ca28b3c193c,38,207.0,Postpaid,91.22,100.061724,46.43,930.6,1,...,False,True,False,False,False,False,True,False,False,False
3,8,6c05cdf2-1b0f-425d-bc09-5c0c66a437ca,50,109.0,Postpaid,44.48,162.37,45.74,460.61,4,...,True,False,False,False,False,False,False,True,False,False
4,9,906cd5c6-6584-48f9-9d24-b6a97e1f5686,52,219.0,Broadband,0.0,192.13,0.0,246.4,1,...,False,True,False,False,False,False,False,True,False,False


In [6]:
def clean_feature_names(df):
    """
    Clean DataFrame column names to be compatible with LightGBM JSON format.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        Input DataFrame with potentially problematic column names
    
    Returns:
    --------
    pandas.DataFrame
        DataFrame with cleaned column names and a mapping of old to new names
    """
    # Create a copy of the DataFrame
    df_clean = df.copy()
    
    # Dictionary to store original and cleaned column names
    name_mapping = {}
    
    for col in df.columns:
        # Replace special characters with underscore
        cleaned_name = re.sub(r'[^\w\s]', '_', str(col))
        # Replace spaces with underscore
        cleaned_name = cleaned_name.replace(' ', '_')
        # Ensure the name starts with a letter or underscore
        if not cleaned_name[0].isalpha() and cleaned_name[0] != '_':
            cleaned_name = 'f_' + cleaned_name
        # Remove multiple consecutive underscores
        cleaned_name = re.sub(r'_+', '_', cleaned_name)
        # Remove trailing underscores
        cleaned_name = cleaned_name.rstrip('_')
        
        name_mapping[col] = cleaned_name
        
    # Rename the columns
    df_clean.columns = [name_mapping[col] for col in df.columns]
    
    return df_clean, name_mapping

In [7]:
df, name_mapping = clean_feature_names(df)

In [8]:
X = df.drop(axis=1, columns=["id", "churn", "service_type", "age_group", "tenure_group"])
y = df["churn"]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [10]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [11]:
def objective(trial):
    params = {
        "objective": "binary",
        "metric": "binary_error",
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }
    
    # Modeli Eğitme
    dtrain = lgb.Dataset(X_resampled, label=y_resampled)
    gbm = lgb.train(params, dtrain, valid_sets=[dtrain], num_boost_round=100)
    
    # Test Seti Üzerinde Tahmin Yapma
    y_pred = gbm.predict(X_test)
    y_pred = pd.Series(y_pred > find_threshold(y_pred)).astype(int)
    return fbeta_score(y_test, y_pred,beta=2)

In [12]:
#study = optuna.create_study(direction="maximize")
#study.optimize(objective, n_trials=50,show_progress_bar=True)

In [13]:
#print("En İyi Parametreler:", study.best_params)
# lgb 
best_params = {'lambda_l1': 9.220621228154233, 'lambda_l2': 3.406326025123661e-05, 'num_leaves': 123, 'feature_fraction': 0.7741812226877789, 'bagging_fraction': 0.5694104742723753, 'bagging_freq': 6, 'min_child_samples': 20}

In [14]:
# Optimum Parametrelerle Model Eğitme
#best_params = study.best_params
best_params["objective"] = "binary"
best_params["metric"] = "binary_error"

final_model = lgb.train(best_params, lgb.Dataset(X_train, label=y_train), num_boost_round=100)

# Test Verisinde Son Modelin Başarısı
y_pred_final = final_model.predict(X_test)

[LightGBM] [Info] Number of positive: 31298, number of negative: 2931770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.119710 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5526
[LightGBM] [Info] Number of data points in the train set: 2963068, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.010563 -> initscore=-4.539807
[LightGBM] [Info] Start training from score -4.539807


In [None]:
average_precision_score(y_test,y_pred_final)

0.050541196091667906

In [16]:
y_pred_bool = pd.Series(y_pred_final > find_threshold(y_pred_final)).astype(int)

In [17]:
confusion_matrix(y_test,y_pred_bool)

array([[1430319,   13321],
       [  14505,    1278]], dtype=int64)

In [18]:
print(classification_report(y_test, y_pred_bool))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99   1443640
           1       0.09      0.08      0.08     15783

    accuracy                           0.98   1459423
   macro avg       0.54      0.54      0.54   1459423
weighted avg       0.98      0.98      0.98   1459423

