In [2]:
import pandas as pd
import numpy as np
import duckdb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb


In [3]:

db_path = "../Data/capstone.db"
# Veri yükleme (örnek olarak dosya adı 'churn_data.csv')
with duckdb.connect(db_path) as con:
    df = con.sql("select * from churn").df()


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [4]:
apps = ['İzleGo', 'RitimGo', 'CüzdanX', 'HızlıPazar', 'Konuşalım']
df[apps] = 0
for idx, app_list in enumerate(df["apps"]):
    for app in app_list:
        if app in apps:
            df.loc[idx,app] = 1

df.drop(axis=1,columns=["apps"], inplace=True)

In [5]:
with duckdb.connect(db_path) as con:
    con.execute(("create table if not exists new_churn as select * from df"))

In [5]:
def analyze_data(dataframe, cat_th=10, car_th=20):
    """
    It gives the names of categorical, numerical and categorical but cardinal variables in the data set. It also performs incomplete data analysis.
    Parameters
    ------
        dataframe: dataframe
            The dataframe from which variable names are to be retrieved
        cat_th: int, optional
            Class threshold value for numeric but categorical variables
        car_th: int, optional
            Class threshold for categorical but cardinal variables

    Returns
    ------
        cat_cols: list
            Categorical variable list
        num_cols: list
            Numerik değişken listesi
        cat_but_car: list
            Categorical view cardinal variable list
    """
    cat_cols = [col for col in dataframe.columns if dataframe[col].dtype == "O"]
    num_cols = [col for col in dataframe.columns if dataframe[col].dtype != "O"]

    num_but_cat = [col for col in num_cols if dataframe[col].nunique() < cat_th]
    cat_but_car = [col for col in cat_cols if dataframe[col].nunique() > car_th]

    cat_cols = [col for col in cat_cols if col not in cat_but_car]
    num_cols = [col for col in num_cols if col not in num_but_cat]

    cat_cols = cat_cols + num_but_cat
    
    print(f"Number of Observations: {dataframe.shape[0]}")
    print(f"Number of Variables: {dataframe.shape[1]}")
    print(f'Cat cols: {len(cat_cols)}, Num cols: {len(num_cols)}, Cat but car cols: {len(cat_but_car)}')

    return cat_cols, num_cols, cat_but_car

In [53]:
cat_features, num_features, cat_but_car = analyze_data(df)

Number of Observations: 10000000
Number of Variables: 20
Cat cols: 9, Num cols: 10, Cat but car cols: 1


In [7]:
for col in cat_features:
    print(df[col].value_counts())

service_type
Prepaid      3336442
Postpaid     3332346
Broadband    3331212
Name: count, dtype: int64
auto_payment
False    3328768
True     3326484
Name: count, dtype: int64
overdue_payments
0    4446094
2    1112157
5    1112062
1    1110506
3    1109719
4    1109462
Name: count, dtype: int64
churn
False    9866347
True      133653
Name: count, dtype: int64
İzleGo
0    9699720
1     300280
Name: count, dtype: int64
RitimGo
0    9001098
1     998902
Name: count, dtype: int64
CüzdanX
0    9899571
1     100429
Name: count, dtype: int64
HızlıPazar
0    9900086
1      99914
Name: count, dtype: int64
Konuşalım
0    8001366
1    1998634
Name: count, dtype: int64


In [8]:
num_features

['age',
 'tenure',
 'avg_call_duration',
 'data_usage',
 'roaming_usage',
 'monthly_charge',
 'avg_top_up_count',
 'call_drops',
 'customer_support_calls',
 'satisfaction_score']

In [9]:

# Eksik verileri doldurma
df.fillna(df.median(numeric_only=True), inplace=True)  # Sayısal eksik değerleri medyan ile doldur

df.fillna("Unknown", inplace=True)  # Kategorik eksik değerleri 'Unknown' ile doldur

# Hedef değişken (churn)
y = df["churn"].astype(int)
X = df.drop(columns=["id", "churn"])  # ID'yi ve hedef değişkeni çıkar
X["auto_payment"] = X["auto_payment"].astype(str)


In [10]:
import numpy as np
from typing import List, Tuple, Union, Optional

def detect_outliers(data: List[Union[int, float]], 
                   method: str = 'zscore',
                   threshold: float = 3.0) -> Tuple[List[Union[int, float]], List[int]]:
    """
    Detect outliers in a dataset using various statistical methods.
    
    Parameters:
    -----------
    data : List[Union[int, float]]
        Input data as a list of numbers
    method : str
        Method to use for outlier detection:
        - 'zscore': Uses Z-score method
        - 'iqr': Uses Interquartile Range method
        - 'modified_zscore': Uses modified Z-score method
    threshold : float
        Threshold for outlier detection:
        - For zscore/modified_zscore: number of standard deviations (default: 3.0)
        - For IQR: multiplier for IQR range (default: 3.0)
    
    Returns:
    --------
    Tuple[List[Union[int, float]], List[int]]
        - List of outlier values
        - List of indices where outliers were found
    """
    if not data:
        return [], []
    
    # Convert to numpy array for calculations
    arr = np.array(data)
    
    if method.lower() == 'zscore':
        z_scores = np.abs((arr - np.mean(arr)) / np.std(arr))
        outlier_indices = np.where(z_scores > threshold)[0]
    
    elif method.lower() == 'iqr':
        q1 = np.percentile(arr, 25)
        q3 = np.percentile(arr, 75)
        iqr = q3 - q1
        lower_bound = q1 - threshold * iqr
        upper_bound = q3 + threshold * iqr
        outlier_indices = np.where((arr < lower_bound) | (arr > upper_bound))[0]
    
    elif method.lower() == 'modified_zscore':
        median = np.median(arr)
        mad = np.median(np.abs(arr - median))
        modified_z_scores = np.abs(0.6745 * (arr - median) / mad)
        outlier_indices = np.where(modified_z_scores > threshold)[0]
    
    else:
        raise ValueError("Method must be one of: 'zscore', 'iqr', 'modified_zscore'")
    
    outlier_values = arr[outlier_indices].tolist()
    return outlier_values, outlier_indices.tolist()

def analyze_outliers(data: List[Union[int, float]], 
                    threshold: float = 3.0) -> dict:
    """
    Analyze outliers using all available methods and return comprehensive results.
    
    Parameters:
    -----------
    data : List[Union[int, float]]
        Input data as a list of numbers
    threshold : float
        Threshold for outlier detection
        
    Returns:
    --------
    dict
        Dictionary containing outlier analysis results for each method
    """
    methods = ['zscore', 'iqr', 'modified_zscore']
    results = {}
    
    for method in methods:
        outliers, indices = detect_outliers(data, method=method, threshold=threshold)
        results[method] = {
            'outliers': outliers,
            'indices': indices,
            'count': len(outliers),
            'percentage': (len(outliers) / len(data)) * 100 if data else 0
        }
    
    return results

In [36]:
list_of_outliers = []
for col in num_features:
    a = analyze_outliers(list(df[col]))
    
    list_of_outliers.extend(a["zscore"]["indices"])


divide by zero encountered in divide


invalid value encountered in divide



In [37]:
len(set(list_of_outliers))

70044

In [14]:
df[cat_features]

Unnamed: 0,service_type,auto_payment,overdue_payments,churn,İzleGo,RitimGo,CüzdanX,HızlıPazar,Konuşalım
0,Prepaid,Unknown,0,False,0,0,0,0,0
1,Prepaid,Unknown,0,False,0,0,0,0,0
2,Postpaid,True,0,False,0,0,0,0,0
3,Prepaid,Unknown,0,False,0,1,0,0,0
4,Prepaid,Unknown,0,False,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
9999995,Broadband,False,3,False,0,0,0,0,1
9999996,Prepaid,Unknown,0,False,0,0,0,0,0
9999997,Prepaid,Unknown,0,False,0,0,0,0,1
9999998,Broadband,True,5,False,0,0,0,0,1


In [16]:

# Sayısal ve kategorik değişkenleri ayırma
cat_features, num_features, cat_but_car = analyze_data(df[columns])

cat_features.remove("churn")

Number of Observations: 10000000
Number of Variables: 20
Cat cols: 9, Num cols: 10, Cat but car cols: 1


In [40]:
import plotly.express as px
px.imshow(df[num_features + ["churn"]+["overdue_payments","İzleGo","RitimGo","CüzdanX","HızlıPazar","Konuşalım"]].corr(), text_auto=True,height=1800)

In [17]:
X[cat_features]

Unnamed: 0,service_type,auto_payment,overdue_payments,İzleGo,RitimGo,CüzdanX,HızlıPazar,Konuşalım
0,Prepaid,Unknown,0,0,0,0,0,0
1,Prepaid,Unknown,0,0,0,0,0,0
2,Postpaid,True,0,0,0,0,0,0
3,Prepaid,Unknown,0,0,1,0,0,0
4,Prepaid,Unknown,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
9999995,Broadband,False,3,0,0,0,0,1
9999996,Prepaid,Unknown,0,0,0,0,0,0
9999997,Prepaid,Unknown,0,0,0,0,0,1
9999998,Broadband,True,5,0,0,0,0,1


In [18]:
X["CüzdanX"] = X["CüzdanX"].astype("int8")
X["age"] = X["age"].astype("int16")
X["HızlıPazar"] = X["HızlıPazar"].astype("int8")
X["Konuşalım"] = X["Konuşalım"].astype("int8")
X["RitimGo"] = X["RitimGo"].astype("int8")
X["overdue_payments"] = X["overdue_payments"].astype("int8")

In [19]:

# Pipeline oluşturma
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), num_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
])

model = Pipeline([
    ("preprocess", preprocessor),
    ("classifier", xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=100, max_depth=6, learning_rate=0.1, subsample=0.8, colsample_bytree=0.8))
])


In [20]:

# Veri setini eğitim ve test olarak ayırma
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [21]:

# Modeli eğitme
model.fit(X_train, y_train)



Parameters: { "use_label_encoder" } are not used.




In [46]:

# Tahmin yapma
y_pred = model.predict(X_test)
y_prob = (model.predict_proba(X_test)[:, 1] > 0.1).astype(int)

# Model değerlendirme
print(classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_prob))
print("pr-AUC Score:", average_precision_score(y_test, y_prob))



Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



              precision    recall  f1-score   support

           0       0.99      1.00      0.99   1973269
           1       0.00      0.00      0.00     26731

    accuracy                           0.99   2000000
   macro avg       0.49      0.50      0.50   2000000
weighted avg       0.97      0.99      0.98   2000000

ROC-AUC Score: 0.5001263274251228
pr-AUC Score: 0.013385442239347574


In [23]:
from sklearn.metrics import confusion_matrix

In [None]:
def myAccuracy(y_test,y_pred):
    confusion_matrix()

In [42]:
y_pred.sum()

np.int64(0)

In [43]:
y_prob.sum()

np.int64(100)