## Terminal

In [1]:
!pip install catboost optuna

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting optuna
  Downloading optuna-4.2.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading optuna-4.2.0-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.4/383.4 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.1-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m

## Libraries

In [2]:
import requests
import gzip
import numpy as np
import shutil
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import missingno as msno



## Download and Pandafy data

In [3]:
# Step 1: Download all files
base_url = "https://storage.googleapis.com/sadedegel/dataset/tt-capstone/capstone.{}.jsonl.gz"

for i in range(1, 11):
    url = base_url.format(i)
    response = requests.get(url)
    with open(f"capstone.{i}.jsonl.gz", "wb") as file:
        file.write(response.content)
    print(f"Downloaded capstone.{i}.jsonl.gz")

# Step 2: Unzip the files
for i in range(1, 11):
    gz_file = f"capstone.{i}.jsonl.gz"
    jsonl_file = f"capstone.{i}.jsonl"
    with gzip.open(gz_file, 'rb') as f_in:
        with open(jsonl_file, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    print(f"Unzipped {gz_file} to {jsonl_file}")

# Step 3: Load JSONL files into DataFrames and concatenate
dfs = []

for i in range(1, 11):
    jsonl_file = f"capstone.{i}.jsonl"
    df = pd.read_json(jsonl_file, lines=True)
    dfs.append(df)
    print(f"Loaded {jsonl_file} into DataFrame")

df_original = pd.concat(dfs, ignore_index=True)

# Step 4: Verify the combined DataFrame
print(f"Combined DataFrame shape: {df.shape}")
print(df.info())
print(df.head())

Downloaded capstone.1.jsonl.gz
Downloaded capstone.2.jsonl.gz
Downloaded capstone.3.jsonl.gz
Downloaded capstone.4.jsonl.gz
Downloaded capstone.5.jsonl.gz
Downloaded capstone.6.jsonl.gz
Downloaded capstone.7.jsonl.gz
Downloaded capstone.8.jsonl.gz
Downloaded capstone.9.jsonl.gz
Downloaded capstone.10.jsonl.gz
Unzipped capstone.1.jsonl.gz to capstone.1.jsonl
Unzipped capstone.2.jsonl.gz to capstone.2.jsonl
Unzipped capstone.3.jsonl.gz to capstone.3.jsonl
Unzipped capstone.4.jsonl.gz to capstone.4.jsonl
Unzipped capstone.5.jsonl.gz to capstone.5.jsonl
Unzipped capstone.6.jsonl.gz to capstone.6.jsonl
Unzipped capstone.7.jsonl.gz to capstone.7.jsonl
Unzipped capstone.8.jsonl.gz to capstone.8.jsonl
Unzipped capstone.9.jsonl.gz to capstone.9.jsonl
Unzipped capstone.10.jsonl.gz to capstone.10.jsonl
Loaded capstone.1.jsonl into DataFrame
Loaded capstone.2.jsonl into DataFrame
Loaded capstone.3.jsonl into DataFrame
Loaded capstone.4.jsonl into DataFrame
Loaded capstone.5.jsonl into DataFrame
Lo

In [6]:
df = df_original.copy()

In [7]:
df.to_parquet("/content/drive/MyDrive/raw_dataframe.parquet", engine="pyarrow")

In [None]:
df = pd.read_csv("/content/drive/MyDrive/raw_dataframe.csv")
df.head()

Unnamed: 0,id,age,tenure,service_type,avg_call_duration,data_usage,roaming_usage,monthly_charge,overdue_payments,auto_payment,avg_top_up_count,call_drops,customer_support_calls,satisfaction_score,apps,churn
0,51893f29-e6c2-45d3-807c-e1280d3d7b90,18,53.0,Prepaid,106.74,139.72,47.31,59.45,0,,25,18.0,13,1.34,[],False
1,a568caf1-d851-4847-a9f5-20ef9017fa92,26,15.0,Prepaid,31.55,12.14,21.52,1221.65,0,,51,8.0,3,2.57,[],False
2,c611bf0e-a013-44dc-9939-bd33dab16d14,32,152.0,Postpaid,30.64,10.17,31.11,1170.45,0,1.0,0,13.0,10,8.02,[],False
3,9284b7b8-a4ef-49a7-9fa4-333954491f57,33,35.0,Prepaid,,,36.03,2418.91,0,,19,7.0,11,5.96,['RitimGo'],False
4,e205b674-a6e6-43c5-aed7-9497c37c5c82,18,243.0,Prepaid,85.62,164.79,46.4,1518.19,0,,99,15.0,6,8.29,[],False


## Data Preprocessing

### Missing

In [None]:
df['auto_payment_true'] = (df['auto_payment'] == True).astype(int)
df['auto_payment_false'] = (df['auto_payment'] == False).astype(int)
df['auto_payment_unknown'] = df['auto_payment'].isnull().astype(int)
df.drop(columns=['auto_payment'], inplace=True)

In [None]:
high_missing_cols = [col for col in df.columns if df[col].isnull().mean() > 0.25]

In [None]:
# df.value_counts("avg_call_duration")

In [None]:
# msno.matrix(df);

In [None]:
df["service_type"].value_counts()

Unnamed: 0_level_0,count
service_type,Unnamed: 1_level_1
Prepaid,3336442
Postpaid,3332346
Broadband,3331212


In [None]:
# df[df["service_type"]=="Prepaid"].describe() # 0 cols = "overdue_payments", "auto_payment_true", "auto_payment_unknown"
# df[df["service_type"]=="Postpaid"].describe() # 0 cols = "avg_top_up_count"
# df[df["service_type"]=="Broadband"].describe() # 0 cols = "avg_call_duration", "roaming_usage", "avg_top_up_count", "call_drops"

In [None]:
# For Prepaid: set the specified columns to 0
prepaid_cols = ["overdue_payments", "auto_payment_true", "auto_payment_unknown"]
df[prepaid_cols] = df[prepaid_cols].fillna(0, inplace=True)

# For Postpaid: set the specified column to 0
postpaid_cols = ["avg_top_up_count"]
df[postpaid_cols] = df[postpaid_cols].fillna(0, inplace=True)

# For Broadband: set the specified columns to 0
broadband_cols = ["avg_call_duration", "roaming_usage", "avg_top_up_count", "call_drops"]
df[broadband_cols] = df[broadband_cols].fillna(0)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[prepaid_cols] = df[prepaid_cols].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[postpaid_cols] = df[postpaid_cols].fillna(0, inplace=True)
  df[broadband_cols] = df[broadband_cols].fillna(0)


In [None]:
any_missing_cols = [col for col in df.columns if df[col].isnull().sum() > 1]
any_missing_cols

['tenure',
 'data_usage',
 'monthly_charge',
 'overdue_payments',
 'auto_payment_true',
 'auto_payment_unknown']

In [None]:
from sklearn.linear_model import BayesianRidge
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
# df.fillna(0, inplace=True)
imputer = IterativeImputer(estimator=BayesianRidge(), random_state=42)
df[any_missing_cols] = imputer.fit_transform(df[any_missing_cols])
df.head()




ValueError: Columns must be same length as key

In [None]:
df.value_counts("avg_call_duration")

### One Hot

In [None]:
cat_cols = ["service_type"]
df["apps"].astype(list, inplace=True)
df.head()

In [None]:

# Initialize the MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Fit and transform the 'apps' column
apps_encoded = mlb.fit_transform(df['apps'])

# Create a DataFrame from the encoded matrix with appropriate column names
apps_df = pd.DataFrame(apps_encoded, columns=mlb.classes_, index=df.index)

# Concatenate the new one-hot encoded columns with the original DataFrame,
# dropping the original 'apps' column if desired
df_encoded = df.join(apps_df)
df_encoded.drop(columns=['apps'], inplace=True)



In [None]:
df.head()

In [None]:
encoder = OneHotEncoder(sparse_output=False)

one_hot_encoded = encoder.fit_transform(df[cat_cols])

# Create a DataFrame with the encoded columns
one_hot_df = pd.DataFrame(one_hot_encoded,
                          columns=encoder.get_feature_names_out(cat_cols))

# Concatenate the one-hot encoded columns with the original DataFrame
df = pd.concat([df.drop(cat_cols, axis=1), one_hot_df], axis=1)


In [None]:
df.describe().apply(lambda s: s.apply(lambda x: format(x, '.2f')))

In [None]:
df.head()

In [None]:
msno.matrix(df);

In [None]:
df.info()

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

# Define the file path in your Google Drive
file_path = '/content/drive/My Drive/dataframe.csv'

# Export the DataFrame to a CSV file without the index column
df.to_csv(file_path, index=False)


In [None]:
df = pd.read_csv("/content/drive/My Drive/dataframe.csv")

In [None]:
df.tail()

In [None]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Here we drop the "id" column because it’s an identifier.
X = df.drop(columns=['churn', 'id'])
y = df['churn']

In [None]:
cat_features = list(X.select_dtypes(include=['object', 'category']).columns)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler,LabelEncoder
# from sklearn.metrics import accuracy_score, roc_curve, recall_score, confusion_matrix, roc_auc_score, precision_score
# import optuna
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier
# import lightgbm as lgb
# from xgboost import XGBClassifier
# from catboost import CatBoostClassifier

# accuracy= []
# recall =[]
# roc_auc= []
# precision = []
# # With scale_pos_weight=5, minority class gets 5 times more impact and 5 times more correction than errors made on majority class.
# catboost_5 = CatBoostClassifier(verbose=False,random_state=42,scale_pos_weight=25)
# #Train the Model
# catboost_5.fit(X_train, y_train,eval_set=(X_test, y_test))
# #Take Predictions
# y_pred = catboost_5.predict(X_test)
# #Calculate Metrics
# accuracy.append(round(accuracy_score(y_test, y_pred),4))
# recall.append(round(recall_score(y_test, y_pred),4))
# roc_auc.append(round(roc_auc_score(y_test, y_pred),4))
# precision.append(round(precision_score(y_test, y_pred),4))
# model_names = ['Catboost_adjusted_weight_5']
# result_df1 = pd.DataFrame({'Accuracy':accuracy,'Recall':recall, 'Roc_Auc':roc_auc, 'Precision':precision}, index=model_names)
# result_df1

In [None]:
# import xgboost as xgb
# from xgboost import XGBClassifier
# from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score, roc_auc_score
# from imblearn.over_sampling import SMOTE

# # -------------------------------
# # Assume you already have your data:
# # X_train, X_test, y_train, y_test
# # If you have categorical features, ensure they are preprocessed (e.g., one-hot encoding) since XGBoost requires numeric input.
# # -------------------------------

# # Apply SMOTE to the training data to balance the classes
# smote = SMOTE(k_neighbors=3, random_state=42)
# X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

# # Initialize the XGBClassifier. Adjust parameters as needed.
# model = XGBClassifier(
#     n_estimators=200,           # equivalent to iterations in CatBoost
#     learning_rate=0.03,
#     max_depth=3,
#     objective='binary:logistic',  # for binary classification
#     use_label_encoder=False,       # to avoid label encoder warnings
#     eval_metric='logloss'          # evaluation metric similar to CatBoost's loss_function
# )


# # Train the model using the SMOTE-resampled training data
# model.fit(X_train_sm, y_train_sm)

# # Make predictions on the test set.
# y_pred = model.predict(X_test)

# # For AUC, get the predicted probabilities for the positive class.
# y_pred_proba = model.predict_proba(X_test)[:, 1]

# # Evaluate the model’s performance.
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))
# print("F1 Score:", f1_score(y_test, y_pred))
# print("Precision:", precision_score(y_test, y_pred))
# print("Recall:", recall_score(y_test, y_pred))
# print("AUC:", roc_auc_score(y_test, y_pred_proba))


In [None]:
# import xgboost as xgb
# from xgboost import XGBClassifier
# from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score, roc_auc_score

# # If X_train and X_test are pandas DataFrames and you have a list of categorical features,
# # you can one-hot encode them as follows (optional if already encoded):
# # import pandas as pd
# # X_train = pd.get_dummies(X_train, columns=cat_features)
# # X_test = pd.get_dummies(X_test, columns=cat_features)
# #
# # # Make sure X_train and X_test have the same columns after encoding:
# # X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# # Initialize the XGBClassifier. Adjust parameters as needed.
# model = XGBClassifier(
#     n_estimators=200,           # equivalent to iterations in CatBoost
#     learning_rate=0.03,
#     max_depth=3,
#     objective='binary:logistic',  # for binary classification
#     use_label_encoder=False,      # suppress a warning related to label encoding
#     eval_metric='logloss'         # evaluation metric similar to CatBoost's loss_function
# )

# # Train the model. (Ensure that X_train is numeric and preprocessed if needed.)
# model.fit(X_train, y_train)

# # Make predictions on the test set.
# y_pred = model.predict(X_test)

# # For AUC, get the predicted probabilities for the positive class.
# y_pred_proba = model.predict_proba(X_test)[:, 1]

# # Evaluate the model’s performance.
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))
# print("F1 Score:", f1_score(y_test, y_pred))
# print("Precision:", precision_score(y_test, y_pred))
# print("Recall:", recall_score(y_test, y_pred))
# print("AUC:", roc_auc_score(y_test, y_pred_proba))


In [None]:
# import pandas as pd
# import matplotlib.pyplot as plt

# # Assuming the model has already been trained as in your code snippet:
# # model.fit(X_train, y_train, cat_features=cat_features)

# # Get the feature importances from the trained model.
# # CatBoost's get_feature_importance() returns an array of importance scores.
# feature_importances = model.get_feature_importance()

# # If X_train is a DataFrame, get the column names.
# # If you're using a NumPy array, you might manually specify feature names in a list.
# if hasattr(X_train, "columns"):
#     feature_names = X_train.columns
# else:
#     # Replace the following list with your actual feature names if necessary.
#     feature_names = [f'Feature {i}' for i in range(X_train.shape[1])]

# # Combine the feature names and their corresponding importances into a DataFrame.
# importance_df = pd.DataFrame({
#     'Feature': feature_names,
#     'Importance': feature_importances
# })

# # Sort the DataFrame by importance (highest first).
# importance_df = importance_df.sort_values(by='Importance', ascending=False)

# # Print the feature importances.
# print("Feature Importances:")
# print(importance_df)

# # Optionally, plot the feature importances.
# plt.figure(figsize=(10, 6))
# plt.barh(importance_df['Feature'], importance_df['Importance'], color='skyblue')
# plt.xlabel('Importance')
# plt.ylabel('Feature')
# plt.title('CatBoost Feature Importances')
# plt.gca().invert_yaxis()  # Invert y-axis so the most important features are on top.
# plt.tight_layout()
# plt.show()
