In [2]:
import pandas as pd
import numpy as np
# Path of the input dataset; it is read into `df` with the pyarrow engine.
parquet_file_path = 'combined_capstone.parquet'
df = pd.read_parquet(path=parquet_file_path, engine='pyarrow')


In [3]:
# Summary statistics, with every value rendered as a fixed two-decimal string
# so large counts are not shown in scientific notation.
df.describe().apply(lambda column: column.map('{:.2f}'.format))

Unnamed: 0,age,tenure,avg_call_duration,data_usage,roaming_usage,monthly_charge,overdue_payments,auto_payment,avg_top_up_count,call_drops,customer_support_calls,satisfaction_score
count,10000000.0,9975200.0,6335687.0,9500467.0,6668788.0,9749929.0,10000000.0,6655252.0,10000000.0,6668788.0,10000000.0,10000000.0
mean,30.38,158.25,60.51,100.07,29.99,743.77,1.67,0.5,17.52,10.0,10.0,5.5
std,11.4,94.75,34.34,57.7,17.32,597.31,1.83,0.5,29.48,6.06,6.06,2.6
min,18.0,1.0,1.0,0.1,0.0,30.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,20.0,78.0,30.77,50.1,14.98,317.76,0.0,0.0,0.0,5.0,5.0,3.25
50%,28.0,156.0,60.5,100.05,30.0,480.61,1.0,0.0,0.0,10.0,10.0,5.5
75%,38.0,234.0,90.27,150.06,44.99,1018.33,3.0,1.0,29.0,15.0,15.0,7.75
max,80.0,754.0,120.0,200.0,60.0,2500.0,5.0,1.0,100.0,20.0,20.0,10.0


In [4]:
# Swap the nullable auto_payment column for three 0/1 indicator columns:
# one each for True, False and missing. pop() removes the source column from
# df up front; the indicators are then appended at the end of the frame.
auto_payment = df.pop('auto_payment')
df['auto_payment_true'] = auto_payment.eq(True).astype(int)
df['auto_payment_false'] = auto_payment.eq(False).astype(int)
df['auto_payment_unknown'] = auto_payment.isna().astype(int)
del auto_payment

df.columns

Index(['id', 'age', 'tenure', 'service_type', 'avg_call_duration',
       'data_usage', 'roaming_usage', 'monthly_charge', 'overdue_payments',
       'avg_top_up_count', 'call_drops', 'customer_support_calls',
       'satisfaction_score', 'apps', 'churn', 'auto_payment_true',
       'auto_payment_false', 'auto_payment_unknown'],
      dtype='object')

In [5]:
# Call-related metrics are forced to 0 for Broadband rows (presumably because
# they do not apply to that service type — confirm with the data owner).
is_broadband = df["service_type"] == "Broadband"
df.loc[is_broadband, ["avg_call_duration", "roaming_usage", "call_drops"]] = 0

# Mean-impute the remaining gaps in avg_call_duration. The mean is taken over
# non-Broadband rows only: the zeros written above are placeholders, and
# including them would pull the value imputed for Prepaid/Postpaid customers
# well below their typical call duration.
call_duration_mean = df.loc[~is_broadband, "avg_call_duration"].mean()
df["avg_call_duration"] = df["avg_call_duration"].fillna(call_duration_mean)

# These columns are not touched by the Broadband override, so the plain
# column mean is used for them.
for column in ("tenure", "data_usage", "monthly_charge"):
    df[column] = df[column].fillna(df[column].mean())

In [6]:
# Slim id / monthly charge / churn frame taken from the current state of df.
product_df = df.loc[:, ["id", "monthly_charge", "churn"]]

In [7]:
# Print df's column dtypes and memory footprint as a sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 18 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   id                      object 
 1   age                     int64  
 2   tenure                  float64
 3   service_type            object 
 4   avg_call_duration       float64
 5   data_usage              float64
 6   roaming_usage           float64
 7   monthly_charge          float64
 8   overdue_payments        int64  
 9   avg_top_up_count        int64  
 10  call_drops              float64
 11  customer_support_calls  int64  
 12  satisfaction_score      float64
 13  apps                    object 
 14  churn                   bool   
 15  auto_payment_true       int64  
 16  auto_payment_false      int64  
 17  auto_payment_unknown    int64  
dtypes: bool(1), float64(7), int64(7), object(3)
memory usage: 1.3+ GB


In [8]:
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder

# Expand the multi-valued 'apps' column into one 0/1 indicator column per app.
mlb = MultiLabelBinarizer()
apps_encoded = mlb.fit_transform(df['apps'])

# Column names are the classes learned during fitting; reusing df's index keeps
# the indicator rows aligned with df for the column-wise concat below.
apps_df = pd.DataFrame(data=apps_encoded, index=df.index, columns=mlb.classes_)

# Replace the raw 'apps' column with its indicator columns.
df = pd.concat([df.drop(columns=['apps']), apps_df], axis='columns')


In [9]:
# One-hot encode the single-valued categorical columns.
encoder = OneHotEncoder(sparse_output=False)
cat_cols = ["service_type"]
one_hot_encoded = encoder.fit_transform(df[cat_cols])

# Build the encoded frame on df's own index. pd.concat(axis=1) pairs rows by
# index label, so a frame carrying a fresh 0..n-1 index would be misaligned
# (and both sides padded with NaN) whenever df's index is not the default range.
one_hot_df = pd.DataFrame(one_hot_encoded,
                          columns=encoder.get_feature_names_out(cat_cols),
                          index=df.index)

# Replace the raw categorical columns with their encoded counterparts.
df = pd.concat([df.drop(cat_cols, axis=1), one_hot_df], axis=1)
df.columns

Index(['id', 'age', 'tenure', 'avg_call_duration', 'data_usage',
       'roaming_usage', 'monthly_charge', 'overdue_payments',
       'avg_top_up_count', 'call_drops', 'customer_support_calls',
       'satisfaction_score', 'churn', 'auto_payment_true',
       'auto_payment_false', 'auto_payment_unknown', 'CüzdanX', 'HızlıPazar',
       'Konuşalım', 'RitimGo', 'İzleGo', 'service_type_Broadband',
       'service_type_Postpaid', 'service_type_Prepaid'],
      dtype='object')

### New Features

In [10]:

def safe_divide(numerator, denominator, fill_value=0):
    """Divide element-wise, substituting ``fill_value`` for undefined results.

    Args:
        numerator: Values to divide (a pandas Series in this notebook).
        denominator: Divisor; must provide ``.replace`` (e.g. a pandas Series).
        fill_value: Value used wherever the quotient is undefined. Defaults to 0.

    Returns:
        The quotient, with every position whose result would be NaN or
        +/-inf (zero or missing denominator, missing or infinite numerator)
        set to ``fill_value``.
    """
    # Zero divisors become NaN first, so the division itself never yields inf
    # from a divide-by-zero.
    nonzero_denominator = denominator.replace(0, np.nan)
    return (
        (numerator / nonzero_denominator)
        .replace([np.inf, -np.inf], np.nan)  # any leftover infinities -> NaN
        .fillna(fill_value)
    )

# Ratio features: new column -> (numerator column, denominator column).
# Each one goes through safe_divide, so an undefined quotient (zero or missing
# denominator, missing numerator) is stored as 0 instead of inf/NaN.
ratio_features = {
    "data_per_month": ("data_usage", "tenure"),
    "charge_per_gb": ("monthly_charge", "data_usage"),
    "support_call_per_month": ("customer_support_calls", "tenure"),
    "support_call_percent": ("customer_support_calls", "roaming_usage"),
    "drop_per_life": ("call_drops", "tenure"),
    "overdue_per_usr": ("overdue_payments", "tenure"),
}
for feature_name, (numerator_col, denominator_col) in ratio_features.items():
    df[feature_name] = safe_divide(df[numerator_col], df[denominator_col])

# The remaining features need no division.
# Number of apps used: sum of the per-app indicator columns.
df["total_apps_used"] = (
    df['CüzdanX']
    + df['HızlıPazar']
    + df['Konuşalım']
    + df['RitimGo']
    + df['İzleGo']
)

# Overdue payment count, kept only for customers flagged auto_payment_true.
df["overdue_but_auto_payment"] = (
    df["overdue_payments"] * df["auto_payment_true"]
).fillna(0)

# Tenure weighted by satisfaction (the Turkish column name reads roughly
# "satisfied over the years").
df["yıllar_boyu_memnun"] = df["tenure"].mul(df["satisfaction_score"])

# Interaction of support calls, call drops and overdue payments.
df["problematic"] = (
    df["customer_support_calls"]
    .mul(df["call_drops"])
    .mul(df["overdue_payments"])
)


In [11]:

# Bin edges and labels for the grouped versions of 'tenure' and 'age'.
# right=False below makes every interval left-closed, e.g. [0, 6), [6, 12), ...
# NOTE(review): a value at or above the last edge (2000 / 100) lands in no bin
# and ends up with all of its dummy columns set to False/0.
tenure_bins = [0, 6, 12, 24, 48, 2000]
tenure_labels = ['0-6', '6-12', '12-24', '24-48', '48+']
age_bins = [0, 18, 25, 35, 45, 100]
age_labels = ['0-18', '18-25', '25-35', '35-45', '45+']

# Add '<column>_group' for each binned column, tenure first, then age.
for source_col, bin_edges, bin_labels in (
    ('tenure', tenure_bins, tenure_labels),
    ('age', age_bins, age_labels),
):
    df[f'{source_col}_group'] = pd.cut(
        df[source_col], bins=bin_edges, labels=bin_labels, right=False
    )

# One-hot encode both group columns; get_dummies drops the originals.
df = pd.get_dummies(df, columns=['tenure_group', 'age_group'])

# Report the resulting frame shape.
print(df.shape)


(10000000, 44)


In [12]:
# Persist the fully engineered feature frame.
df.to_parquet(path='processed.parquet', engine='pyarrow')