In [46]:
import pandas as pd

In [47]:
uw_path = r"../userfiles/UW_Churn_Pred_Data.xls"

# List all sheet names
xls = pd.ExcelFile(uw_path)
print("Sheets in UW_Churn_Pred_Data.xls:", xls.sheet_names)

Sheets in UW_Churn_Pred_Data.xls: ['Data', 'Legend', 'N10', 'B30 Pro', 'Data Before Feb 13']


In [48]:
sheets_with_churn = ["N10", "B30 Pro", "Data Before Feb 13"]

dfs = {s: pd.read_excel(uw_path, sheet_name=s) for s in sheets_with_churn}

churn_cols = ['Chrn Flag', 'Churn', 'Churn Flag']

for name, df in dfs.items():
    # Find the churn column in this sheet
    for col in churn_cols:
        if col in df.columns:
            df['Churn'] = df[col]  # unify name
            break
    # Drop original churn-like columns
    for col in churn_cols:
        if col in df.columns and col != 'Churn':
            df.drop(columns=col, inplace=True)

In [49]:
# Focus on the "B30 Pro" sheet for comparison
df_b30 = dfs["B30 Pro"]

# Check column names in "B30 Pro"
print("Column names in 'B30 Pro':", df_b30.columns)

# Separate data into Churn = 0 and Churn = 1
df_churn_0 = df_b30[df_b30['Churn'] == 0]  # Churn = 0
df_churn_1 = df_b30[df_b30['Churn'] == 1]  # Churn = 1

# Filter only numeric columns for comparison
numeric_cols = df_b30.select_dtypes(include=['number']).columns

# Compare means of numerical columns
comparison = pd.DataFrame({
    'Churn = 0': df_churn_0[numeric_cols].mean(),
    'Churn = 1': df_churn_1[numeric_cols].mean()
})

# Optionally, calculate additional statistics (std, median)
comparison['Churn = 0 std'] = df_churn_0[numeric_cols].std()
comparison['Churn = 1 std'] = df_churn_1[numeric_cols].std()
comparison['Churn = 0 median'] = df_churn_0[numeric_cols].median()
comparison['Churn = 1 median'] = df_churn_1[numeric_cols].median()

# Show comparison result
print(comparison)

Column names in 'B30 Pro': Index(['model', 'sim_info', 'register_email', 'interval_date',
       'last_boot_date', 'active_date', 'last boot - interval',
       'last boot - active', 'return - activate', 'Return date', 'Channel',
       'Type', 'Warranty', 'Analysis and Verification', 'Defect / Damage type',
       'Responsible Party', 'Spare Parts Usage', 'Final Status', 'Churn'],
      dtype='object')
                      Churn = 0  Churn = 1  Churn = 0 std  Churn = 1 std  \
last boot - interval  -5.286295  14.132320      16.566998      13.217187   
last boot - active    56.229436  17.244678      45.987778      13.048939   
return - activate      0.079381   9.937274       2.496092      14.664233   
Churn                  0.000000   1.000000       0.000000       0.000000   

                      Churn = 0 median  Churn = 1 median  
last boot - interval         -2.176516         16.647616  
last boot - active           46.037627         18.860249  
return - activate             0.000

In [50]:
# Focus on the "B30 Pro" sheet for comparison
df_b30 = dfs["B30 Pro"]

# Separate data into Churn = 0 and Churn = 1
df_churn_0 = df_b30[df_b30['Churn'] == 0]  # Churn = 0
df_churn_1 = df_b30[df_b30['Churn'] == 1]  # Churn = 1

# Count the number of 'True' (registered) and 'False' (not registered) for each group
churn_0_registration_counts = df_churn_0['register_email'].value_counts()
churn_1_registration_counts = df_churn_1['register_email'].value_counts()

# Number of people who registered email in each group (True count)
registered_email_churn_0 = churn_0_registration_counts.get(True, 0)
registered_email_churn_1 = churn_1_registration_counts.get(True, 0)

# Number of people who did not register email in each group (False count)
not_registered_email_churn_0 = churn_0_registration_counts.get(False, 0)
not_registered_email_churn_1 = churn_1_registration_counts.get(False, 0)

# Create a summary for the comparison
email_comparison = {
    'Churn = 0 - Registered Email (True)': registered_email_churn_0,
    'Churn = 1 - Registered Email (True)': registered_email_churn_1,
    'Churn = 0 - Not Registered Email (False)': not_registered_email_churn_0,
    'Churn = 1 - Not Registered Email (False)': not_registered_email_churn_1
}

# Show comparison result
print("Email Registration Comparison Summary:")
for key, value in email_comparison.items():
    print(f"{key}: {value}")

Email Registration Comparison Summary:
Churn = 0 - Registered Email (True): 2294
Churn = 1 - Registered Email (True): 37
Churn = 0 - Not Registered Email (False): 778
Churn = 1 - Not Registered Email (False): 33


Churned users have much lower email registration, which suggests they might be less engaged and less committed to the service.

Non-churned users tend to register their emails at a much higher rate, possibly because email registration is associated with higher engagement, trust, and communication with the service.

In [51]:
# Focus on the "N10" sheet
df_n10 = dfs["N10"]

# Check for missing values in the 'Churn' column and filter them out
df_n10_filtered = df_n10.dropna(subset=['Churn'])

# Check the shape of the filtered dataframe
print(f"Original shape: {df_n10.shape}")
print(f"Filtered shape (missing Churn removed): {df_n10_filtered.shape}")

# Check unique values in the 'Churn' column for the filtered data
churn_values = df_n10_filtered['Churn'].unique()

# Display the unique values
print(f"Unique values in 'Churn' column after filtering: {churn_values}")

# Optionally, check the count of each unique value in the 'Churn' column
churn_counts = df_n10_filtered['Churn'].value_counts()
print("\nChurn value counts:")
print(churn_counts)

Original shape: (970, 19)
Filtered shape (missing Churn removed): (16, 19)
Unique values in 'Churn' column after filtering: [1.]

Churn value counts:
Churn
1.0    16
Name: count, dtype: int64


In [52]:
# Focus on the 16 churned users from N10 sheet
df_n10_churned = df_n10_filtered

# Describe numerical columns for churned users in N10
numerical_columns = df_n10_churned.select_dtypes(include=['number']).columns
numerical_summary_n10 = df_n10_churned[numerical_columns].describe()

# Check categorical columns for churned users in N10
categorical_columns = df_n10_churned.select_dtypes(include=['object']).columns
categorical_summary_n10 = df_n10_churned[categorical_columns].value_counts()

# Display summaries
print("Numerical Summary for Churned Users in N10:")
print(numerical_summary_n10)

print("\nCategorical Summary for Churned Users in N10:")
print(categorical_summary_n10)


Numerical Summary for Churned Users in N10:
           activate  last boot - interval  last boot - active  \
count  1.600000e+01             16.000000           16.000000   
mean   2.024110e+07             -0.674532            1.991594   
std    2.828162e+01              3.050306            3.704524   
min    2.024103e+07            -10.323530            0.000012   
25%    2.024110e+07             -1.004728            0.005654   
50%    2.024111e+07              0.000000            0.278171   
75%    2.024111e+07              0.172807            1.525570   
max    2.024112e+07              4.052465           12.697998   

       Spare Parts Usage  Churn  
count               16.0   16.0  
mean                 0.0    1.0  
std                  0.0    0.0  
min                  0.0    1.0  
25%                  0.0    1.0  
50%                  0.0    1.0  
75%                  0.0    1.0  
max                  0.0    1.0  

Categorical Summary for Churned Users in N10:
model   sim_info 

In [53]:
# Focus on the "B30 Pro" sheet
df_b30_churned = df_b30[df_b30['Churn'] == 1]

# Describe numerical columns for churned users in B30 Pro
numerical_columns_b30 = df_b30_churned.select_dtypes(include=['number']).columns
numerical_summary_b30 = df_b30_churned[numerical_columns_b30].describe()

# Check categorical columns for churned users in B30 Pro
categorical_columns_b30 = df_b30_churned.select_dtypes(include=['object']).columns
categorical_summary_b30 = df_b30_churned[categorical_columns_b30].value_counts()

# Display summaries
print("\nNumerical Summary for Churned Users in B30 Pro:")
print(numerical_summary_b30)

print("\nCategorical Summary for Churned Users in B30 Pro:")
print(categorical_summary_b30)



Numerical Summary for Churned Users in B30 Pro:
       last boot - interval  last boot - active  return - activate  Churn
count             70.000000           70.000000          70.000000   70.0
mean              14.132320           17.244678           9.937274    1.0
std               13.217187           13.048939          14.664233    0.0
min              -10.323530            0.000012           0.000000    1.0
25%                0.039630            2.598261           0.000000    1.0
50%               16.647616           18.860249           0.000000    1.0
75%               22.349184           26.138866          24.523472    1.0
max               39.992176           39.992188          49.755208    1.0

Categorical Summary for Churned Users in B30 Pro:
model   sim_info                                                                                                                                                                         interval_date        last_boot_date       active_

In [54]:
# Check columns in the N10 sheet to confirm the presence of return-related columns
print("Columns in N10 sheet:", df_n10.columns)

# Define the columns related to the return process that should be dropped
return_related_columns = [
    'Return date', 'Channel', 'Type', 'Warranty', 'Analysis and Verification',
    'Defect / Damage type', 'Responsible Party', 'Spare Parts Usage', 'Final Status'
]

# Drop these columns from the N10 dataset
df_n10.drop(columns=return_related_columns, inplace=True)

# Check the remaining columns after dropping return-related columns
print("Remaining columns in N10 after dropping return-related columns:", df_n10.columns)


Columns in N10 sheet: Index(['activate', 'model', 'sim_info', 'register_email', 'interval_date',
       'last_boot_date', 'active_date', 'last boot - interval',
       'last boot - active', 'Return date', 'Channel', 'Type', 'Warranty',
       'Analysis and Verification', 'Defect / Damage type',
       'Responsible Party', 'Spare Parts Usage', 'Final Status', 'Churn'],
      dtype='object')
Remaining columns in N10 after dropping return-related columns: Index(['activate', 'model', 'sim_info', 'register_email', 'interval_date',
       'last_boot_date', 'active_date', 'last boot - interval',
       'last boot - active', 'Churn'],
      dtype='object')


In [55]:
print("Columns in B30 Pro sheet:", df_b30.columns)

# Define the columns related to the return process
return_related_columns = [
    'return - activate', 'Return date', 'Channel', 'Type', 'Warranty', 
    'Analysis and Verification', 'Defect / Damage type', 
    'Responsible Party', 'Spare Parts Usage', 'Final Status'
]

# Drop these columns from the dataset in both sheets
df_b30.drop(columns=return_related_columns, inplace=True)


Columns in B30 Pro sheet: Index(['model', 'sim_info', 'register_email', 'interval_date',
       'last_boot_date', 'active_date', 'last boot - interval',
       'last boot - active', 'return - activate', 'Return date', 'Channel',
       'Type', 'Warranty', 'Analysis and Verification', 'Defect / Damage type',
       'Responsible Party', 'Spare Parts Usage', 'Final Status', 'Churn'],
      dtype='object')


In [56]:
# Check the remaining columns
print("Remaining columns in B30 Pro after dropping return-related columns:", df_b30.columns)
print("Remaining columns in N10 after dropping return-related columns:", df_n10.columns)


Remaining columns in B30 Pro after dropping return-related columns: Index(['model', 'sim_info', 'register_email', 'interval_date',
       'last_boot_date', 'active_date', 'last boot - interval',
       'last boot - active', 'Churn'],
      dtype='object')
Remaining columns in N10 after dropping return-related columns: Index(['activate', 'model', 'sim_info', 'register_email', 'interval_date',
       'last_boot_date', 'active_date', 'last boot - interval',
       'last boot - active', 'Churn'],
      dtype='object')


In [59]:
# Filter churned users (Churn == 1) from both datasets
df_b30_churned = df_b30[df_b30['Churn'] == 1]
df_n10_churned = df_n10[df_n10['Churn'] == 1]

# Extract the relevant columns for comparison
df_b30_churned_comparison = df_b30_churned[['last boot - active', 'last boot - interval']]
df_n10_churned_comparison = df_n10_churned[['last boot - active', 'last boot - interval']]

# Summarize the statistics for comparison
b30_stats = df_b30_churned_comparison.describe()
n10_stats = df_n10_churned_comparison.describe()

# Print out the comparison statistics
print("B30 Pro Churned Users Statistics:")
print(b30_stats)
print("\nN10 Churned Users Statistics:")
print(n10_stats)


B30 Pro Churned Users Statistics:
       last boot - active  last boot - interval
count           70.000000             70.000000
mean            17.244678             14.132320
std             13.048939             13.217187
min              0.000012            -10.323530
25%              2.598261              0.039630
50%             18.860249             16.647616
75%             26.138866             22.349184
max             39.992188             39.992176

N10 Churned Users Statistics:
       last boot - active  last boot - interval
count           16.000000             16.000000
mean             1.991594             -0.674532
std              3.704524              3.050306
min              0.000012            -10.323530
25%              0.005654             -1.004728
50%              0.278171              0.000000
75%              1.525570              0.172807
max             12.697998              4.052465


In [62]:
import pandas as pd

# Assuming the following dataframes are already defined:
# df_b30 (from B30 Pro), df_n10 (from N10)

# 1. Filter churned users (Churn == 1) from both datasets
df_b30_churned = df_b30[df_b30['Churn'] == 1]
df_n10_churned = df_n10[df_n10['Churn'] == 1]

# 2. Filter non-churned users (Churn == 0) from B30 Pro
df_b30_non_churned = df_b30[df_b30['Churn'] == 0]

# 3. Add a 'source' column to indicate the source of churned users
df_b30_churned.loc[:, 'source'] = 'B30 Pro'
df_n10_churned.loc[:, 'source'] = 'N10'

# 4. Concatenate the churned users from B30 Pro and N10
df_churned_combined = pd.concat([df_b30_churned[['last boot - active', 'last boot - interval', 'Churn', 'source']],
                                 df_n10_churned[['last boot - active', 'last boot - interval', 'Churn', 'source']]])

# 5. Add the non-churned users from B30 Pro to the dataset
df_b30_non_churned.loc[:, 'source'] = 'B30 Pro Non-Churned'
df_combined = pd.concat([df_churned_combined, df_b30_non_churned[['last boot - active', 'last boot - interval', 'Churn', 'source']]])

# 6. Perform the comparison on the combined dataset
summary_stats = df_combined.groupby('source').agg({
    'last boot - active': ['mean', 'std', 'min', 'max'],
    'last boot - interval': ['mean', 'std', 'min', 'max'],
    'Churn': 'mean'
})

# Display the summary statistics
print(summary_stats)


                    last boot - active                                   \
                                  mean        std       min         max   
source                                                                    
B30 Pro                      17.244678  13.048939  0.000012   39.992188   
B30 Pro Non-Churned          56.229436  45.987778  0.000000  139.369063   
N10                           1.991594   3.704524  0.000012   12.697998   

                    last boot - interval                                     \
                                    mean        std         min         max   
source                                                                        
B30 Pro                        14.132320  13.217187  -10.323530   39.992176   
B30 Pro Non-Churned            -5.286295  16.566998 -113.939317  113.381921   
N10                            -0.674532   3.050306  -10.323530    4.052465   

                    Churn  
                     mean  
source            

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_b30_churned.loc[:, 'source'] = 'B30 Pro'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_n10_churned.loc[:, 'source'] = 'N10'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_b30_non_churned.loc[:, 'source'] = 'B30 Pro Non-Churned'
