In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
uw_path = r"../userfiles/UW_Churn_Pred_Data.xls"

# Open the workbook once and reuse the same handle for the sheet listing and
# the load. The context manager closes the underlying file when the block
# exits, so `xls` must not be used for further parsing after this cell.
with pd.ExcelFile(uw_path) as xls:
    # List all sheet names
    print("Sheets in UW_Churn_Pred_Data.xls:", xls.sheet_names)

    # Load the first sheet of the workbook
    df_uw = xls.parse(sheet_name=xls.sheet_names[0])

print("Shape of loaded sheet:", df_uw.shape)

# Preview the first 5 rows (last expression, so the notebook renders it)
df_uw.head()

Sheets in UW_Churn_Pred_Data.xls: ['Data', 'Legend', 'N10', 'B30 Pro', 'Data Before Feb 13']
Shape of loaded sheet: (4349, 28)


Unnamed: 0,Sale Channel,Model,Warranty,Feedback,Verification,Defect / Damage type,Responsible Party,Type,Spare Parts Used if returned,Final Status,...,Bluetooth (# of pairs),Wifi/Internet Connection,Wallpaper,Registered Email,last boot - activate,last boot - interval,interval date,last bootl date,activate date,Age Range
0,B2C 3rd party,B20,Yes,Amazon Return,"no damages, no issues",No defect,Customer: Remorse,Return,No,Refurb A,...,1,True,1,1.0,0.0,0.0,2024-11-18 00:52:52,2024-11-18 00:52:52,2024-11-18 00:52:52,30-40
1,B2C 3rd party,A23 Plus,Yes,Amazon Return,"no damages, no issues",No defect,Customer: Remorse,Return,No,Refurb B,...,0,True,1,1.0,2.3e-05,0.0,2024-11-18 00:35:59,2024-11-18 00:35:59,2024-11-18 00:35:57,30-40
2,B2C 3rd party,A15,Yes,,,,,,,,...,5,True,2,,0.0,0.0,2024-11-17 23:42:12,2024-11-17 23:42:12,2024-11-17 23:42:12,20-30
3,B2C 3rd party,A23 Plus,Yes,Amazon Return,"no damages, no issues",No defect,Customer: Remorse,Return,No,Refurb B,...,0,True,1,0.0,0.151493,0.151481,2024-11-17 23:28:58,2024-11-18 03:07:06,2024-11-17 23:28:57,20-30
4,B2C 3rd party,B20,Yes,Amazon Return,"no damages, no issues",No defect,Customer: Remorse,Return,No,Refurb B,...,1,True,0,1.0,0.0,0.0,2024-11-17 21:50:16,2024-11-17 21:50:16,2024-11-17 21:50:16,40-50


In [18]:
sheets_with_churn = ["N10", "B30 Pro", "Data Before Feb 13"]

# Header spellings under which the churn label may appear, in priority order:
# the first one found in a sheet becomes that sheet's 'Churn' column.
churn_cols = ['Chrn Flag', 'Churn', 'Churn Flag']

# Passing a list of sheet names makes pandas open the workbook once and return
# a {sheet name: DataFrame} dict, instead of re-opening the file per sheet.
dfs = pd.read_excel(uw_path, sheet_name=sheets_with_churn)

for name, df in dfs.items():
    # Find the churn column in this sheet
    source_col = next((col for col in churn_cols if col in df.columns), None)
    if source_col is None:
        # Surface the problem here rather than failing later on a missing 'Churn'.
        print(f"Warning: no churn column found in sheet {name!r}; tried {churn_cols}")
    else:
        df['Churn'] = df[source_col]  # unify name

    # Drop the remaining churn-like columns so only 'Churn' is left
    leftovers = [col for col in churn_cols if col in df.columns and col != 'Churn']
    if leftovers:
        df.drop(columns=leftovers, inplace=True)

In [24]:
# Report the row count of each churn sheet from the frames already loaded in
# `dfs`, rather than parsing the workbook again once per sheet.
for name in sheets_with_churn:
    df = dfs[name]
    print(f"{name}: {df.shape[0]} rows")


N10: 970 rows
B30 Pro: 3142 rows
Data Before Feb 13: 4995 rows


In [36]:
# Header names of the two model-specific sheets, as sets
cols_n10, cols_b30 = (set(dfs[sheet].columns) for sheet in ('N10', 'B30 Pro'))

# Headers shared by the two sheets
common_cols = cols_n10 & cols_b30
print("Columns present in BOTH N10 and B30 Pro:")
print(sorted(common_cols))

# Headers that appear in exactly one of the two sheets
not_in_both = cols_n10 ^ cols_b30
print("\nColumns NOT present in both (only in one sheet):")
print(sorted(not_in_both))


Columns present in BOTH N10 and B30 Pro:
['Analysis and Verification', 'Channel', 'Churn', 'Defect / Damage type', 'Final Status', 'Responsible Party', 'Return date', 'Spare Parts Usage', 'Type', 'Warranty', 'active_date', 'interval_date', 'last boot - active', 'last boot - interval', 'last_boot_date', 'model', 'register_email', 'return - activat', 'return - activate', 'return_minus_derived', 'sim_info']

Columns NOT present in both (only in one sheet):
['activate']


In [34]:
# Make sure every sheet has a 'return - activate' column (days between
# activation and return) without destroying a value the sheet already carries.
# The comparison cell that follows labels 'return - activate' as the
# "existing" value and checks it against a freshly derived one; overwriting the
# column here would make that check compare the derivation with itself.
for name, df in dfs.items():
    already_present = 'return - activate' in df.columns

    if 'Return date' in df.columns and 'active_date' in df.columns:
        # Peek at the values before coercion
        print(df['Return date'].head(10))

        # Ensure datetime type; unparseable entries become NaT
        df['Return date'] = pd.to_datetime(df['Return date'], errors='coerce')
        df['active_date'] = pd.to_datetime(df['active_date'], errors='coerce')

        if not already_present:
            # Compute return - activate in days (3600*24 seconds per day)
            df['return - activate'] = (
                (df['Return date'] - df['active_date']).dt.total_seconds() / (3600 * 24)
            )
    elif not already_present:
        # Either date column is missing: add a float NaN placeholder so the
        # column stays numeric instead of becoming an object column of None.
        df['return - activate'] = np.nan

0   2024-11-26 08:10:00
1   2024-12-09 15:50:00
2   2024-12-09 15:50:00
3   2024-12-09 08:00:00
4   2024-12-09 08:00:00
5   2024-12-02 07:50:00
6   2024-12-02 14:40:00
7   2024-12-09 15:50:00
8   2024-12-12 07:40:00
9   2024-12-12 07:40:00
Name: Return date, dtype: datetime64[ns]
0                   NaT
1                   NaT
2                   NaT
3                   NaT
4                   NaT
5                   NaT
6                   NaT
7   2024-09-30 07:50:00
8   2024-09-30 07:50:00
9                   NaT
Name: Return date, dtype: datetime64[ns]


In [35]:
# For each sheet, derive the activation-to-return gap in days and show it next
# to the sheet's own 'return - activate' column when that column is available.
for name, df in dfs.items():
    print(f"\nSheet: {name}")

    # Both date columns are required; skip the sheet otherwise
    if not {'Return date', 'active_date'}.issubset(df.columns):
        print("Missing Return date or active_date")
        continue

    # Ensure datetime type (unparseable entries become NaT)
    for date_col in ('Return date', 'active_date'):
        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

    # Derived gap, in days
    elapsed = df['Return date'] - df['active_date']
    df['return_minus_derived'] = elapsed.dt.total_seconds() / (3600*24)

    # Nothing to compare against if the sheet has no 'return - activate'
    if 'return - activate' not in df.columns:
        print("No 'return - activate' column in this sheet.")
        continue

    comparison = pd.DataFrame({
        'existing': df['return - activate'],
        'derived': df['return_minus_derived'],
        'difference': df['return - activate'] - df['return_minus_derived']
    })
    # First 10 rows are enough for a side-by-side look
    display(comparison.head(10))



Sheet: N10


Unnamed: 0,existing,derived,difference
0,29.285417,29.285417,0.0
1,38.722755,38.722755,0.0
2,37.998229,37.998229,0.0
3,37.629757,37.629757,0.0
4,35.265868,35.265868,0.0
5,27.312373,27.312373,0.0
6,27.588738,27.588738,0.0
7,31.641887,31.641887,0.0
8,31.20294,31.20294,0.0
9,30.089688,30.089688,0.0



Sheet: B30 Pro


Unnamed: 0,existing,derived,difference
0,,,
1,,,
2,,,
3,,,
4,,,
5,,,
6,,,
7,40.064826,40.064826,0.0
8,17.185556,17.185556,0.0
9,,,



Sheet: Data Before Feb 13
Missing Return date or active_date


In [None]:
# Union of the header names of every loaded sheet (used for comparison only)
all_columns = sorted({col for df in dfs.values() for col in df.columns})
print(f"All unique columns across sheets: {len(all_columns)}")

# One boolean column per sheet: True where the sheet has that header
presence = pd.DataFrame(
    {name: pd.Index(all_columns).isin(df.columns) for name, df in dfs.items()},
    index=all_columns,
)

# Row-wise summaries, computed over the per-sheet flags only
sheet_flags = presence[sheets_with_churn]
presence["present_in_all"] = sheet_flags.all(axis=1)
presence["present_in_any"] = sheet_flags.any(axis=1)

in_all = presence["present_in_all"]
in_any = presence["present_in_any"]

print("Columns present in all sheets:")
print(presence.loc[in_all].index.tolist())

print("\nColumns present in only some sheets:")
print(presence.loc[~in_all & in_any].index.tolist())


All unique columns across sheets: 27
Columns present in all sheets:
['Churn', 'Defect / Damage type', 'Final Status', 'Responsible Party', 'Type', 'Warranty', 'active_date', 'interval_date', 'last_boot_date', 'register_email', 'sim_info']

Columns present in only some sheets:
['Analysis and Verification', 'Channel', 'Device number', 'Month', 'Office Date', 'Office Time In', 'Product/Model #', 'Return date', 'Source', 'Spare Parts Usage', 'activate', 'last boot - active', 'last boot - interval', 'model', 'promotion_email', 'return - activate']


In [None]:
# Missing value stats per sheet
missing_stats = {}
for name, df in dfs.items():
    missing_stats[name] = df.isna().mean().sort_values(ascending=False)

missing_df = pd.concat(missing_stats, axis=1)
print("\nTop missing values per sheet:")
display(missing_df.head(20))


Top missing values per sheet:


Unnamed: 0,N10,B30 Pro,Data Before Feb 13
Spare Parts Usage,0.983505,0.0,
Channel,0.983505,0.0,
Return date,0.983505,0.0,
Final Status,0.983505,0.0,0.947347
Churn,0.983505,0.0,0.945145
Warranty,0.983505,0.0,0.947147
Responsible Party,0.983505,0.0,0.947147
Defect / Damage type,0.983505,0.0,0.947147
Analysis and Verification,0.983505,0.0,
Type,0.983505,0.0,0.946146


In [None]:
# Column types per sheet
dtype_map = pd.DataFrame(index=all_columns)
for name, df in dfs.items():
    dtype_map[name] = df.dtypes.astype(str)

print("\nColumn types per sheet:")
display(dtype_map)


Column types per sheet:


Unnamed: 0,N10,B30 Pro,Data Before Feb 13
Analysis and Verification,object,object,
Channel,object,object,
Churn,float64,int64,float64
Defect / Damage type,object,object,object
Device number,,,int64
Final Status,object,object,object
Month,,,object
Office Date,,,datetime64[ns]
Office Time In,,,object
Product/Model #,,,object


In [None]:
# Cardinality of categorical columns
cardinality = {}
for name, df in dfs.items():
    cat_cols = df.select_dtypes(include='object').columns
    card = df[cat_cols].nunique().sort_values(ascending=False)
    cardinality[name] = card

print("\nTop categorical column cardinalities per sheet:")
for name, s in cardinality.items():
    print(f"\n{name}:")
    display(s.head(10))

All unique columns across sheets: 27
Columns present in all sheets:
['Churn', 'Defect / Damage type', 'Final Status', 'Responsible Party', 'Type', 'Warranty', 'active_date', 'interval_date', 'last_boot_date', 'register_email', 'sim_info']

Columns present in only some sheets:
['Analysis and Verification', 'Channel', 'Device number', 'Month', 'Office Date', 'Office Time In', 'Product/Model #', 'Return date', 'Source', 'Spare Parts Usage', 'activate', 'last boot - active', 'last boot - interval', 'model', 'promotion_email', 'return - activate']

Top missing values per sheet:


Unnamed: 0,N10,B30 Pro,Data Before Feb 13
Spare Parts Usage,0.983505,0.0,
Channel,0.983505,0.0,
Return date,0.983505,0.0,
Final Status,0.983505,0.0,0.947347
Churn,0.983505,0.0,0.945145
Warranty,0.983505,0.0,0.947147
Responsible Party,0.983505,0.0,0.947147
Defect / Damage type,0.983505,0.0,0.947147
Analysis and Verification,0.983505,0.0,
Type,0.983505,0.0,0.946146



Column types per sheet:


Unnamed: 0,N10,B30 Pro,Data Before Feb 13
Analysis and Verification,object,object,
Channel,object,object,
Churn,float64,int64,float64
Defect / Damage type,object,object,object
Device number,,,int64
Final Status,object,object,object
Month,,,object
Office Date,,,datetime64[ns]
Office Time In,,,object
Product/Model #,,,object



Top categorical column cardinalities per sheet:

N10:


active_date                  970
last_boot_date               969
interval_date                966
sim_info                     131
Final Status                   2
model                          1
Type                           1
Channel                        1
Warranty                       1
Analysis and Verification      1
dtype: int64


B30 Pro:


active_date                  3141
last_boot_date               3136
interval_date                3102
sim_info                      213
Return date                    19
Analysis and Verification       7
Final Status                    6
Defect / Damage type            6
Responsible Party               4
Channel                         3
dtype: int64


Data Before Feb 13:


active_date             4882
last_boot_date          4874
interval_date           4842
sim_info                 467
Office Time In            39
Defect / Damage type      13
Month                     11
Final Status              10
Responsible Party          5
Product/Model #            3
dtype: int64

In [22]:
# Show 'activate' next to 'active_date' for every sheet that has both,
# to see how the two columns relate.
pair = ['activate', 'active_date']
for name, df in dfs.items():
    if not set(pair).issubset(df.columns):
        continue
    print(f"\nSheet: {name}")
    # First 10 rows as a small sample
    display(df[pair].head(10))



Sheet: N10


Unnamed: 0,activate,active_date
0,20241028,2024-10-28 01:19:00
1,20241031,2024-10-31 22:29:14
2,20241101,2024-11-01 15:52:33
3,20241101,2024-11-01 16:53:09
4,20241104,2024-11-04 01:37:09
5,20241105,2024-11-05 00:20:11
6,20241105,2024-11-05 00:32:13
7,20241108,2024-11-08 00:25:41
8,20241111,2024-11-11 02:47:46
9,20241112,2024-11-12 05:30:51
