In [19]:
import pandas as pd

merchants_df = pd.read_excel('dstakehome_merchants.xlsx')
payments_df = pd.read_excel('dstakehome_payments.xlsx')

In [82]:
print(payments_df.dtypes)
print("\nUnique values in 'merchant' column:")
print(payments_df['merchant'].nunique())
print("\nSample values from 'merchant' column:")
print(payments_df['merchant'].sample(5))
print("\nSample values from 'date' column:")
print(payments_df['date'].sample(5))

date                   object
merchant               object
subscription_volume     int64
checkout_volume         int64
payment_link_volume     int64
total_volume            int64
dtype: object

Unique values in 'merchant' column:
23620

Sample values from 'merchant' column:
92978      8647e6bc
1369726    e8ab7038
892127     1acbeb47
899156     ab890044
853799     a5b7a1e9
Name: merchant, dtype: object

Sample values from 'date' column:
1218071    2042-03-25T00:00:00Z
1260636    2042-04-05T00:00:00Z
424892     2041-08-30T00:00:00Z
858783     2041-12-23T00:00:00Z
517664     2041-09-24T00:00:00Z
Name: date, dtype: object


In [83]:
payments_df['date'] = pd.to_datetime(payments_df['date'], errors='coerce')

In [86]:
print(payments_df.dtypes)
print("\nUnique values in 'merchant' column:")
print(payments_df['merchant'].nunique())
print("\nSample values from 'merchant' column:")
print(payments_df['merchant'].sample(5))
print("\nSample values from 'date' column:")
print(payments_df['date'].sample(5))

date                   datetime64[ns, UTC]
merchant                            object
subscription_volume                  int64
checkout_volume                      int64
payment_link_volume                  int64
total_volume                         int64
dtype: object

Unique values in 'merchant' column:
23620

Sample values from 'merchant' column:
854830     45950849
756450     c8a18889
403652     e2b5e069
671716     62976f72
1119959    40d6469f
Name: merchant, dtype: object

Sample values from 'date' column:
1285159   2042-04-11 00:00:00+00:00
740066    2041-11-23 00:00:00+00:00
1461956   2042-05-25 00:00:00+00:00
475936    2041-09-13 00:00:00+00:00
142014    2041-06-10 00:00:00+00:00
Name: date, dtype: datetime64[ns, UTC]


In [87]:
# Check current dtype
print("Current dtype of 'merchant':", payments_df['merchant'].dtype)

# Convert to string dtype
payments_df['merchant'] = payments_df['merchant'].astype('string')

# Check new dtype
print("New dtype of 'merchant':", payments_df['merchant'].dtype)

# Check for any non-string values that might have been coerced
non_string = payments_df[payments_df['merchant'].isna()]
if len(non_string) > 0:
    print(f"Found {len(non_string)} non-string values in 'merchant' column:")
    print(non_string)
else:
    print("All values in 'merchant' column are valid strings.")

# Display some sample values
print("\nSample values from 'merchant' column:")
print(payments_df['merchant'].sample(5))

Current dtype of 'merchant': object
New dtype of 'merchant': string
All values in 'merchant' column are valid strings.

Sample values from 'merchant' column:
475786     a8f43db2
485635     93eaa183
1011026    b75b2401
212400     b44fbca4
803717     b00345d7
Name: merchant, dtype: string


In [88]:
# Count total rows
total_rows = len(payments_df)

# Count unique combinations of merchant and date
unique_combinations = payments_df.groupby(['merchant', 'date']).size()
total_unique_combinations = len(unique_combinations)

In [89]:
print(f"Total rows: {total_rows}")
print(f"Unique merchant-date combinations: {total_unique_combinations}")

Total rows: 1577887
Unique merchant-date combinations: 1577865


In [93]:
try:
    # Count total rows
    total_rows = len(payments_df)

    # Count unique combinations of merchant and date
    unique_combinations = payments_df.groupby(['merchant', 'date']).size()
    total_unique_combinations = len(unique_combinations)

    print(f"Total rows: {total_rows}")
    print(f"Unique merchant-date combinations: {total_unique_combinations}")

    if total_rows == total_unique_combinations:
        print("The payments table has one row per merchant per date.")
    else:
        print("There are duplicate merchant-date combinations.")
        
        # Find duplicates
        duplicates = payments_df[payments_df.duplicated(['merchant', 'date'], keep=False)]
        print("\nFirst few duplicate entries:")
        print(duplicates.head())

except Exception as e:
    print(f"An error occurred: {str(e)}")
    print("\nLet's try to identify the problematic rows:")
    
    for index, row in payments_df.iterrows():
        try:
            # Try to create a tuple of (merchant, date) for each row
            _ = (row['merchant'], row['date'])
        except:
            print(f"Problematic row at index {index}:")
            print(row)
            break

Total rows: 1577887
Unique merchant-date combinations: 1577865
There are duplicate merchant-date combinations.

First few duplicate entries:
                            date merchant  subscription_volume  \
716215 2041-11-16 00:00:00+00:00        0                    0   
716613 2041-11-16 00:00:00+00:00        0                    0   
798331 2041-12-07 00:00:00+00:00        0                    0   
798746 2041-12-07 00:00:00+00:00        0                    0   
804396 2041-12-09 00:00:00+00:00        0                    0   

        checkout_volume  payment_link_volume  total_volume  
716215                0                    0          8450  
716613              964                    0           964  
798331                0                    0         16550  
798746             1426                    0          1426  
804396                0                    0         27144  


In [94]:
print("\nNull values in 'merchant' column:")
print(payments_df[payments_df['merchant'].isnull()])

print("\nNull values in 'date' column:")
print(payments_df[payments_df['date'].isnull()])

print("\nUnique values in 'merchant' column:")
print(payments_df['merchant'].value_counts().head())

print("\nUnique values in 'date' column:")
print(payments_df['date'].value_counts().head())


Null values in 'merchant' column:
Empty DataFrame
Columns: [date, merchant, subscription_volume, checkout_volume, payment_link_volume, total_volume]
Index: []

Null values in 'date' column:
Empty DataFrame
Columns: [date, merchant, subscription_volume, checkout_volume, payment_link_volume, total_volume]
Index: []

Unique values in 'merchant' column:
merchant
dd62d56a    418
c3942300    418
e2df5adf    418
64dfffd0    418
0dcee7ff    418
Name: count, dtype: Int64

Unique values in 'date' column:
date
2042-06-01 00:00:00+00:00    4739
2042-03-01 00:00:00+00:00    4671
2041-12-01 00:00:00+00:00    4624
2042-04-01 00:00:00+00:00    4620
2042-02-28 00:00:00+00:00    4599
Name: count, dtype: int64


In [95]:
zero_merchant_count = (payments_df['merchant'] == '0').sum()
print(f"Number of rows with merchant ID '0': {zero_merchant_count}")

Number of rows with merchant ID '0': 150


In [97]:
print("Sample of rows with merchant ID '0':")
print(payments_df[payments_df['merchant'] == '0'].sample(15))

Sample of rows with merchant ID '0':
                             date merchant  subscription_volume  \
1410435 2042-05-12 00:00:00+00:00        0                    0   
701936  2041-11-12 00:00:00+00:00        0                    0   
1500051 2042-06-03 00:00:00+00:00        0                    0   
1459627 2042-05-24 00:00:00+00:00        0                    0   
1017929 2042-02-03 00:00:00+00:00        0                    0   
1391408 2042-05-07 00:00:00+00:00        0                    0   
1068741 2042-02-16 00:00:00+00:00        0                    0   
1107765 2042-02-26 00:00:00+00:00        0                    0   
1019154 2042-02-04 00:00:00+00:00        0                    0   
1192997 2042-03-19 00:00:00+00:00        0                    0   
1565653 2042-06-19 00:00:00+00:00        0                    0   
779108  2041-12-02 00:00:00+00:00        0                    0   
1302660 2042-04-15 00:00:00+00:00        0                 1003   
671486  2041-11-04 00:00:

In [98]:
# Count duplicates
duplicates = payments_df[payments_df.duplicated(['merchant', 'date'], keep=False)]
zero_merchant_duplicates = duplicates[duplicates['merchant'] == '0']

print(f"Total duplicate rows: {len(duplicates)}")
print(f"Duplicate rows with merchant ID '0': {len(zero_merchant_duplicates)}")

Total duplicate rows: 42
Duplicate rows with merchant ID '0': 42


In [100]:
total_rows = len(payments_df)
zero_merchant_rows = payments_df[payments_df['merchant'] == '0']
zero_merchant_count = len(zero_merchant_rows)

print(f"Total rows: {total_rows}")
print(f"Rows with merchant ID '0': {zero_merchant_count}")
print(f"Percentage of '0' merchant rows: {zero_merchant_count / total_rows * 100:.2f}%")

# Calculate total volume and '0' merchant volume
total_volume = payments_df['total_volume'].sum()
zero_merchant_volume = zero_merchant_rows['total_volume'].sum()

print(f"\nTotal volume: ${total_volume:,.2f}")
print(f"'0' merchant volume: ${zero_merchant_volume:,.2f}")
print(f"Percentage of volume from '0' merchants: {zero_merchant_volume / total_volume * 100:.2f}%")

# Check subscription volume specifically
total_sub_volume = payments_df['subscription_volume'].sum()
zero_merchant_sub_volume = zero_merchant_rows['subscription_volume'].sum()

print(f"\nTotal subscription volume: ${total_sub_volume:,.2f}")
print(f"'0' merchant subscription volume: ${zero_merchant_sub_volume:,.2f}")
print(f"Percentage of subscription volume from '0' merchants: {zero_merchant_sub_volume / total_sub_volume * 100:.2f}%")

Total rows: 1577887
Rows with merchant ID '0': 150
Percentage of '0' merchant rows: 0.01%

Total volume: $565,343,271,651.00
'0' merchant volume: $2,810,840.00
Percentage of volume from '0' merchants: 0.00%

Total subscription volume: $61,928,378,569.00
'0' merchant subscription volume: $11,832.00
Percentage of subscription volume from '0' merchants: 0.00%


In [103]:
zero_merchant_time_dist = zero_merchant_rows.groupby(zero_merchant_rows['date'].dt.to_period('M'))['total_volume'].sum()

print("\nDistribution of '0' merchant volume over time:")
print(zero_merchant_time_dist)


Distribution of '0' merchant volume over time:
date
2041-06      16874
2041-07      47997
2041-08      14763
2041-09      31989
2041-10      54120
2041-11      57095
2041-12     535114
2042-01     395677
2042-02      90453
2042-03      48666
2042-04    1308371
2042-05     125372
2042-06      84349
Freq: M, Name: total_volume, dtype: int64


  zero_merchant_time_dist = zero_merchant_rows.groupby(zero_merchant_rows['date'].dt.to_period('M'))['total_volume'].sum()


In [104]:
print("\nSummary statistics for '0' merchant rows:")
print(zero_merchant_rows.describe())

print("\nUnique values in other columns for '0' merchant rows:")
for column in zero_merchant_rows.columns:
    if column not in ['date', 'merchant', 'total_volume', 'subscription_volume']:
        print(f"{column}: {zero_merchant_rows[column].nunique()}")


Summary statistics for '0' merchant rows:
       subscription_volume  checkout_volume  payment_link_volume  \
count           150.000000       150.000000           150.000000   
mean             78.880000      1221.373333          1755.600000   
std             268.437635      8316.125080         20415.595392   
min               0.000000         0.000000             0.000000   
25%               0.000000         0.000000             0.000000   
50%               0.000000         0.000000             0.000000   
75%               0.000000         0.000000             0.000000   
max            1007.000000    100000.000000        250000.000000   

        total_volume  
count     150.000000  
mean    18738.933333  
std     71688.339181  
min        45.000000  
25%       965.500000  
50%      3418.500000  
75%      9453.750000  
max    580474.000000  

Unique values in other columns for '0' merchant rows:
checkout_volume: 34
payment_link_volume: 5


In [105]:
total_rows = len(payments_df)
zero_merchant_rows = payments_df[payments_df['merchant'] == '0']
zero_merchant_count = len(zero_merchant_rows)

total_volume = payments_df['total_volume'].sum()
zero_merchant_volume = zero_merchant_rows['total_volume'].sum()

total_sub_volume = payments_df['subscription_volume'].sum()
zero_merchant_sub_volume = zero_merchant_rows['subscription_volume'].sum()

print("Justification for removing '0' merchant rows:")
print(f"- Percentage of '0' merchant rows: {zero_merchant_count / total_rows * 100:.2f}%")
print(f"- Percentage of total volume from '0' merchants: {zero_merchant_volume / total_volume * 100:.2f}%")
print(f"- Percentage of subscription volume from '0' merchants: {zero_merchant_sub_volume / total_sub_volume * 100:.2f}%")

Justification for removing '0' merchant rows:
- Percentage of '0' merchant rows: 0.01%
- Percentage of total volume from '0' merchants: 0.00%
- Percentage of subscription volume from '0' merchants: 0.00%


In [106]:
# Remove the '0' merchant rows
payments_df_cleaned = payments_df[payments_df['merchant'] != '0']

# Verify the removal
print(f"\nOriginal number of rows: {len(payments_df)}")
print(f"Number of rows after cleaning: {len(payments_df_cleaned)}")
print(f"Number of rows removed: {len(payments_df) - len(payments_df_cleaned)}")

# Check if there are any remaining '0' merchant rows
remaining_zero = payments_df_cleaned[payments_df_cleaned['merchant'] == '0']
print(f"Remaining '0' merchant rows: {len(remaining_zero)}")


Original number of rows: 1577887
Number of rows after cleaning: 1577737
Number of rows removed: 150
Remaining '0' merchant rows: 0


In [107]:
duplicates = payments_df_cleaned[payments_df_cleaned.duplicated(['merchant', 'date'], keep=False)]
print(f"\nNumber of duplicate rows after cleaning: {len(duplicates)}")

if len(duplicates) > 0:
    print("Sample of remaining duplicates:")
    print(duplicates.head())
else:
    print("No duplicates remain. Each row now represents a unique merchant-date combination.")


Number of duplicate rows after cleaning: 0
No duplicates remain. Each row now represents a unique merchant-date combination.


In [108]:
print("\nImpact on key metrics:")
print(f"Total volume before cleaning: ${payments_df['total_volume'].sum():,.2f}")
print(f"Total volume after cleaning: ${payments_df_cleaned['total_volume'].sum():,.2f}")
print(f"Subscription volume before cleaning: ${payments_df['subscription_volume'].sum():,.2f}")
print(f"Subscription volume after cleaning: ${payments_df_cleaned['subscription_volume'].sum():,.2f}")


Impact on key metrics:
Total volume before cleaning: $565,343,271,651.00
Total volume after cleaning: $565,340,460,811.00
Subscription volume before cleaning: $61,928,378,569.00
Subscription volume after cleaning: $61,928,366,737.00


In [109]:
payment_merchants = set(payments_df_cleaned['merchant'].unique())
merchant_table_merchants = set(merchants_df['merchant'].unique())

print(f"Unique merchants in payments table: {len(payment_merchants)}")
print(f"Unique merchants in merchants table: {len(merchant_table_merchants)}")

Unique merchants in payments table: 23619
Unique merchants in merchants table: 23620


In [110]:
merchants_only_in_payments = payment_merchants - merchant_table_merchants
merchants_only_in_merchant_table = merchant_table_merchants - payment_merchants

print(f"\nMerchants in payments table but not in merchants table: {len(merchants_only_in_payments)}")
print(f"Merchants in merchants table but not in payments table: {len(merchants_only_in_merchant_table)}")


Merchants in payments table but not in merchants table: 689
Merchants in merchants table but not in payments table: 690


In [111]:
print("\nSample merchants in payments but not in merchants table:")
print(list(merchants_only_in_payments)[:5] if merchants_only_in_payments else "None")

print("\nSample merchants in merchants table but not in payments:")
print(list(merchants_only_in_merchant_table)[:5] if merchants_only_in_merchant_table else "None")


Sample merchants in payments but not in merchants table:
['13649815', '429700000000000033578379137000320443244594870561828764749637851393581339070286201233888783118955277610626772368702317106840268086932214340132827276337478039606687546398669005752219082326621908104880554298720578434112369694081024', '9137276', '92736081', '4538900000']

Sample merchants in merchants table but not in payments:
[0, 6754900000000000, 138239999999999997103645080345472474241398500574124767330959986316629469983527064968800239616, 33115000000000000317836737631730986585282755649358266368, 63467523]


In [112]:
if merchants_only_in_payments:
    missing_merchants_volume = payments_df_cleaned[payments_df_cleaned['merchant'].isin(merchants_only_in_payments)]
    total_volume_missing = missing_merchants_volume['total_volume'].sum()
    total_volume_all = payments_df_cleaned['total_volume'].sum()
    
    print(f"\nTotal volume from merchants missing in merchants table: ${total_volume_missing:,.2f}")
    print(f"Percentage of total volume: {(total_volume_missing / total_volume_all) * 100:.2f}%")


Total volume from merchants missing in merchants table: $5,040,473,294.00
Percentage of total volume: 0.89%


In [113]:
if merchants_only_in_merchant_table:
    merchants_without_payments = merchants_df[merchants_df['merchant'].isin(merchants_only_in_merchant_table)]
    print("\nCharacteristics of merchants without payments:")
    print(merchants_without_payments['industry'].value_counts())
    print("\nFirst charge date range:")
    print(merchants_without_payments['first_charge_date'].describe())


Characteristics of merchants without payments:
industry
Business services                                           100
Personal services                                            58
Software                                                     57
Merchandise                                                  57
Others                                                       49
Clothing & accessory                                         47
Food & drink                                                 45
Digital goods                                                44
Education                                                    40
Religion, politics & other memberships                       28
Travel & lodging                                             24
Grocery & food stores                                        17
Medical services, drugs, testing labs & equipment medium     15
Art & photography                                            15
Charity                                        