In [11]:
import pandas as pd

In [12]:

# Read the four raw inputs from the working directory: three CSV files plus
# one JSON file that holds the fraud labels.
transactions = pd.read_csv("transactions_2019data.csv")
fraud_labels = pd.read_json("fraud_labels.json")
card_info = pd.read_csv("cards_data.csv")
users = pd.read_csv("users_data.csv")

# Report (rows, columns) for every frame as a quick sanity check on the load.
for shape_label, frame in (
    ("Transactions shape:", transactions),
    ("Fraud labels shape:", fraud_labels),
    ("Card info shape:", card_info),
    ("Users shape:", users),
):
    print(shape_label, frame.shape)


Transactions shape: (1159966, 12)
Fraud labels shape: (777339, 2)
Card info shape: (6146, 13)
Users shape: (2000, 14)


In [13]:
# Remove the `card_number` and `cvv` columns before card_info is joined onto
# the transactions (presumably because they are sensitive -- confirm).
# errors="ignore" turns this into a no-op for any column already absent, so
# the cell can be re-run safely.
card_info = card_info.drop(
    labels=["card_number", "cvv"],
    axis="columns",
    errors="ignore",
)


In [14]:
# Discard the merchant location fields and the `errors` column before any
# merge happens. errors="ignore" skips columns that are already gone, which
# keeps the cell safe to re-run.
transactions = transactions.drop(
    labels=["merchant_city", "merchant_state", "zip", "errors"],
    axis="columns",
    errors="ignore",
)


In [15]:
# Trim the user table down before it is joined in: address, birth date parts,
# retirement age and coordinates are removed. Columns that are already absent
# are skipped silently (errors="ignore").
users = users.drop(
    labels=[
        "address",
        "birth_year",
        "birth_month",
        "retirement_age",
        "latitude",
        "longitude",
    ],
    axis="columns",
    errors="ignore",
)


In [16]:
# Attach the fraud label to every transaction. The left join keeps all
# transactions; rows without a matching label get NaN in the label columns.
# Both key columns (`id` and `transaction_id`) survive the merge because the
# join uses differently named keys on each side.
df_step1 = transactions.merge(
    fraud_labels,
    left_on="id",
    right_on="transaction_id",
    how="left"
)

# A left join can never lose left rows, but it silently duplicates them when
# the right-hand key repeats. Fail loudly here rather than carrying inflated
# data into the later merges.
assert len(df_step1) == len(transactions), (
    "fraud_labels has repeated transaction_id values: "
    f"{len(transactions)} transactions became {len(df_step1)} rows"
)

print("After merging fraud labels:", df_step1.shape)



After merging fraud labels: (1159966, 10)


In [17]:
# Join card attributes onto each transaction via the card key. Columns from
# card_info whose names clash with existing ones receive the "_card" suffix
# (e.g. the card table's `id` becomes `id_card`); left-hand names are kept.
df_step2 = df_step1.merge(
    card_info,
    left_on="card_id",
    right_on="id",
    how="left",
    suffixes=("", "_card")
)

# A left join duplicates left rows when the right-hand key repeats, so the
# row count must be unchanged if every card id appears at most once.
assert len(df_step2) == len(df_step1), (
    "card_info has repeated id values: "
    f"{len(df_step1)} rows became {len(df_step2)} after the merge"
)

print("After merging card info:", df_step2.shape)



After merging card info: (1159966, 21)


In [18]:
# Join user attributes onto each transaction via the client key. Clashing
# column names coming from `users` receive the "_user" suffix (the user
# table's `id` becomes `id_user`); left-hand names are kept as they are.
df_master = df_step2.merge(
    users,
    left_on="client_id",
    right_on="id",
    how="left",
    suffixes=("", "_user")
)

# A left join duplicates left rows when the right-hand key repeats, so the
# row count must be unchanged if every user id appears at most once.
assert len(df_master) == len(df_step2), (
    "users has repeated id values: "
    f"{len(df_step2)} rows became {len(df_master)} after the merge"
)

print("Final merged dataset shape (df_master):", df_master.shape)


Final merged dataset shape (df_master): (1159966, 29)


In [19]:
# Preview the first five rows of the merged frame (shown as the cell output).
df_master.head(n=5)


Unnamed: 0,id,date,client_id,card_id,amount,use_chip,merchant_id,mcc,transaction_id,fraud,...,year_pin_last_changed,card_on_dark_web,id_user,current_age,gender,per_capita_income,yearly_income,total_debt,credit_score,num_credit_cards
0,22326462,2019-01-01 00:02:00,496,3186,$119.35,Chip Transaction,30286,4814,22326462.0,No,...,2016,No,496,47,Male,$17810,$36319,$44737,730,4
1,22326465,2019-01-01 00:05:00,1129,2677,$100.00,Chip Transaction,27092,4829,22326465.0,No,...,2011,No,1129,49,Male,$16894,$34449,$36540,686,3
2,22326466,2019-01-01 00:06:00,114,5283,$51.71,Chip Transaction,61195,5541,22326466.0,No,...,2013,No,114,47,Female,$16892,$34441,$907,725,4
3,22326467,2019-01-01 00:06:00,641,2774,$105.30,Swipe Transaction,75781,5411,22326467.0,No,...,2017,No,641,62,Male,$18420,$37556,$0,735,6
4,22326468,2019-01-01 00:10:00,114,5283,$82.00,Chip Transaction,61195,5541,22326468.0,No,...,2013,No,114,47,Female,$16892,$34441,$907,725,4


PROMPT (Issue 1):
After merging my datasets, I noticed that several duplicate ID columns appear in df_master, created during the left merges. These include:
transaction_id (duplicate of id)
id_card (duplicate of card_id)
id_user (duplicate of client_id)
Please generate Pandas code that only drops these duplicate ID columns without affecting the rest of the dataset.
Code Fix for Issue 1:

In [None]:
# Remove the redundant key columns that the left merges left behind:
# `transaction_id` mirrors `id`, `id_card` mirrors `card_id`, and `id_user`
# mirrors `client_id`. errors="ignore" makes the cell safe to re-run.
# NOTE(review): if cards_data.csv also carries a `client_id` column, the card
# merge will have produced a `client_id_card` duplicate as well -- confirm and
# add it here if so.
df_master = df_master.drop(
    labels=["transaction_id", "id_card", "id_user"],
    axis="columns",
    errors="ignore",
)


PROMPT (Issue 2):
My merged dataframe still contains the merchant location columns, plus the `errors` column, that I previously decided to drop because they are high-cardinality and not useful for modeling. These are:
merchant_city
merchant_state
zip
errors
Please generate Pandas code that drops exactly these columns and nothing else.

In [None]:
# Drop the merchant location fields and the `errors` column from the merged
# frame. `transactions` already had these four columns removed before the
# merges, so this only has an effect if the card or user tables contributed
# columns with the same names; otherwise errors="ignore" makes it a no-op.
df_master = df_master.drop(
    labels=["merchant_city", "merchant_state", "zip", "errors"],
    axis="columns",
    errors="ignore",
)


In [None]:
# Final validation of cleaned dataframe.
# The cleanup cells use errors="ignore", so a skipped or out-of-order cell
# would go unnoticed; assert explicitly that none of the columns meant to be
# removed are still present before inspecting the frame.
columns_expected_gone = {
    "transaction_id", "id_card", "id_user",
    "merchant_city", "merchant_state", "zip", "errors",
}
leftover_columns = columns_expected_gone.intersection(df_master.columns)
assert not leftover_columns, (
    f"Columns that should have been dropped are still present: {sorted(leftover_columns)}"
)

# Print dtypes / non-null counts, then show the first rows as the cell output.
df_master.info()
df_master.head()
