# Exploratory Data Analysis

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas_profiling import ProfileReport

%matplotlib inline
plt.rcParams["figure.figsize"] = (14, 8)
sns.set_theme(context="notebook", style="whitegrid")

## Config

In [2]:
# --- Input locations ---------------------------------------------------
# Directory that holds the competition CSV files.
DATA_DIR = Path("../input/alvin-smart-money-classification-challenge")

# Individual data files inside DATA_DIR.
TRAIN_DATA = DATA_DIR.joinpath("train.csv")
TEST_DATA = DATA_DIR.joinpath("test.csv")
EXTRA_DATA = DATA_DIR.joinpath("extra_data.csv")

# --- Column names --------------------------------------------------------
# Column passed as `index_col` when the CSV files are read.
INDEX_COL = "Transaction_ID"
# Name of the target column.
TARGET_COL = "MERCHANT_CATEGORIZED_AS"

# --- Reproducibility -----------------------------------------------------
# Fixed random state value for the notebook.
RANDOM_STATE = 98765

## Loading the data

In [3]:
# Timestamp columns that read_csv should parse into datetimes.
dt_cols = ["MERCHANT_CATEGORIZED_AT", "PURCHASED_AT"]

# Training transactions, indexed by INDEX_COL.
train_df = pd.read_csv(TRAIN_DATA, parse_dates=dt_cols, index_col=INDEX_COL)

# Summarise column dtypes and non-null counts.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 373 entries, ID_04mk78fa to ID_zztijwt3
Data columns (total 11 columns):
 #   Column                                 Non-Null Count  Dtype              
---  ------                                 --------------  -----              
 0   MERCHANT_CATEGORIZED_AT                373 non-null    datetime64[ns, UTC]
 1   MERCHANT_NAME                          373 non-null    object             
 2   MERCHANT_CATEGORIZED_AS                373 non-null    object             
 3   PURCHASE_VALUE                         373 non-null    int64              
 4   PURCHASED_AT                           373 non-null    datetime64[ns, UTC]
 5   IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY  373 non-null    bool               
 6   USER_AGE                               61 non-null     float64            
 7   USER_GENDER                            367 non-null    object             
 8   USER_HOUSEHOLD                         373 non-null    int64              
 9

In [4]:
# Test transactions, read with the same index and datetime parsing as the
# training data (dt_cols is defined in the training-data loading cell).
test_df = pd.read_csv(TEST_DATA, parse_dates=dt_cols, index_col=INDEX_COL)

# Summarise column dtypes and non-null counts.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 558 entries, ID_00x9h2yx to ID_zz7zds1s
Data columns (total 10 columns):
 #   Column                                 Non-Null Count  Dtype              
---  ------                                 --------------  -----              
 0   MERCHANT_CATEGORIZED_AT                558 non-null    datetime64[ns, UTC]
 1   MERCHANT_NAME                          558 non-null    object             
 2   PURCHASE_VALUE                         558 non-null    int64              
 3   PURCHASED_AT                           558 non-null    datetime64[ns, UTC]
 4   IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY  558 non-null    bool               
 5   USER_AGE                               85 non-null     float64            
 6   USER_GENDER                            553 non-null    object             
 7   USER_HOUSEHOLD                         558 non-null    int64              
 8   USER_INCOME                            558 non-null    int64              
 9

In [5]:
# Extra transactions, read with the same index and datetime parsing as the
# training data (dt_cols is defined in the training-data loading cell).
extra_df = pd.read_csv(EXTRA_DATA, parse_dates=dt_cols, index_col=INDEX_COL)

# NOTE: in the recorded output MERCHANT_CATEGORIZED_AT and
# MERCHANT_CATEGORIZED_AS have 0 non-null values, so they show up as
# datetime64[ns] (no UTC) and float64 rather than the dtypes seen in train.
extra_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, ID_v13grrw3 to ID_cxhvhlnw
Data columns (total 11 columns):
 #   Column                                 Non-Null Count  Dtype              
---  ------                                 --------------  -----              
 0   MERCHANT_CATEGORIZED_AT                0 non-null      datetime64[ns]     
 1   MERCHANT_NAME                          10000 non-null  object             
 2   MERCHANT_CATEGORIZED_AS                0 non-null      float64            
 3   PURCHASE_VALUE                         10000 non-null  int64              
 4   PURCHASED_AT                           10000 non-null  datetime64[ns, UTC]
 5   IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY  10000 non-null  bool               
 6   USER_AGE                               1158 non-null   float64            
 7   USER_GENDER                            9966 non-null   object             
 8   USER_HOUSEHOLD                         10000 non-null  int64              


## Pandas profiling report

In [6]:
# Build the profiling report for the training data. The explicit title keeps
# this report distinguishable from the other profiling reports in this notebook.
train_profile = ProfileReport(train_df, title="Train data profiling report")

# Trailing expression so the notebook renders the report.
train_profile

Summarize dataset:   0%|          | 0/25 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [7]:
# Build the profiling report for the test data. The explicit title keeps this
# report distinguishable from the other profiling reports in this notebook.
test_profile = ProfileReport(test_df, title="Test data profiling report")

# Trailing expression so the notebook renders the report.
test_profile

Summarize dataset:   0%|          | 0/24 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [8]:
# Build the profiling report for the extra data. The explicit title keeps this
# report distinguishable from the other profiling reports in this notebook.
extra_profile = ProfileReport(extra_df, title="Extra data profiling report")

# Trailing expression so the notebook renders the report.
extra_profile

Summarize dataset:   0%|          | 0/25 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

