# Overall Analysis of the Data and Preparation
## Assessing Quality + Fixing if necessary

In [1]:
# imports 
import pandas as pd

In [2]:
# Read the raw PSP spreadsheet; the first column of the sheet is used as the index.
df = pd.read_excel(
    io='../data/raw/psp_raw_data.xlsx',
    index_col=0,
)

In [3]:
## initial cleanup
# Lower-case every column name, then replace the index that came from the file
# with a fresh 0..n-1 index (drop=True discards the old one instead of keeping
# it as a column).
df = df.rename(columns=str.lower).reset_index(drop=True)

In [4]:
# Eyeball both ends of the frame: the first 15 rows, then the default-sized tail.
display(df.head(15), df.tail())

Unnamed: 0,tmsp,country,amount,success,psp,3d_secured,card
0,2019-01-01 00:01:11,Germany,89,0,UK_Card,0,Visa
1,2019-01-01 00:01:17,Germany,89,1,UK_Card,0,Visa
2,2019-01-01 00:02:49,Germany,238,0,UK_Card,1,Diners
3,2019-01-01 00:03:13,Germany,238,1,UK_Card,1,Diners
4,2019-01-01 00:04:33,Austria,124,0,Simplecard,0,Diners
5,2019-01-01 00:06:41,Switzerland,282,0,UK_Card,0,Master
6,2019-01-01 00:07:19,Switzerland,282,0,Simplecard,0,Master
7,2019-01-01 00:08:46,Germany,117,1,UK_Card,0,Master
8,2019-01-01 00:09:56,Switzerland,174,0,Simplecard,0,Visa
9,2019-01-01 00:10:49,Switzerland,174,0,Simplecard,0,Visa


Unnamed: 0,tmsp,country,amount,success,psp,3d_secured,card
50405,2019-02-28 23:45:39,Switzerland,415,0,UK_Card,0,Visa
50406,2019-02-28 23:46:48,Austria,91,0,UK_Card,0,Master
50407,2019-02-28 23:47:04,Austria,91,0,UK_Card,0,Master
50408,2019-02-28 23:47:36,Austria,91,0,UK_Card,0,Master
50409,2019-02-28 23:48:19,Austria,91,1,Moneycard,0,Master


In [5]:
# check data types
# (dtypes as pandas inferred them on load; no casts have been applied up to here)
df.dtypes

tmsp          datetime64[ns]
country               object
amount                 int64
success                int64
psp                   object
3d_secured             int64
card                  object
dtype: object

In [6]:
# Recast the two 0/1 flag columns (success, 3d_secured) from int64 to the generic
# object dtype so that the select_dtypes(include=['object']) summaries further
# down treat them like the other label columns. Note: this is 'object', not
# pandas' dedicated 'category' dtype.
df = df.astype({'success': 'object', '3d_secured': 'object'})
df.dtypes

tmsp          datetime64[ns]
country               object
amount                 int64
success               object
psp                   object
3d_secured            object
card                  object
dtype: object

In [7]:
# Number of missing entries per column (isna is the same check as isnull).
print(df.isna().sum())

tmsp          0
country       0
amount        0
success       0
psp           0
3d_secured    0
card          0
dtype: int64


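- data quality is good:
    - values are correct, there are no missing values, and the types are correct
        - the boolean columns (`success`, `3d_secured`) had to be recast, but that is specific to the database export and pandas rather than a problem with the data itself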


In [11]:
# Frequency table for every object-typed column, each followed by a dashed rule.
categorical_columns = df.select_dtypes(include=['object']).columns
for name in categorical_columns:
    print(df[name].value_counts(), '---------------------------', sep='\n')

country
Germany        30233
Switzerland    10338
Austria         9839
Name: count, dtype: int64
---------------------------
success
0    40182
1    10228
Name: count, dtype: int64
---------------------------
psp
UK_Card       26459
Simplecard    12446
Moneycard      8297
Goldcard       3208
Name: count, dtype: int64
---------------------------
3d_secured
0    38399
1    12011
Name: count, dtype: int64
---------------------------
card
Master    29002
Visa      11640
Diners     9768
Name: count, dtype: int64
---------------------------


In [12]:
# For every object-typed column, print the absolute count of each value next to
# its share of all rows (in percent) to get a first feel for the distribution.
for name in df.select_dtypes(include=['object']).columns:
    column = df[name]
    summary = pd.DataFrame({
        'count': column.value_counts(),
        'percentage': column.value_counts(normalize=True).mul(100),
    })
    print(summary)
    print('---------------------------')

             count  percentage
country                       
Germany      30233   59.974211
Switzerland  10338   20.507836
Austria       9839   19.517953
---------------------------
         count  percentage
success                   
0        40182   79.710375
1        10228   20.289625
---------------------------
            count  percentage
psp                          
UK_Card     26459   52.487602
Simplecard  12446   24.689546
Moneycard    8297   16.459036
Goldcard     3208    6.363817
---------------------------
            count  percentage
3d_secured                   
0           38399   76.173378
1           12011   23.826622
---------------------------
        count  percentage
card                     
Master  29002   57.532236
Visa    11640   23.090657
Diners   9768   19.377108
---------------------------


In [13]:
# Summary statistics for the columns describe() covers by default, transposed so
# that each described column becomes one row.
df.describe().transpose()

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
tmsp,50410.0,2019-01-29 16:28:52.923269120,2019-01-01 00:01:11,2019-01-14 12:00:36.249999872,2019-01-30 15:20:07.500000,2019-02-13 01:09:50.500000,2019-02-28 23:48:19,
amount,50410.0,202.395715,6.0,133.0,201.0,269.0,630.0,96.27473


- The data covers the full months of January and February 2019
- The mean and percentiles of `amount` are not interpreted here, because the records are individual transaction attempts rather than aggregated purchases
    - the range is still worth noting: min 6, max 630

In [14]:
# Write the cleaned frame to the intermediate data folder, omitting the row index.
# NOTE(review): CSV stores no dtype information, so the object casts on
# success / 3d_secured and the datetime type of tmsp will presumably need to be
# re-applied when this file is read back — confirm in the consuming code.
df.to_csv(
    path_or_buf='../data/intermediate/clean_data.csv',
    index=False,
)
