In [1]:
from IPython.display import display
from ml_tools import SimpleAggregator, Hour, SingleAggregator, custom_generator, Average
import pandas as pd
import seaborn as sns

# Transforming features

In [2]:
# Fixed seed for the random draws below, so that "Restart Kernel -> Run All"
# shows the same preview rows and builds the same working subset every time.
SAMPLE_SEED = 42

# Load the full single-table fraud dataset and preview ten random rows.
single_dataset = pd.read_csv("data/ml_tools_single_table_dataset/fraud.csv")
display(single_dataset.sample(10, random_state=SAMPLE_SEED))

# The full table is too big to run quickly (and unnecessary for the demo),
# so keep a 200-row random subset for the cells that only need a small frame.
single_dataset_trunc = single_dataset.sample(200, random_state=SAMPLE_SEED)

Unnamed: 0,id,card_id,store_id,datetime,amount,currency,customer_present,provider,lat,lng,region,country,fraud
18771,18771,13423,8451,2019-05-09 07:29:18,99832,LTL,True,Diners Club / Carte Blanche,25.66795,85.83636,Dalsingh Sarai,IN,True
73272,73272,7358,5913,2019-03-20 00:04:31,36385,MXN,True,Mastercard,4.02219,101.02083,Teluk Intan,MY,False
8973,8973,12063,2658,2019-06-07 12:23:37,25165,AFN,True,American Express,19.32889,-99.32556,San Lorenzo Acopilco,MX,False
78258,78258,10833,4576,2019-01-15 23:50:23,12267,INR,False,JCB 15 digit,50.8,3.16667,Wevelgem,BE,False
84286,84286,19007,8194,2019-05-30 00:48:04,98233,NOK,True,VISA 13 digit,22.4711,88.1453,Pujali,IN,True
36160,36160,20311,5746,2019-02-18 06:30:23,84044,BWP,True,JCB 15 digit,50.75,2.25,Saint-Omer,FR,False
42071,42071,6843,6843,2019-02-02 08:16:56,58922,KHR,False,Diners Club / Carte Blanche,22.83957,91.84128,Manikchari,BD,True
12549,12549,27316,4758,2019-02-07 13:23:34,44684,PGK,True,JCB 15 digit,57.47908,-4.22398,Inverness,GB,False
77872,77872,4050,9094,2019-01-09 19:09:34,25218,SGD,False,Diners Club / Carte Blanche,13.70167,-89.10944,Ilopango,SV,False
47993,47993,3753,3753,2019-04-21 04:16:25,59893,BBD,True,VISA 13 digit,46.32313,-0.45877,Niort,FR,False


In [3]:
# Parse the timestamp strings into pandas datetimes. This overwrites the column
# on the truncated frame itself, so later copies of it carry the parsed values.
single_dataset_trunc["datetime"] = pd.to_datetime(single_dataset_trunc["datetime"])

# The generator receives a copy of the truncated frame rather than the frame itself.
# NOTE(review): the SettingWithCopyWarning shown in this cell's output quotes a
# line that is not in this notebook, so it presumably originates inside ml_tools.
single_dataset_hours = Hour.generate_feature(single_dataset_trunc.copy())

display(single_dataset_hours.sample(10))

[3]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.iloc[:, idx][index] = time.hour


Unnamed: 0,id,card_id,store_id,datetime,amount,currency,customer_present,provider,lat,lng,region,country,fraud
74377,74377,25255,4676,1,40006,PGK,False,JCB 16 digit,3.8801,-77.03116,Buenaventura,CO,False
71908,71908,3532,3857,14,56445,NIS,False,Mastercard,37.05944,37.3825,Gaziantep,TR,False
71178,71178,33100,6896,11,35154,UZS,False,Diners Club / Carte Blanche,28.71271,77.656,Pilkhua,IN,False
42523,42523,2593,1783,15,13483,GMD,False,Diners Club / Carte Blanche,42.24113,-88.3162,Crystal Lake,US,False
71921,71921,21280,1934,8,31252,DKK,True,VISA 16 digit,40.93121,-73.89875,Yonkers,US,False
88794,88794,11847,9826,19,99655,ISK,True,Diners Club / Carte Blanche,36.19278,117.65694,Laiwu,CN,True
66360,66360,11887,3941,22,9631,IDR,True,VISA 16 digit,37.05944,37.3825,Gaziantep,TR,False
65793,65793,11944,2948,22,18523,SEK,False,JCB 16 digit,20.21322,-100.88023,Salvatierra,MX,False
96279,96279,32144,9765,16,63545,SAR,True,Discover,54.8421,46.5813,Alatyr’,RU,False
56982,56982,32068,6570,6,18316,GMD,True,Maestro,12.74482,4.52514,Argungu,NG,False


# Aggregating data
- Across multiple dataframes

In [4]:
# Read the two tables of the multi-table demo dataset.
transactions = pd.read_csv("data/ml_tools_multiiple_tables_dataset/transactions.csv")
products = pd.read_csv("data/ml_tools_multiiple_tables_dataset/products.csv")

# Pair the two frames in one aggregator; the labels are the names that show up
# in the printed relationship (e.g. "transactions.product_id").
tp_agg = SimpleAggregator(
    transactions, products, label1='transactions', label2='products'
)

# Relate the tables through their product_id columns, then print the relationship.
tp_agg.new_relationship('product_id', 'product_id')
print(tp_agg.relationships)

# Run the aggregation and render the resulting table.
agg = tp_agg.aggregate()
display(agg)


 RELATIONSHIPS:
transactions.product_id -> products.product_id



Unnamed: 0,product_id,count
0,4,106
1,5,104
2,1,102
3,3,96
4,2,92


# Aggregating data
- In one dataframe

In [5]:
# Build an aggregator over the full (untruncated) fraud table.
# NOTE(review): 'fraud' and 'currency' are passed positionally; their exact
# roles are defined by SingleAggregator in ml_tools — confirm there.
single_agg = SingleAggregator(single_dataset, 'fraud', 'currency')

# Aggregate and render the result.
currency_agg = single_agg.aggregate()
display(currency_agg)


Unnamed: 0,currency,count
0,TJS,677
1,VUV,667
2,CAD,667
3,NAD,666
4,CZK,664
...,...,...
159,SLL,557
160,LTL,556
161,PAB,542
162,AUD,541


# Custom feature generators

In [8]:
def square_column(data, column):
    """Add a ``<column>_squared`` column holding the element-wise square of ``column``.

    The new column is written onto ``data`` itself (the input is modified in
    place), and that same object is returned.

    Args:
        data: Frame-like object supporting ``data[column]`` reads and writes.
        column: Name of the column to square; ``'_squared'`` is appended to it
            to form the name of the new column.

    Returns:
        The same ``data`` object, now carrying the extra column.
    """
    values = data[column]
    target_name = column + '_squared'
    data[target_name] = values * values
    return data

# Wrap square_column in an ml_tools generator, giving it a name and a feature type.
cust_generator = custom_generator(
    square_column, name='squared', feature_type='polynomial'
)

# Run the generator on a copy of the truncated frame, targeting the 'amount' column.
new_data = cust_generator.generate_feature(single_dataset_trunc.copy(), 'amount')

# Show the generator's metadata, then the resulting frame.
print("name: ", cust_generator.name)
print("feature_type: ", cust_generator.feature_type)
display(new_data)

name:  squared
feature_type:  polynomial


Unnamed: 0,id,card_id,store_id,datetime,amount,currency,customer_present,provider,lat,lng,region,country,fraud,amount_squared
79788,79788,33268,8811,2019-03-30 03:49:55,11652,MWK,False,Discover,38.38479,-0.76773,Novelda,ES,False,135769104
53045,53045,21910,4725,2019-04-20 15:45:09,72120,AZN,True,JCB 15 digit,53.59337,9.47629,Stade,DE,False,5201294400
11391,11391,18037,4994,2019-05-30 19:55:54,48065,IRR,True,VISA 19 digit,22.98212,-80.58556,Corralillo,CU,False,2310244225
87348,87348,12254,5799,2019-04-16 01:36:48,7970,SOS,False,Mastercard,49.88307,-119.48568,Kelowna,CA,False,63520900
28473,28473,28234,6008,2019-07-20 11:19:14,85404,MXN,True,Maestro,-3.14306,-58.44417,Itacoatiara,BR,True,7293843216
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31703,31703,29107,756,2019-01-27 20:22:07,61417,LRD,False,VISA 16 digit,20.88098,75.11937,Parola,IN,False,3772047889
42523,42523,2593,1783,2019-01-13 15:22:17,13483,GMD,False,Diners Club / Carte Blanche,42.24113,-88.31620,Crystal Lake,US,False,181791289
66500,66500,27280,3973,2019-01-20 01:59:13,68980,BOB,True,Mastercard,40.98894,28.67582,Yakuplu,TR,False,4758240400
12741,12741,19221,3755,2019-04-04 19:43:23,42913,GTQ,False,Discover,3.27833,32.88667,Kitgum,UG,False,1841525569


# Average aggregator

In [None]:
# Column name handed to the Average aggregator as its key.
key = 'species'

# Load seaborn's bundled iris example dataset.
iris = sns.load_dataset('iris')

# Build the aggregator, run it, and render the result.
avg_agg = Average(iris, key, label='iris')
averages = avg_agg.aggregate()
display(averages)