In [1]:
from IPython.display import display
from ml_tools import SimpleAggregator, Hour, SingleAggregator, custom_generator, Average
import pandas as pd
import seaborn as sns

# Transforming features

In [2]:
# Fixed seed so the preview and the truncated subset contain the same rows on
# every Restart & Run All (an unseeded .sample() draws different rows each run).
SAMPLE_SEED = 42

single_dataset = pd.read_csv("data/ml_tools_single_table_dataset/fraud.csv")
display(single_dataset.sample(10, random_state=SAMPLE_SEED))
# The full table is too big to run quickly (and unnecessary for the demo), so the
# feature-generation cells below work on a 200-row random subset instead.
single_dataset_trunc = single_dataset.sample(200, random_state=SAMPLE_SEED)

Unnamed: 0,id,card_id,store_id,datetime,amount,currency,customer_present,provider,lat,lng,region,country,fraud
40201,40201,2807,2706,2019-01-19 10:33:39,78134,LTL,False,VISA 19 digit,40.1675,34.37389,Sungurlu,TR,False
58690,58690,20741,9220,2019-05-27 20:39:31,47113,LTL,True,Mastercard,-2.90055,-79.00453,Cuenca,EC,True
65959,65959,21790,499,2019-04-26 23:12:27,22353,HKD,False,JCB 16 digit,42.52787,-70.92866,Peabody,US,False
38881,38881,7193,9214,2019-03-31 13:11:10,29794,EGP,False,Discover,13.44581,101.18445,Phanat Nikhom,TH,False
25523,25523,2163,5336,2019-02-07 08:53:45,40237,PYG,False,American Express,44.99012,-123.02621,Keizer,US,False
31228,31228,7391,7391,2019-07-06 10:46:15,15969,CZK,True,Mastercard,-1.63333,13.58357,Franceville,GA,False
79243,79243,26955,1067,2019-07-22 04:30:39,14844,BSD,False,VISA 19 digit,54.60528,18.34717,Reda,PL,False
70705,70705,11994,561,2019-07-22 13:41:17,69742,KZT,True,Mastercard,15.73628,75.96976,Gajendragarh,IN,False
57522,57522,18178,3288,2019-02-08 14:04:39,38678,SLL,False,VISA 19 digit,48.49144,9.20427,Reutlingen,DE,False
15037,15037,16970,1860,2019-01-15 13:51:21,62343,JEP,False,VISA 13 digit,41.27194,123.17306,Liaoyang,CN,False


In [3]:
# Parse the timestamp strings read from the CSV into pandas datetimes.
single_dataset_trunc["datetime"] = pd.to_datetime(single_dataset_trunc["datetime"])

# The generator is handed a copy, so `single_dataset_trunc` itself is not passed in.
# NOTE(review): the SettingWithCopyWarning recorded in this cell's output quotes
# `data.iloc[:, idx][index] = time.hour`, a line that does not appear in this
# notebook -- presumably it comes from inside Hour.generate_feature; confirm.
single_dataset_hours = Hour.generate_feature(
    single_dataset_trunc.copy()
)
display(single_dataset_hours.sample(10))

[3]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.iloc[:, idx][index] = time.hour


Unnamed: 0,id,card_id,store_id,datetime,amount,currency,customer_present,provider,lat,lng,region,country,fraud
42307,42307,14982,1825,12,84956,PKR,False,Mastercard,48.77644,2.29026,Sceaux,FR,False
92897,92897,15301,7563,15,20365,KES,False,JCB 16 digit,53.59337,9.47629,Stade,DE,False
95727,95727,4107,4107,2,98044,MXN,False,VISA 13 digit,42.0,21.32778,Saraj,MK,True
86646,86646,11250,549,12,28056,CUC,False,VISA 16 digit,47.4943,-122.24092,Bryn Mawr-Skyway,US,False
32819,32819,29816,2136,8,30363,KHR,True,American Express,43.78956,7.60872,Ventimiglia,IT,False
5884,5884,17073,8934,17,56798,MDL,False,JCB 16 digit,43.54072,-116.56346,Nampa,US,False
77499,77499,9314,6882,21,52325,CNY,False,Diners Club / Carte Blanche,10.13361,124.84472,Maasin,PH,False
92517,92517,7778,7778,22,57595,TVD,True,Diners Club / Carte Blanche,3.11988,35.59642,Lodwar,KE,False
56611,56611,24870,3206,14,65543,GNF,True,JCB 15 digit,46.32313,-0.45877,Niort,FR,False
33906,33906,9724,9724,2,37848,BAM,False,Diners Club / Carte Blanche,39.32288,-76.72803,Woodlawn,US,True


# Aggregating data
- Across multiple dataframes

In [4]:
# Load the two tables that will be aggregated together.
transactions = pd.read_csv("data/ml_tools_multiiple_tables_dataset/transactions.csv")
products = pd.read_csv("data/ml_tools_multiiple_tables_dataset/products.csv")

# Wrap both frames in one aggregator, naming each table with a label.
tp_agg = SimpleAggregator(
    transactions,
    products,
    label1='transactions',
    label2='products',
)

# Register a relationship between the `product_id` columns, then print the
# relationships the aggregator currently holds.
tp_agg.new_relationship('product_id', 'product_id')
print(tp_agg.relationships)

# Run the aggregation and show the resulting table.
agg = tp_agg.aggregate()
display(agg)


 RELATIONSHIPS:
transactions.product_id -> products.product_id



Unnamed: 0,product_id,count
0,4,106
1,5,104
2,1,102
3,3,96
4,2,92


# Aggregating data
- In one dataframe

In [5]:
# Aggregate within a single table: this uses the full (untruncated) dataset.
# NOTE(review): the meaning of the 'fraud' and 'currency' arguments is defined by
# SingleAggregator; the recorded output has one row per currency -- confirm.
single_agg = SingleAggregator(
    single_dataset,
    'fraud',
    'currency',
)

currency_agg = single_agg.aggregate()
display(currency_agg)


Unnamed: 0,currency,count
0,TJS,677
1,CAD,667
2,VUV,667
3,NAD,666
4,CZK,664
...,...,...
159,SLL,557
160,LTL,556
161,PAB,542
162,AUD,541


# Custom feature generators

In [6]:
def square_column(data, column):
    """Add a ``<column>_squared`` column holding each value of ``column`` times itself.

    Args:
        data: Table supporting ``data[name]`` lookup and assignment
            (a pandas DataFrame in this notebook).
        column: Name of the existing column to square.

    Returns:
        The same ``data`` object, modified in place with the new column added.
    """
    values = data[column]
    squared_name = column + '_squared'
    data[squared_name] = values * values
    return data

# Wrap the plain function so it can be used through the `generate_feature` call below.
cust_generator = custom_generator(square_column)

# Pass a copy of the truncated frame, plus the name of the column to square.
new_data = cust_generator.generate_feature(
    single_dataset_trunc.copy(),
    'amount',
)

display(new_data)

Unnamed: 0,id,card_id,store_id,datetime,amount,currency,customer_present,provider,lat,lng,region,country,fraud,amount_squared
39382,39382,9070,5976,2019-04-30 14:57:35,21229,SCR,False,JCB 15 digit,36.02506,-86.77917,Brentwood Estates,US,False,450670441
45348,45348,29776,7172,2019-06-24 03:08:05,17410,ILS,True,JCB 15 digit,12.12770,78.15794,Dharmapuri,IN,False,303108100
65247,65247,26345,4021,2019-05-07 17:29:37,45881,BSD,True,Mastercard,34.31000,-2.16000,Jerada,MA,False,2105066161
51107,51107,10843,4922,2019-07-25 02:27:40,29776,JOD,False,JCB 16 digit,40.63316,-74.13653,Port Richmond,US,False,886610176
71414,71414,7944,6229,2019-07-20 20:32:24,87390,CVE,False,Mastercard,24.81757,84.63445,Rafiganj,IN,False,7637012100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66186,66186,2481,2481,2019-04-15 06:02:14,42906,ZMW,False,Maestro,46.32313,-0.45877,Niort,FR,False,1840924836
44059,44059,18976,4859,2019-04-02 22:03:09,92661,KGS,False,VISA 16 digit,27.85000,112.90000,Xiangtan,CN,True,8586060921
81931,81931,3076,7182,2019-07-21 01:17:13,10664,BGN,True,JCB 16 digit,20.28527,-103.42897,Jocotepec,MX,False,113720896
55244,55244,9217,3620,2019-07-22 14:10:37,43417,SAR,False,American Express,6.25947,102.05461,Tak Bai,TH,False,1885035889


# Average aggregator

In [7]:
# Seaborn's bundled iris dataset; `key` is the column handed to the aggregator.
iris = sns.load_dataset('iris')
key = 'species'

# Build the Average aggregator over the iris frame.
avg_agg = Average(
    iris,
    key,
    label='iris',
)

# Run the aggregation and show the result.
averages = avg_agg.aggregate()
display(averages)

Unnamed: 0,species,count,sepal_length_avg,sepal_width_avg,petal_length_avg,petal_width_avg
0,setosa,50,5.006,3.428,1.462,0.246
1,versicolor,50,5.936,2.77,4.26,1.326
2,virginica,50,6.588,2.974,5.552,2.026
