## Imports and Global Settings

In [1]:
import pandas as pd

# Widen the column display limit so wide frames are shown without elided columns.
pd.set_option("display.max_columns", 500)

## Load Data

In [2]:
# Read the newline-delimited JSON records; `original_data` is kept as loaded
# and the feature engineering in this notebook operates on the `data` copy.
original_data = pd.read_json(
    '../data/clean_data.json',
    orient='records',
    lines=True,
)
data = original_data.copy(deep=True)

In [3]:
# Print column names, dtypes and non-null counts for the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11469 entries, 0 to 11468
Data columns (total 42 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   approx_payout_date  11469 non-null  int64  
 1   body_length         11469 non-null  int64  
 2   channels            11469 non-null  int64  
 3   country             11469 non-null  object 
 4   currency            11469 non-null  object 
 5   delivery_method     11469 non-null  int64  
 6   description         11469 non-null  object 
 7   email_domain        11469 non-null  object 
 8   event_created       11469 non-null  int64  
 9   event_end           11469 non-null  int64  
 10  event_start         11469 non-null  int64  
 11  fb_published        11469 non-null  int64  
 12  gts                 11469 non-null  float64
 13  has_analytics       11469 non-null  int64  
 14  has_header          11469 non-null  int64  
 15  has_logo            11469 non-null  int64  
 16  list

In [4]:
# Preview the first five rows of the working frame.
data.head()

Unnamed: 0,approx_payout_date,body_length,channels,country,currency,delivery_method,description,email_domain,event_created,event_end,event_start,fb_published,gts,has_analytics,has_header,has_logo,listed,name,name_length,num_order,num_payouts,object_id,org_desc,org_facebook,org_name,org_twitter,payee_name,payout_type,previous_payouts,sale_duration,sale_duration2,show_map,ticket_types,user_age,user_created,user_type,venue_country,venue_latitude,venue_longitude,venue_name,venue_state,is_fraud
0,1355097600000,43227,8,CA,CAD,1,"<p class=""MsoNormal""><span style=""color: #3333...",cedec.ca,1351629944000,1354665600000,1354654800000,0,85.29,0,0,1,1,Ten Follow-Up Mistakes Your Competition Loves,46,5,31,4710425,<p><strong>CEDEC Small Business Support Networ...,9,CEDEC Small Business Support Network,9,Unknown,ACH,"[{'name': '', 'created': '2011-01-13 00:00:00'...",35,35,1,"[{'event_id': 4710425, 'cost': 13.11, 'availab...",722,1289255732000,3,CA,45.047497,-73.584455,Café Hemmingford,Quebec,False
1,1334775600000,664,8,US,USD,1,Please join us for the IIDA Student Career Day...,yahoo.com,1328839445000,1334343600000,1334340000000,0,50.0,0,-1,0,1,Student Career Day 2012: Speed Mentoring - 11A...,57,8,6,2930589,<p>The IIDA Cleveland Akron City Center connec...,53,Cleveland Akron IIDA City Center,0,Unknown,ACH,"[{'name': '', 'created': '2011-10-25 01:11:37'...",61,64,1,"[{'event_id': 2930589, 'cost': 0.0, 'availabil...",205,1311116195000,3,US,41.499465,-81.682202,Ohio Desk,OH,False
2,1380684600000,923,11,US,USD,0,"<p style=""text-align: justify;"" align=""LEFT"">&...",projectgradli.org,1374514499000,1380252600000,1380243600000,0,9550.0,0,-1,1,1,DREAMING UNDER THE STARS,24,5,0,7540419,"<p style=""text-align: justify;"" align=""LEFT""><...",0,Project GRAD Long island,0,Unknown,ACH,"[{'name': '', 'created': '2013-10-01 03:13:01'...",66,66,1,"[{'event_id': 7540419, 'cost': 0.0, 'availabil...",0,1374514498000,1,US,40.728012,-73.60193,The Cradle of Aviation Museum,NY,False
3,1362142800000,4417,11,IE,EUR,0,"<p><strong>&nbsp;<span style=""font-family: hel...",gmail.com,1360608512000,1361710800000,1361680200000,0,1813.36,0,-1,1,0,King of Ping,12,51,1,5481976,<p>Mabos is a multi-purpose art space with a m...,27,mabos,11,Unknown,ACH,"[{'name': '', 'created': '2013-01-05 03:14:55'...",12,12,1,"[{'event_id': 5481976, 'cost': 16.74, 'availab...",50,1356308239000,3,IE,53.343803,-6.232139,mabos,County Dublin,False
4,1358746200000,2505,8,US,USD,1,"<p style=""text-align: center;""><strong>ONLY ON...",gmail.com,1353197931000,1358314200000,1358308800000,0,105.44,0,0,1,1,Everyone Communicates Few Connect MasterMind G...,50,1,1,4851467,"<p><span style=""font-family: Arial, Helvetica,...",14,"Kim D. Moore - Coach, Teacher and Speaker with...",10,Kim D. Moore,CHECK,"[{'name': 'Kim D. Moore', 'created': '2012-10-...",57,59,0,"[{'event_id': 4851467, 'cost': 99.0, 'availabi...",1029,1264268063000,4,Unknown,-1.0,-1.0,Unknown,Unknown,False


## Explore and Expand Nested Features

Aggregate the nested `previous_payouts` records into two per-event features: the number of previous payouts and their total amount.

In [5]:
def payouts_sum(x):
    """Return the sum of the ``'amount'`` values across payout records.

    Parameters
    ----------
    x : iterable of mapping
        Payout records; each one is indexed with the key ``'amount'``.

    Returns
    -------
    number
        Total of all amounts, or ``0`` when ``x`` is empty.

    Raises
    ------
    KeyError
        If a record has no ``'amount'`` key.
    """
    return sum(payout['amount'] for payout in x)

# Replace the nested payout list with a count and a total; the raw column is
# dropped once both aggregates have been derived from it.
data = (
    data
    .assign(
        num_previous_payouts=data['previous_payouts'].apply(len),
        previous_payouts_total=data['previous_payouts'].apply(payouts_sum),
    )
    .drop(columns=['previous_payouts'])
)

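Aggregate the nested `ticket_types` records into per-event features: the number of ticket types, the total tickets available, the total ticket value, and the average ticket cost.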

In [6]:
def get_tickets_available(x):
    """Return the sum of ``'quantity_total'`` across ticket-type records.

    Parameters
    ----------
    x : iterable of mapping
        Ticket-type records; each one is indexed with ``'quantity_total'``.

    Returns
    -------
    number
        Total quantity over all ticket types, or ``0`` when ``x`` is empty.

    Raises
    ------
    KeyError
        If a record has no ``'quantity_total'`` key.
    """
    return sum(ticket_type['quantity_total'] for ticket_type in x)

def get_total_ticket_value(x):
    """Return the sum of ``quantity_total * cost`` across ticket-type records.

    Parameters
    ----------
    x : iterable of mapping
        Ticket-type records; each one is indexed with ``'quantity_total'``
        and ``'cost'``.

    Returns
    -------
    number
        Combined value of all ticket types, or ``0`` when ``x`` is empty.

    Raises
    ------
    KeyError
        If a record lacks ``'quantity_total'`` or ``'cost'``.
    """
    return sum(
        ticket_type['quantity_total'] * ticket_type['cost']
        for ticket_type in x
    )

# Replace the nested ticket-type list with per-event aggregates.
data = (
    data
    .assign(
        num_ticket_types=data['ticket_types'].apply(len),
        num_tickets_available=data['ticket_types'].apply(get_tickets_available),
        total_ticket_value=data['ticket_types'].apply(get_total_ticket_value),
    )
    # NOTE(review): when num_tickets_available is 0 this division does not
    # raise; pandas yields NaN (0/0) or inf instead. Confirm that downstream
    # code handles those values.
    .assign(
        avg_ticket_cost=lambda frame: (
            frame['total_ticket_value'] / frame['num_tickets_available']
        )
    )
    .drop(columns=['ticket_types'])
)

## Composite Features from Datetimes

In [7]:
# Columns holding epoch timestamps in milliseconds.
datetime_features = [
    "approx_payout_date",
    "event_created",
    "event_end",
    "event_start",
    "user_created",
]
# Convert each listed column to pandas datetimes, leaving column order unchanged.
data = data.assign(
    **{name: pd.to_datetime(data[name], unit="ms") for name in datetime_features}
)

In [8]:
# Lead time between event creation and event start; `.dt.days` keeps only the
# whole-day component of the timedelta.
data['days_from_event_created_till_start'] = (
    data['event_start'].sub(data['event_created']).dt.days
)

In [9]:
# Remove the timestamp columns from the working frame.
datetime_features = [
    'approx_payout_date',
    'event_created',
    'event_end',
    'event_start',
    'user_created',
]
data = data.drop(datetime_features, axis='columns')

In [10]:
# Re-inspect columns and dtypes after the feature engineering steps above.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11469 entries, 0 to 11468
Data columns (total 42 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   body_length                         11469 non-null  int64  
 1   channels                            11469 non-null  int64  
 2   country                             11469 non-null  object 
 3   currency                            11469 non-null  object 
 4   delivery_method                     11469 non-null  int64  
 5   description                         11469 non-null  object 
 6   email_domain                        11469 non-null  object 
 7   fb_published                        11469 non-null  int64  
 8   gts                                 11469 non-null  float64
 9   has_analytics                       11469 non-null  int64  
 10  has_header                          11469 non-null  int64  
 11  has_logo                            11469

## Categorical Features