In [1]:
%matplotlib notebook
import numpy as np # linear algebra
import pandas as pd # data processing
import seaborn as sns  # data visualization library  
import matplotlib.pyplot as plt

## Merge datasets

In [2]:
# Directory holding the input CSVs. File names are appended directly,
# so the value must end with a path separator.
folder_path = './'

# Raw transaction / identity tables for each split, plus the sample submission.
train_transaction = pd.read_csv(f'{folder_path}train_transaction.csv')
train_identity = pd.read_csv(f'{folder_path}train_identity.csv')
test_transaction = pd.read_csv(f'{folder_path}test_transaction.csv')
test_identity = pd.read_csv(f'{folder_path}test_identity.csv')
sub = pd.read_csv(f'{folder_path}sample_submission.csv')

# Left-join identity onto transaction by TransactionID: every transaction row is
# kept, and identity columns are NaN where no matching identity row exists.
train = train_transaction.merge(train_identity, on='TransactionID', how='left')
test = test_transaction.merge(test_identity, on='TransactionID', how='left')

In [3]:
# Report the row/column counts of both merged frames.
for split_name, split_frame in (('Train', train), ('Test', test)):
    n_rows, n_cols = split_frame.shape
    print(f'{split_name} dataset has {n_rows} rows and {n_cols} columns.')

Train dataset has 590540 rows and 434 columns.
Test dataset has 506691 rows and 433 columns.


In [4]:
# Drop the names of the four raw tables: they have already been merged into
# `train` and `test`, so releasing them lets their memory be reclaimed.
del train_identity, train_transaction, test_identity, test_transaction

## Drop the columns with 85% or more null values
The transaction table has more cases of missing data. 55 of the 394 features have more than 80% missing data, and 113 features have between 70% and 80% missing data. Similar missing-data patterns were found among features with consecutive names. Specifically, the missing data rates of "D6"-"D9" and "D12"-"D14" are all above 87.3122%. The missing data rate for "V138"-"V166" is between 86.1227% and 86.1237%. All features from "V323" to "V339" have a missing data rate of 86.054967%. This regularity in the missing data suggests that there may be a strong correlation between these consecutively numbered features, although the data provider does not explain the specific meaning of these encrypted features.

In [5]:
# Column names to drop, grouped by feature family.
# NOTE(review): the numeric suffix in each list name presumably refers to the
# approximate missing-data percentage of that group - confirm.

# D6-D9 followed by D12-D14.
list_drop_89 = [f"D{n}" for n in (*range(6, 10), *range(12, 15))]
print("list_drop_89: ",list_drop_89)

# V138-V166 (inclusive).
list_drop_86 = [f"V{n}" for n in range(138, 167)]
print("list_drop_86: ",list_drop_86)

# V323-V339 (inclusive).
list_drop_85 = [f"V{n}" for n in range(323, 340)]
print("list_drop_85: ",list_drop_85)

list_drop_89:  ['D6', 'D7', 'D8', 'D9', 'D12', 'D13', 'D14']
list_drop_86:  ['V138', 'V139', 'V140', 'V141', 'V142', 'V143', 'V144', 'V145', 'V146', 'V147', 'V148', 'V149', 'V150', 'V151', 'V152', 'V153', 'V154', 'V155', 'V156', 'V157', 'V158', 'V159', 'V160', 'V161', 'V162', 'V163', 'V164', 'V165', 'V166']
list_drop_85:  ['V323', 'V324', 'V325', 'V326', 'V327', 'V328', 'V329', 'V330', 'V331', 'V332', 'V333', 'V334', 'V335', 'V336', 'V337', 'V338', 'V339']


In [7]:
# Single flat list of every column to drop, in the order D-group, V138+, V323+.
droplist = [*list_drop_89, *list_drop_86, *list_drop_85]
print(droplist)

['D6', 'D7', 'D8', 'D9', 'D12', 'D13', 'D14', 'V138', 'V139', 'V140', 'V141', 'V142', 'V143', 'V144', 'V145', 'V146', 'V147', 'V148', 'V149', 'V150', 'V151', 'V152', 'V153', 'V154', 'V155', 'V156', 'V157', 'V158', 'V159', 'V160', 'V161', 'V162', 'V163', 'V164', 'V165', 'V166', 'V323', 'V324', 'V325', 'V326', 'V327', 'V328', 'V329', 'V330', 'V331', 'V332', 'V333', 'V334', 'V335', 'V336', 'V337', 'V338', 'V339']


In [8]:
# The three partial lists have been concatenated into `droplist`;
# remove the now-redundant names from the namespace.
del list_drop_89, list_drop_86, list_drop_85

In [9]:
# Remove the sparse columns from both frames.
# `columns=` is used instead of the positional `axis` argument (`drop(labels, 1)`):
# the positional form was deprecated in pandas 1.3 and raises TypeError on pandas >= 2.0.
# Note: this cell is not re-runnable on the same frames - a second run raises
# KeyError because the columns are already gone.
train = train.drop(columns=droplist)
test = test.drop(columns=droplist)
print(f'Train dataset has {train.shape[0]} rows and {train.shape[1]} columns.')
print(f'Test dataset has {test.shape[0]} rows and {test.shape[1]} columns.')

Train dataset has 590540 rows and 381 columns.
Test dataset has 506691 rows and 380 columns.


## Clean up the email domain features (“P_emaildomain” and “R_emaildomain”)
We need to clean up the email domain features (“P_emaildomain” and “R_emaildomain”), because some providers appear under several country-specific domains, for example “yahoo.co.jp”, “yahoo.de”, and “yahoo.fr”. All we need here is the email provider, so all three of these records are converted to “Yahoo”.

In [59]:
# Provider label -> raw domain strings that collapse into it.
# Defined once so train and test are guaranteed to use the same lists
# (previously each list was written out twice, once per frame).
email_provider_domains = {
    "Yahoo": ['yahoo.co.jp', 'yahoo.co.uk', 'yahoo.com', 'yahoo.com.mx', 'yahoo.de', 'yahoo.es',
              'yahoo.fr', 'ymail.com'],
    "Gmail": ['gmail', 'gmail.com'],
    "Hotmail": ['hotmail.co.uk', 'hotmail.com', 'hotmail.de', 'hotmail.es', 'hotmail.fr'],
    "Live": ['live.com', 'live.com.mx', 'live.fr'],
    "Outlook": ['outlook.com', 'outlook.es'],
    "Netzero": ['netzero.com', 'netzero.net'],
    "Frontiernet": ['frontier.com', 'frontiernet.net'],
}

# Overwrite matching values in place in both email columns of both frames.
# Values not listed above (including missing values) are left unchanged.
# No label appears in any domain list, so re-running this cell is a no-op.
for col in ['P_emaildomain','R_emaildomain']:
    for frame in (train, test):
        for provider, domains in email_provider_domains.items():
            frame.loc[frame[col].isin(domains), col] = provider


In [60]:
# Check: sorted distinct non-null values of train['P_emaildomain'] after the clean-up.
p = train['P_emaildomain'].unique()
p_new = np.sort(p[pd.notnull(p)])
p_new

array(['Frontiernet', 'Gmail', 'Hotmail', 'Live', 'Netzero', 'Outlook',
       'Yahoo', 'aim.com', 'anonymous.com', 'aol.com', 'att.net',
       'bellsouth.net', 'cableone.net', 'centurylink.net', 'cfl.rr.com',
       'charter.net', 'comcast.net', 'cox.net', 'earthlink.net',
       'embarqmail.com', 'gmx.de', 'icloud.com', 'juno.com', 'mac.com',
       'mail.com', 'me.com', 'msn.com', 'optonline.net', 'prodigy.net.mx',
       'protonmail.com', 'ptd.net', 'q.com', 'roadrunner.com',
       'rocketmail.com', 'sbcglobal.net', 'sc.rr.com', 'servicios-ta.com',
       'suddenlink.net', 'twc.com', 'verizon.net', 'web.de',
       'windstream.net'], dtype=object)

In [61]:
# Check: sorted distinct non-null values of test['P_emaildomain'] after the clean-up.
p = test['P_emaildomain'].unique()
p_new = np.sort(p[pd.notnull(p)])
p_new

array(['Frontiernet', 'Gmail', 'Hotmail', 'Live', 'Netzero', 'Outlook',
       'Yahoo', 'aim.com', 'anonymous.com', 'aol.com', 'att.net',
       'bellsouth.net', 'cableone.net', 'centurylink.net', 'cfl.rr.com',
       'charter.net', 'comcast.net', 'cox.net', 'earthlink.net',
       'embarqmail.com', 'gmx.de', 'icloud.com', 'juno.com', 'mac.com',
       'mail.com', 'me.com', 'msn.com', 'optonline.net', 'prodigy.net.mx',
       'protonmail.com', 'ptd.net', 'q.com', 'roadrunner.com',
       'rocketmail.com', 'sbcglobal.net', 'sc.rr.com', 'scranton.edu',
       'servicios-ta.com', 'suddenlink.net', 'twc.com', 'verizon.net',
       'web.de', 'windstream.net'], dtype=object)

In [62]:
# Check: sorted distinct non-null values of train['R_emaildomain'] after the clean-up.
p = train['R_emaildomain'].unique()
p_new = np.sort(p[pd.notnull(p)])
p_new

array(['Frontiernet', 'Gmail', 'Hotmail', 'Live', 'Netzero', 'Outlook',
       'Yahoo', 'aim.com', 'anonymous.com', 'aol.com', 'att.net',
       'bellsouth.net', 'cableone.net', 'centurylink.net', 'cfl.rr.com',
       'charter.net', 'comcast.net', 'cox.net', 'earthlink.net',
       'embarqmail.com', 'gmx.de', 'icloud.com', 'juno.com', 'mac.com',
       'mail.com', 'me.com', 'msn.com', 'optonline.net', 'prodigy.net.mx',
       'protonmail.com', 'ptd.net', 'q.com', 'roadrunner.com',
       'rocketmail.com', 'sbcglobal.net', 'sc.rr.com', 'scranton.edu',
       'servicios-ta.com', 'suddenlink.net', 'twc.com', 'verizon.net',
       'web.de', 'windstream.net'], dtype=object)

In [63]:
# Check: sorted distinct non-null values of test['R_emaildomain'] after the clean-up.
p = test['R_emaildomain'].unique()
p_new = np.sort(p[pd.notnull(p)])
p_new

array(['Frontiernet', 'Gmail', 'Hotmail', 'Live', 'Netzero', 'Outlook',
       'Yahoo', 'aim.com', 'anonymous.com', 'aol.com', 'att.net',
       'bellsouth.net', 'cableone.net', 'centurylink.net', 'cfl.rr.com',
       'charter.net', 'comcast.net', 'cox.net', 'earthlink.net',
       'embarqmail.com', 'gmx.de', 'icloud.com', 'juno.com', 'mac.com',
       'mail.com', 'me.com', 'msn.com', 'optonline.net', 'prodigy.net.mx',
       'protonmail.com', 'ptd.net', 'q.com', 'roadrunner.com',
       'rocketmail.com', 'sbcglobal.net', 'sc.rr.com', 'scranton.edu',
       'servicios-ta.com', 'suddenlink.net', 'twc.com', 'verizon.net',
       'web.de', 'windstream.net'], dtype=object)

References: 
https://www.kaggle.com/code/artgor/eda-and-models