In [3]:
import numpy as np
import pandas as pd
from collections import defaultdict

In [4]:
froot = "C:/Users/verdi/Documents/Datasets/IEEEFraudDetection/ieee-fraud-detection/"
transaction_fname = "train_transaction.csv"
id_fname = "train_identity.csv"
transaction_df = pd.read_csv(froot + transaction_fname)
identity_df = pd.read_csv(froot + id_fname)
df = transaction_df.merge(identity_df, how="left", on="TransactionID")
df.head(5)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


## Data types

In [5]:
num_df = df.select_dtypes(include=np.number)
cat_df = df.select_dtypes(exclude=np.number)
cat_cols = cat_df.columns
num_cols = num_df.columns

print("Number of numerical columns: %d" % len(num_cols))
print("Number of categorical columns: %d" % len(cat_cols))
print("Categorical columns: {}".format(cat_cols))

Number of numerical columns: 403
Number of categorical columns: 31
Categorical columns: Index(['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M1',
       'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_15',
       'id_16', 'id_23', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_33',
       'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType',
       'DeviceInfo'],
      dtype='object')


## Invalid values
- Numerical attributes
- Categorical attributes

In [6]:
# num_nan_cols = df.columns[df.isna().any()]
# nan_ratio = len(num_nan_cols) / len(num_cols)
# print("There are %d numerical columns with missing values (%2.4f of numerical columns)" % (len(num_nan_cols), nan_ratio))

# nan_cols_sum_series = df.isna().sum().sort_values(ascending=False)
# nan_cols_ratio_series = (df.isna().sum() / len(df)).sort_values(ascending=False)
# nan_cols_df = pd.DataFrame(
#     pd.concat((nan_cols_sum_series, nan_cols_ratio_series), axis=1),
# )
# nan_cols_df.style.to_latex("export/table/nan_col_series.tex")
# nan_cols_df.head(20)

In [7]:
nan_cols = df.columns[df.isna().any()]
nan_ratio = len(nan_cols) / len(df.columns)
print("There are %d columns with missing values (%2.4f of all columns)" % (len(nan_cols), nan_ratio))

nan_cols_sum_series = df.isna().sum().sort_values(ascending=False)
nan_cols_ratio_series = (df.isna().sum() / len(df)).sort_values(ascending=False)
nan_cols_df = pd.DataFrame(
    pd.concat((nan_cols_sum_series, nan_cols_ratio_series), axis=1),
)
nan_cols_df.style.to_latex("export/table/nan_col_series.tex")
nan_cols_df.head(20)

There are 414 columns with missing values (0.9539 of all columns)


Unnamed: 0,0,1
id_24,585793,0.991962
id_25,585408,0.99131
id_07,585385,0.991271
id_08,585385,0.991271
id_21,585381,0.991264
id_26,585377,0.991257
id_27,585371,0.991247
id_23,585371,0.991247
id_22,585371,0.991247
dist2,552913,0.936284


In [8]:
# # Vérifier parmi les valeurs manquantes si celles-ci sont exclusivement associées à des transactions
# # légitimes ou frauduleuses
# data = defaultdict()
# for col in nan_cols_ratio_series[nan_cols_ratio_series > 0.].index:
#     data[col] = {"isFraud==0": 0, "isFraud==1": 0}
#     data[col]["isFraud==0"] = df[df["isFraud"] == 0][col].isna().sum()
#     data[col]["isFraud==1"] = df[df["isFraud"] == 1][col].isna().sum()

# nan_count_per_cls = pd.DataFrame(data)
# nan_count_per_cls.to_csv("export/table/is_fraud_count_nan_cols.csv")
# nan_count_per_cls

In [9]:
# Colonnes avec 1 ou plusieurs valeurs nulles
nan_cols = df.columns[df.isna().sum() > 0]
# Colonnes ayant le même nombre de valeurs manquantes
dup_cols = df[nan_cols].columns[[df[nan_cols].isna().sum().duplicated()]]
# Colonnes obtenues à l'étape précédente dont le nombre de valeurs négatives ne dépasse pas 1% des données
cols = df[dup_cols].columns[df[dup_cols].isna().sum() < (.01 * len(df))]
# Suppression des valeurs manquantes
len_before = len(df)
df.dropna(inplace=True, subset=cols)
# Validation
assert df[cols].isna().sum().sum() == 0., "Il reste des valeurs manquantes"
assert df.isna().sum().sum() > 0., "Mauvaise assignation: il devrait rester des valeurs manquantes dans le reste des données"
diff = len_before - len(df)
print("Rows deleted: {}\nRatio: {:2.4f}".format(diff, diff / len_before))

  result = getitem(key)


Rows deleted: 1441
Ratio: 0.0024


In [23]:
cols_above_thresh = df.columns[(df.isna().sum() / len(df)) > 0.5]

Index(['dist1', 'dist2', 'R_emaildomain', 'D5', 'D6', 'D7', 'D8', 'D9', 'D12',
       'D13',
       ...
       'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38',
       'DeviceType', 'DeviceInfo'],
      dtype='object', length=214)

In [32]:
# Fonction grandement inspirée de https://www.kaggle.com/code/davidcairuz/feature-engineering-lightgbm
def id_split(dataframe):
    dataframe['device_name'] = dataframe['DeviceInfo'].str.split('/', expand=True)[0].str.split(' ', expand=True)[0]
    dataframe.loc[dataframe['device_name'].str.contains('SM', na=False), 'device_name'] = 'Samsung'
    dataframe.loc[dataframe['device_name'].str.contains('SAMSUNG', na=False), 'device_name'] = 'Samsung'
    dataframe.loc[dataframe['device_name'].str.contains('GT-', na=False), 'device_name'] = 'Samsung'
    dataframe.loc[dataframe['device_name'].str.contains('Moto G', na=False), 'device_name'] = 'Motorola'
    dataframe.loc[dataframe['device_name'].str.contains('Moto', na=False), 'device_name'] = 'Motorola'
    dataframe.loc[dataframe['device_name'].str.contains('moto', na=False), 'device_name'] = 'Motorola'
    dataframe.loc[dataframe['device_name'].str.contains('LG-', na=False), 'device_name'] = 'LG'
    dataframe.loc[dataframe['device_name'].str.contains('rv:', na=False), 'device_name'] = 'RV'
    dataframe.loc[dataframe['device_name'].str.contains('HUAWEI', na=False), 'device_name'] = 'Huawei'
    dataframe.loc[dataframe['device_name'].str.contains('ALE-', na=False), 'device_name'] = 'Huawei'
    dataframe.loc[dataframe['device_name'].str.contains('-L', na=False), 'device_name'] = 'Huawei'
    dataframe.loc[dataframe['device_name'].str.contains('Blade', na=False), 'device_name'] = 'ZTE'
    dataframe.loc[dataframe['device_name'].str.contains('BLADE', na=False), 'device_name'] = 'ZTE'
    dataframe.loc[dataframe['device_name'].str.contains('Linux', na=False), 'device_name'] = 'Linux'
    dataframe.loc[dataframe['device_name'].str.contains('XT', na=False), 'device_name'] = 'Sony'
    dataframe.loc[dataframe['device_name'].str.contains('HTC', na=False), 'device_name'] = 'HTC'
    dataframe.loc[dataframe['device_name'].str.contains('ASUS', na=False), 'device_name'] = 'Asus'

    dataframe.loc[dataframe.device_name.isin(dataframe.device_name.value_counts()[dataframe.device_name.value_counts() < 200].index), 'device_name'] = "Others"
    
    return dataframe

In [33]:
id_split(df)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,DeviceInfo,device_name,device_version,OS_id_30,version_id_30,browser_id_31,version_id_31,screen_width,screen_height,had_id
0,2987000,0,86400,68.50,W,13926,,150.0,discover,142.0,...,,,,,,,,,,1
1,2987001,0,86401,29.00,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,1
2,2987002,0,86469,59.00,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,1
3,2987003,0,86499,50.00,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,1
4,2987004,0,86506,50.00,H,4497,514.0,150.0,mastercard,102.0,...,SAMSUNG SM-G892A Build/NRD90M,Samsung,NRD90M,Android,7.0,samsung,browser,2220,1080,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,3577535,0,15811047,49.00,W,6550,,150.0,visa,226.0,...,,,,,,,,,,1
590536,3577536,0,15811049,39.50,W,10444,225.0,150.0,mastercard,224.0,...,,,,,,,,,,1
590537,3577537,0,15811079,30.95,W,12037,595.0,150.0,mastercard,224.0,...,,,,,,,,,,1
590538,3577538,0,15811088,117.00,W,7826,481.0,150.0,mastercard,224.0,...,,,,,,,,,,1


## Unique values

In [11]:
num_df.columns[num_df.nunique() == 1]


Index([], dtype='object')