In [1]:
import numpy as np
import pandas as pd

# Analyse élémentaire
Avoir une compréhension des données avant de les traiter.
## Chargement des données

In [2]:
froot = "C:/Users/verdi/Documents/Datasets/IEEEFraudDetection/ieee-fraud-detection/"
transaction_fname = "train_transaction.csv"
id_fname = "train_identity.csv"
transaction_df = pd.read_csv(froot + transaction_fname)
identity_df = pd.read_csv(froot + id_fname)
df = transaction_df.merge(identity_df, how="left", on="TransactionID")
target = df["isFraud"]
df.head(5)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


## Types des données
Les données comprennent majoritairement des attributs numériques, mais contient aussi de nombreux attributs catégoriques.

In [3]:
num_df = df.select_dtypes(include=np.number)
cat_df = df.select_dtypes(exclude=np.number)
cat_cols = cat_df.columns
num_cols = num_df.columns

print("Number of numerical columns: %d" % len(num_cols))
print("Number of categorical columns: %d" % len(cat_cols))
print("Categorical columns: {}".format(cat_cols))

Number of numerical columns: 403
Number of categorical columns: 31
Categorical columns: Index(['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M1',
       'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_15',
       'id_16', 'id_23', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_33',
       'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType',
       'DeviceInfo'],
      dtype='object')


## Valeurs invalides
Les données sont majoritairement non définies (NaN).

In [4]:
# Données avec des valeurs uniques
# uniq_cols = df[df.nunique() <= 2].columns
# uniq_cols
uniq_cols = list(set(df.columns[df.nunique() <= 2]) - {"isFraud"})
df = df.drop(uniq_cols, axis=1)
assert len(set(df.columns[df.nunique() <= 2]) - {"isFraud"}) == 0, "there are still attributs with unique values"
print("Dropped {} attributes with unique values".format(len(uniq_cols)))
uniq_cols

Dropped 25 attributes with unique values


['M8',
 'M6',
 'id_38',
 'id_37',
 'V88',
 'M2',
 'id_35',
 'id_36',
 'V1',
 'V14',
 'id_16',
 'M3',
 'V41',
 'V305',
 'V65',
 'M9',
 'V107',
 'DeviceType',
 'id_28',
 'id_12',
 'id_29',
 'id_27',
 'M5',
 'M1',
 'M7']

## Valeurs manquantes ($NaN$)
Les données sont majoritairement polluées par des valeurs manquantes.

In [5]:
nan_cols = df.columns[df.isna().any()]
nan_ratio = len(nan_cols) / len(df.columns)
print("There are %d columns with missing values (%2.4f of all columns)" % (len(nan_cols), nan_ratio))

nan_cols_sum_series = df.isna().sum().sort_values(ascending=False)
nan_cols_ratio_series = (df.isna().sum() / len(df)).sort_values(ascending=False)
nan_cols_df = pd.DataFrame(
    pd.concat((nan_cols_sum_series, nan_cols_ratio_series), axis=1),
)
nan_cols_df.style.to_latex("export/table/nan_col_series.tex")
nan_cols_df.head(20)

There are 389 columns with missing values (0.9511 of all columns)


Unnamed: 0,0,1
id_24,585793,0.991962
id_25,585408,0.99131
id_08,585385,0.991271
id_07,585385,0.991271
id_21,585381,0.991264
id_26,585377,0.991257
id_23,585371,0.991247
id_22,585371,0.991247
dist2,552913,0.936284
D7,551623,0.934099


In [6]:
# Colonnes avec 1 ou plusieurs valeurs nulles
nan_cols = df.columns[df.isna().sum() > 0]
# Colonnes ayant le même nombre de valeurs manquantes
dup_cols = df[nan_cols].columns[[df[nan_cols].isna().sum().duplicated()]]
# Colonnes obtenues à l'étape précédente dont le nombre de valeurs négatives ne dépasse pas 1% des données
cols = df[dup_cols].columns[df[dup_cols].isna().sum() < (.01 * len(df))]
# Suppression des valeurs manquantes
len_before = len(df)
df.dropna(inplace=True, subset=cols)
# Validation
assert df[cols].isna().sum().sum() == 0., "Il reste des valeurs manquantes"
assert df.isna().sum().sum() > 0., "Mauvaise assignation: il devrait rester des valeurs manquantes dans le reste des données"
diff = len_before - len(df)
print("Rows deleted: {}\nRatio: {:2.4f}".format(diff, diff / len_before))

  result = getitem(key)


Rows deleted: 1441
Ratio: 0.0024


In [7]:
# Affichage des colonnes dont le ratio de valeurs manquantes dépassent 50% des données
cols_above_thresh = df.columns[(df.isna().sum() / len(df)) > 0.5]
cols_above_thresh

Index(['dist1', 'dist2', 'R_emaildomain', 'D5', 'D6', 'D7', 'D8', 'D9', 'D12',
       'D13',
       ...
       'id_23', 'id_24', 'id_25', 'id_26', 'id_30', 'id_31', 'id_32', 'id_33',
       'id_34', 'DeviceInfo'],
      dtype='object', length=200)

In [8]:
# Fonction grandement inspirée de https://www.kaggle.com/code/davidcairuz/feature-engineering-lightgbm
def id_split(dataframe):
    dataframe['device_name'] = dataframe['DeviceInfo'].str.split('/', expand=True)[0].str.split(' ', expand=True)[0]
    dataframe.loc[dataframe['device_name'].str.contains('SM', na=False), 'device_name'] = 'Samsung'
    dataframe.loc[dataframe['device_name'].str.contains('SAMSUNG', na=False), 'device_name'] = 'Samsung'
    dataframe.loc[dataframe['device_name'].str.contains('GT-', na=False), 'device_name'] = 'Samsung'
    dataframe.loc[dataframe['device_name'].str.contains('Moto G', na=False), 'device_name'] = 'Motorola'
    dataframe.loc[dataframe['device_name'].str.contains('Moto', na=False), 'device_name'] = 'Motorola'
    dataframe.loc[dataframe['device_name'].str.contains('moto', na=False), 'device_name'] = 'Motorola'
    dataframe.loc[dataframe['device_name'].str.contains('LG-', na=False), 'device_name'] = 'LG'
    dataframe.loc[dataframe['device_name'].str.contains('rv:', na=False), 'device_name'] = 'RV'
    dataframe.loc[dataframe['device_name'].str.contains('HUAWEI', na=False), 'device_name'] = 'Huawei'
    dataframe.loc[dataframe['device_name'].str.contains('ALE-', na=False), 'device_name'] = 'Huawei'
    dataframe.loc[dataframe['device_name'].str.contains('-L', na=False), 'device_name'] = 'Huawei'
    dataframe.loc[dataframe['device_name'].str.contains('Blade', na=False), 'device_name'] = 'ZTE'
    dataframe.loc[dataframe['device_name'].str.contains('BLADE', na=False), 'device_name'] = 'ZTE'
    dataframe.loc[dataframe['device_name'].str.contains('Linux', na=False), 'device_name'] = 'Linux'
    dataframe.loc[dataframe['device_name'].str.contains('XT', na=False), 'device_name'] = 'Sony'
    dataframe.loc[dataframe['device_name'].str.contains('HTC', na=False), 'device_name'] = 'HTC'
    dataframe.loc[dataframe['device_name'].str.contains('ASUS', na=False), 'device_name'] = 'Asus'

    dataframe.loc[dataframe.device_name.isin(dataframe.device_name.value_counts()[dataframe.device_name.value_counts() < 200].index), 'device_name'] = "Others"
    
    return dataframe

In [9]:
# Conversion de la colonne "DeviceInfo" vers le nouvel attribut "device_name"
df = id_split(df)
df["device_name"] = df["device_name"].fillna("NoDevice")
# Retrait des colonnes "DeviceInfo" et "DeviceType" qui deviennent redondantes
df = df.drop(["DeviceInfo", "DeviceType"], axis=1, errors="ignore")

In [10]:
def mean_target_encoding(dataframe, column, na_value):
    dataframe[column] = dataframe[column].fillna(na_value)
    mean_enc = dataframe.groupby(column)["isFraud"].mean()
    dataframe.loc[:, column] = dataframe.loc[:, column].map(mean_enc)
    return dataframe

In [11]:
cols = ["device_name", "P_emaildomain", "R_emaildomain", "id_31"]
na_vals = ["NoDevice", "NoEmail", "NoEmail", "NoBrowser"]
for c, v in zip(cols, na_vals):
    df = mean_target_encoding(df, c, v)
    assert df[c].isna().sum() == 0, "La colonne '%s' contient toujours des valeurs manquantes" % c

In [12]:
cols_above_thresh = df.columns[(df.isna().sum() / len(df)) >= 0.5]
cols_above_thresh

Index(['dist1', 'dist2', 'D5', 'D6', 'D7', 'D8', 'D9', 'D12', 'D13', 'D14',
       ...
       'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_30', 'id_32',
       'id_33', 'id_34'],
      dtype='object', length=197)

In [13]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler

# Réordonner les colonnes
df = df[list(set(df.columns) - {"isFraud"}) + ["isFraud"]]

# Affichage des colonnes dont le ratio de valeurs manquantes représente moins de 50% des données
# (et qui contiennent au moins une valeur manquante)
cols_below_thresh = df.columns[(df.isna().sum() / len(df)) < 0.5]
cols_below_thresh = df[cols_below_thresh].columns[df[cols_below_thresh].isna().sum() > 0]

# Suppression des attributs dont le nombre de valeurs manquantes dépasse 50% des données originales
df = df.drop(cols_above_thresh, axis=1)
df = df.drop("TransactionID", axis=1)

# Attributs catégoriques
nan_cols = df.select_dtypes(exclude=np.number).columns
df[nan_cols] = df[nan_cols].fillna(df[nan_cols].mode())
# Pour une raison quelconque, on doit manuellement modifier la colonne M4.
df["M4"] = df["M4"].fillna("M0")
df["card4"] = df["card4"].fillna("other")
df["card6"] = df["card6"].fillna("other")

# Attributs numériques
num_cols = list(set(df.select_dtypes(include=np.number).columns) - {"isFraud"})

# Stratégie 1: On remplace les valeurs manquantes des autres attributs en utilisant l'algorithme KNN
imputer = KNNImputer(n_neighbors=30)
full_data = imputer.fit_transform(df.values)
knn_df = df.copy()
knn_df[num_cols] = full_data

# Stratégie 2: On remplace les valeurs manquantes par la moyenne de leur attribut respectif
df[num_cols] = df[num_cols].fillna(df[num_cols].mean())

# Vérifications
assert df.isna().sum().sum() == 0, "there are still columns with missing values (mean df)"
assert knn_df.isna().sum().sum() == 0, "there are still columns with missing values (knn df)"

# Normalisation des attributs numériques

scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols].values)
knn_df[num_cols] = scaler.fit_transform(knn_df[num_cols].values)

# Conversion des attributs catégoriques en attributs binaires
n_cols_before = len(df.columns)
df = pd.get_dummies(df)
n_cols_after = len(df.columns)
print("Number of columns before one-hot encoding: {}".format(n_cols_before))
print("Number of columns after one-hot encoding: {}".format(n_cols_after))
print("Difference: {}".format(n_cols_after - n_cols_before))

# Sauvegarde des données
df.to_csv("./data/ieee-fraud-detection-v1.csv", index=False)
knn_df.to_csv("./data/ieee-fraud-detection-vknn.csv", index=False)

Number of columns before one-hot encoding: 211
Number of columns after one-hot encoding: 225
Difference: 14


In [20]:
df = df[list(set(df.columns) - {"isFraud"}) + ["isFraud"]]
df.to_csv("./data/ieee-fraud-detection-v1.csv", index=False)