In [21]:
import pandas as pd
import dask.dataframe as dd

# Path to your CSV file
file_path = r'C:\Users\ahmad\PycharmProjects\big-data-fraud-detection\ieee-fraud-detection\train_transaction.csv'


# Use pandas to read in a small portion of the file to infer types
sample = pd.read_csv(file_path, nrows=100)

# Get the inferred data types
dtype_dict = sample.dtypes.apply(lambda x: x.name).to_dict()

# Now read the full dataset with Dask using these types
ddf = dd.read_csv(file_path, dtype=dtype_dict)

# Display the first few rows to ensure it loads correctly
print(ddf.head())

   TransactionID  isFraud  TransactionDT  TransactionAmt ProductCD  card1  \
0        2987000        0          86400            68.5         W  13926   
1        2987001        0          86401            29.0         W   2755   
2        2987002        0          86469            59.0         W   4663   
3        2987003        0          86499            50.0         W  18132   
4        2987004        0          86506            50.0         H   4497   

   card2  card3       card4  card5  ... V330  V331  V332  V333  V334 V335  \
0    NaN  150.0    discover  142.0  ...  NaN   NaN   NaN   NaN   NaN  NaN   
1  404.0  150.0  mastercard  102.0  ...  NaN   NaN   NaN   NaN   NaN  NaN   
2  490.0  150.0        visa  166.0  ...  NaN   NaN   NaN   NaN   NaN  NaN   
3  567.0  150.0  mastercard  117.0  ...  NaN   NaN   NaN   NaN   NaN  NaN   
4  514.0  150.0  mastercard  102.0  ...  0.0   0.0   0.0   0.0   0.0  0.0   

  V336  V337  V338  V339  
0  NaN   NaN   NaN   NaN  
1  NaN   NaN   NaN  

In [1]:

from dask.distributed import Client

client = Client(n_workers=2, threads_per_worker=2)

In [24]:
info = ddf.info()
print(info)

<class 'dask_expr.DataFrame'>
Columns: 394 entries, TransactionID to V339
dtypes: float64(376), int64(4), string(14)None


In [25]:

# Calculate the count of nulls for each column
null_counts = ddf.isnull().sum().compute()

# Calculate the total number of entries in the DataFrame
total_entries = len(ddf)

# Set a threshold for missing values (e.g., columns with more than 50% missing values)
threshold_percentage = 50
threshold = total_entries * (threshold_percentage / 100)

# Identify columns that have missing values above the threshold
columns_to_drop = null_counts[null_counts > threshold].index
print(columns_to_drop)


Index(['dist1', 'dist2', 'R_emaildomain', 'D5', 'D6', 'D7', 'D8', 'D9', 'D12',
       'D13',
       ...
       'V330', 'V331', 'V332', 'V333', 'V334', 'V335', 'V336', 'V337', 'V338',
       'V339'],
      dtype='object', length=174)


In [26]:
# Drop these columns from the DataFrame
ddf = ddf.drop(columns=columns_to_drop)  # Drop columns and reassign

In [27]:
ddf=ddf.persist()

In [28]:
# Identify numerical and categorical columns
numerical_cols = ddf.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = [col for col in ddf.columns if str(ddf.dtypes[col]) in ['object', 'category', 'string', 'string[pyarrow]']]

for col in categorical_cols:
    ddf[col] = ddf[col].astype('string')
print(f"Columns numerical_cols: {len(numerical_cols)}")
print(f"Columns categorical_cols: {len(categorical_cols)}")

Columns numerical_cols: 211
Columns categorical_cols: 9


In [29]:
# Fill missing values for numerical columns with the mean of each column
for col in numerical_cols:
    # Compute the mean of the column (use .compute() to execute the calculation)
    mean_value = ddf[col].mean().compute()
    ddf[col] = ddf[col].fillna(mean_value)

# Fill missing values for categorical columns with the mode of each column
for col in categorical_cols:
    # Compute the mode. Note: Dask's mode computation might be less straightforward than pandas,
    # often requiring a workaround to get the first mode value because Dask might return multiple modes.
    mode_value = ddf[col].dropna().value_counts().idxmax().compute()
    ddf[col] = ddf[col].fillna(mode_value)

In [30]:
ddf=ddf.persist()

In [32]:
print(ddf.isnull().any().any().compute())


False


In [34]:
# Repartition to a single partition
ddf = ddf.repartition(npartitions=1)

# Save to CSV
ddf.to_csv('cleaned_transactions.csv', single_file=True, index=False)

['C:\\Users\\ahmad\\PycharmProjects\\big-data-fraud-detection\\cleaned_transactions.csv']

In [1]:

from dask.distributed import Client

client = Client(n_workers=2, threads_per_worker=2)

In [2]:
import pandas as pd
import dask.dataframe as dd
ddf=dd.read_csv('cleaned_transactions.csv')

In [3]:
# Identify numerical and categorical columns
numerical_cols = ddf.select_dtypes(include=['int64', 'float64']).columns

categorical_cols = [col for col in ddf.columns if str(ddf.dtypes[col]) in ['object', 'category', 'string', 'string[pyarrow]']]


In [4]:
for col in categorical_cols:
    ddf[col] = ddf[col].astype('string')
print(f"Columns numerical_cols: {numerical_cols}")
print(f"Columns categorical_cols: {categorical_cols}")

Columns numerical_cols: Index(['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt', 'card1',
       'card2', 'card3', 'card5', 'addr1', 'addr2',
       ...
       'V312', 'V313', 'V314', 'V315', 'V316', 'V317', 'V318', 'V319', 'V320',
       'V321'],
      dtype='object', length=211)
Columns categorical_cols: ['ProductCD', 'card4', 'card6', 'P_emaildomain', 'M1', 'M2', 'M3', 'M4', 'M6']


In [5]:

# Using Dask to perform one-hot encoding
ddf = ddf.categorize(columns=categorical_cols)  # Converts columns to categorical dtype
ddf = dd.get_dummies(ddf, columns=categorical_cols)  # Perform one-hot encoding


In [6]:
transactionIDs=ddf['TransactionID']

In [7]:
from dask_ml.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

def scale_partition(partition):
    # Scale only the numerical columns
    partition[numerical_cols] = scaler.fit_transform(partition[numerical_cols])
    return partition

# Apply the scaler to each partition
ddf = ddf.map_partitions(scale_partition)

#persist the DataFrame to optimize further computations
ddf = ddf.persist()


In [9]:
print(ddf.head())

   TransactionID  isFraud  TransactionDT  TransactionAmt     card1     card2  \
0       0.000000      0.0   0.000000e+00        0.014123  0.743158  0.525111   
1       0.000022      0.0   9.840069e-07        0.005944  0.100851  0.608000   
2       0.000044      0.0   6.789648e-05        0.012156  0.210557  0.780000   
3       0.000065      0.0   9.741669e-05        0.010292  0.984993  0.934000   
4       0.000087      0.0   1.043047e-04        0.010292  0.201012  0.828000   

      card3     card5     addr1     addr2  ...  M1_T   M2_F  M2_T   M3_F  \
0  0.387597  0.306569  0.488636  0.831461  ...  True  False  True  False   
1  0.387597  0.014599  0.511364  0.831461  ...  True  False  True  False   
2  0.387597  0.481752  0.522727  0.831461  ...  True  False  True  False   
3  0.387597  0.124088  0.854545  0.831461  ...  True  False  True  False   
4  0.387597  0.014599  0.727273  0.831461  ...  True  False  True  False   

   M3_T  M4_M0  M4_M1  M4_M2   M6_F   M6_T  
0  True  False  F

In [10]:
import dask.dataframe as dd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from dask_ml.model_selection import train_test_split


X = ddf.drop(['isFraud', 'TransactionID'], axis=1)
y = ddf['isFraud']

# Split data using dask_ml's train_test_split which handles Dask DataFrames
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

# Compute the Dask DataFrames to pandas DataFrames and create a copy
X_train_computed = X_train.compute().copy()
y_train_computed = y_train.compute().copy()

X_test_computed=X_test.compute().copy()
y_test_computed=y_test.compute().copy()

from sklearn.ensemble import RandomForestClassifier

# Initialize the RandomForestClassifier
model = RandomForestClassifier()

model.fit(X_train_computed, y_train_computed)

# Make predictions
y_pred = model.predict(X_test_computed)

# Evaluate the model
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

print("Accuracy:", accuracy_score(y_test_computed, y_pred))
print("Recall:", recall_score(y_test_computed, y_pred))
print("Precision:", precision_score(y_test_computed, y_pred))
print("F1 Score:", f1_score(y_test_computed, y_pred))


Accuracy: 0.98
Recall: 0.45396825396825397
Precision: 0.9426977687626775
F1 Score: 0.6128234712378441


In [11]:
# Get feature importances
importances = model.feature_importances_

# Create a DataFrame to view the importances more easily
import pandas as pd

features = X_train.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print(importance_df.head(20))

            Feature  Importance
1    TransactionAmt    0.038369
0     TransactionDT    0.037728
8                C1    0.037542
2             card1    0.032813
3             card2    0.026406
20              C13    0.025772
21              C14    0.024442
6             addr1    0.021865
9                C2    0.019567
18              C11    0.019169
11               C4    0.017112
13               C6    0.016210
15               C8    0.014424
5             card5    0.013847
73              V45    0.013162
115             V87    0.012994
114             V86    0.010787
19              C12    0.010691
28              D15    0.010215
23               D2    0.009746


In [13]:
# Select the top N most important features, for example top 50
top_features = importance_df['Feature'][:50]
print(top_features.values)

['TransactionAmt' 'TransactionDT' 'C1' 'card1' 'card2' 'C13' 'C14' 'addr1'
 'C2' 'C11' 'C4' 'C6' 'C8' 'card5' 'V45' 'V87' 'V86' 'C12' 'D15' 'D2'
 'C10' 'D1' 'D4' 'V44' 'V307' 'P_emaildomain_gmail.com' 'D10' 'V283'
 'card3' 'C7' 'V310' 'V308' 'D11' 'V317' 'D3' 'V23' 'V282' 'C9' 'V315'
 'V38' 'V127' 'V24' 'V285' 'V314' 'V313' 'card6_debit' 'V62' 'V78' 'V312'
 'V83']
