In [1]:
print("Big Data Project 6")

Big Data Project 6


In [2]:
import pandas as pd
import dask.dataframe as dd
from dask_ml.preprocessing import DummyEncoder, StandardScaler
from dask_ml.impute import SimpleImputer
from dask_ml.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from dask_ml.wrappers import ParallelPostFit
from sklearn.metrics import classification_report
uri="mongodb+srv://admin:admin@cluster0.3og2uv4.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"
db_name='fraud_db'
collection_name='transactions'
from dask.distributed import Client

client = Client()  # Starts a local scheduler and worker if no arguments are provided
print(client)

# Define the columns to load and their data types
selected_columns = ['TransactionID', 'TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2', 
                    'card3', 'card4', 'card5', 'card6', 'dist1', 'dist2']

dtypes = {
    'TransactionID': 'int64',
    'TransactionDT': 'int64',
    'TransactionAmt': 'float64',
    'ProductCD': 'object',
    'card1': 'int64',
    'card2': 'float64',
    'card3': 'float64',
    'card4': 'object',
    'card5': 'float64',
    'card6': 'object',
    'dist1': 'float64',
    'dist2': 'float64'
}


# Load dataset
df = dd.read_csv('train_transaction.csv', usecols=selected_columns + ['isFraud'], dtype=dtypes).set_index('TransactionID')

# Identify numeric and categorical columns
numeric_columns = ['TransactionDT', 'TransactionAmt', 'card1', 'card2', 'card3', 'card5', 'dist1', 'dist2']
categorical_columns = ['ProductCD', 'card4', 'card6']

# Take a small sample to fit the imputer and scaler
sample = df.sample(frac=0.1, random_state=42).compute()

# Fit the imputers
imputer_numeric = SimpleImputer(strategy='mean').fit(sample[numeric_columns])
imputer_categorical = SimpleImputer(strategy='most_frequent').fit(sample[categorical_columns])

# Fit the scaler
scaler = StandardScaler().fit(sample[numeric_columns])

# Handle missing values for numeric columns in parallel
df[numeric_columns] = df[numeric_columns].map_partitions(lambda df: pd.DataFrame(imputer_numeric.transform(df), columns=df.columns), meta=df[numeric_columns]._meta)

# Handle missing values for categorical columns in parallel
df[categorical_columns] = df[categorical_columns].map_partitions(lambda df: pd.DataFrame(imputer_categorical.transform(df), columns=df.columns), meta=df[categorical_columns]._meta)

# Convert categorical columns to categorical dtype
df[categorical_columns] = df[categorical_columns].categorize()

# Encode categorical variables
encoder = DummyEncoder()
df = encoder.fit_transform(df)

# Scale numeric features in parallel
df[numeric_columns] = df[numeric_columns].map_partitions(lambda df: pd.DataFrame(scaler.transform(df), columns=df.columns), meta=df[numeric_columns]._meta)


<Client: 'tcp://127.0.0.1:54186' processes=4 threads=8, memory=15.69 GiB>


In [3]:
print(df.columns)

Index(['isFraud', 'TransactionDT', 'TransactionAmt', 'card1', 'card2', 'card3',
       'card5', 'dist1', 'dist2', 'ProductCD_C', 'ProductCD_H', 'ProductCD_R',
       'ProductCD_S', 'ProductCD_W', 'card4_american express',
       'card4_discover', 'card4_mastercard', 'card4_visa', 'card6_charge card',
       'card6_credit', 'card6_debit', 'card6_debit or credit'],
      dtype='object')


In [4]:
from dask.distributed import get_worker
from pymongo import MongoClient

def get_mongo_client():
    worker = get_worker()  # This raises ValueError if not running within a worker context
    if not hasattr(worker, 'mongo_client'):
        connection_string = uri
        worker.mongo_client = MongoClient(connection_string)
    return worker.mongo_client

def insert_into_mongo(df_part):
    try:
        client = get_mongo_client()
        db = client.fraud_db
        transactions_collection = db.transactions
        records = df_part.to_dict(orient='records')
        if records:
            transactions_collection.insert_many(records)
    except ValueError:
        print("Not running on a worker. Proper MongoDB operations can't be performed.")

def load_data_from_mongo(uri, db_name, collection_name):
    client = MongoClient(uri)
    db = client[db_name]
    collection = db[collection_name]
    cursor = collection.find({})

    # Convert cursor to DataFrame directly if memory allows
    return pd.DataFrame(list(cursor))


In [19]:
df.map_partitions(insert_into_mongo, meta=int).compute()




0    None
1    None
2    None
3    None
4    None
5    None
6    None
7    None
8    None
9    None
dtype: object

In [20]:
#Fetch Data from MongoDB
#from dask.delayed import delayed
#chunks = load_data_from_mongo(uri, db_name, collection_name, chunk_size=10000)

# Create Dask DataFrame from delayed chunks
#ddf = dd.from_delayed([delayed(pd.DataFrame)(chunk) for chunk in chunks])

In [5]:
# Convert to Dask array
X = df.drop('isFraud', axis=1).to_dask_array(lengths=True)
y = df['isFraud'].to_dask_array(lengths=True)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

# Initialize the model
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Use ParallelPostFit to work with Dask
model = ParallelPostFit(estimator=rf)

# Fit the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Compute the classification report
report = classification_report(y_test.compute(), y_pred.compute())

print(report)

This may cause some slowdown.
Consider scattering data ahead of time and using futures.


              precision    recall  f1-score   support

           0       0.98      1.00      0.99    113931
           1       0.85      0.45      0.58      4182

    accuracy                           0.98    118113
   macro avg       0.91      0.72      0.79    118113
weighted avg       0.98      0.98      0.97    118113



In [12]:
#Predict isFraud on test dataset

import dask.dataframe as dd

# Load the test dataset
test_df = dd.read_csv('test_transaction.csv', usecols=['TransactionID'] + selected_columns[1:], dtype=dtypes).set_index('TransactionID')

print(test_df.columns)

Index(['TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2',
       'card3', 'card4', 'card5', 'card6', 'dist1', 'dist2'],
      dtype='object')


In [13]:
# Identify numeric and categorical columns
numeric_columns_test = ['TransactionDT', 'TransactionAmt', 'card1', 'card2', 'card3', 'card5', 'dist1', 'dist2']
categorical_columns_test = ['ProductCD', 'card4', 'card6']

# Handle missing values for numeric columns
test_df[numeric_columns_test] = test_df[numeric_columns_test].map_partitions(lambda df: imputer_numeric.fit_transform(df), meta=test_df[numeric_columns_test]._meta)

# Handle missing values for categorical columns
test_df[categorical_columns_test] = test_df[categorical_columns_test].map_partitions(lambda df: imputer_categorical.fit_transform(df), meta=test_df[categorical_columns_test]._meta)

# Convert categorical columns to categorical dtype
test_df[categorical_columns_test] = test_df[categorical_columns_test].categorize()

# Encode categorical variables
encoder_test = DummyEncoder()
test_df = encoder_test.fit_transform(test_df)

# Take a small sample to fit the scaler
sample_test = test_df.sample(frac=0.1, random_state=42).compute()

# Fit the scaler on the sample
scaler_test = StandardScaler().fit(sample_test[numeric_columns_test])

# Scale numeric features in parallel
test_df[numeric_columns_test] = test_df[numeric_columns_test].map_partitions(lambda df: pd.DataFrame(scaler_test.transform(df), columns=df.columns), meta=test_df[numeric_columns_test]._meta)

# Ensure the missing 'card6_debit or credit' column is added with zeros
if 'card6_debit or credit' not in test_df.columns:
    test_df['card6_debit or credit'] = 0


In [14]:
print(test_df.columns)

Index(['TransactionDT', 'TransactionAmt', 'card1', 'card2', 'card3', 'card5',
       'dist1', 'dist2', 'ProductCD_C', 'ProductCD_H', 'ProductCD_R',
       'ProductCD_S', 'ProductCD_W', 'card4_american express',
       'card4_discover', 'card4_mastercard', 'card4_visa', 'card6_charge card',
       'card6_credit', 'card6_debit', 'card6_debit or credit'],
      dtype='object')


In [15]:
# Predict isFraud on the test dataset
test_predictions = model.predict(test_df)

# Define the path to save the new submission file
submission_file_path = r'C:\Users\SvenEggers\.kaggle\sample_submission_1.csv' #REPLACE WITH YOUR WORKING TRAJECTORY

# Load the sample submission file to get the TransactionID
submission_df = dd.read_csv('sample_submission.csv').set_index('TransactionID')

# Convert predictions to a Dask DataFrame and align with the submission DataFrame
predictions_df = dd.from_pandas(pd.DataFrame({
    'TransactionID': test_df.index.compute(),
    'isFraud': test_predictions.compute()
}), npartitions=1).set_index('TransactionID')

# Merge the predictions with the submission file
submission_df = submission_df.drop(columns='isFraud', errors='ignore')  # Drop existing isFraud if present
submission_df = submission_df.merge(predictions_df, left_index=True, right_index=True, how='left')

# Save the updated submission file, replacing if it exists
submission_df.to_csv(submission_file_path, single_file=True)

print(f'Submission file saved to {submission_file_path}')

This may cause some slowdown.
Consider scattering data ahead of time and using futures.


Submission file saved to C:\Users\SvenEggers\.kaggle\sample_submission_1.csv


In [17]:
# Count the numbers of isFraud in the submission

# Load the sample submission dataset
sample_submission_df = pd.read_csv('sample_submission_1.csv')

# Check the first few rows to ensure it loaded correctly
print(sample_submission_df.head())

# Count the number of 1s in the 'isFraud' column
is_fraud_count = sample_submission_df['isFraud'].sum()

print(f"The number of 1s in the 'isFraud' column: {is_fraud_count}")

   TransactionID  isFraud
0        3663549        0
1        3663550        0
2        3663551        0
3        3663552        0
4        3663553        0
The number of 1s in the 'isFraud' column: 1148
