In [None]:
print("Big Data Project 6")

In [46]:
import pandas as pd
import dask.dataframe as dd
from dask_ml.preprocessing import DummyEncoder, StandardScaler
from dask_ml.impute import SimpleImputer
from dask_ml.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from dask_ml.wrappers import ParallelPostFit
from sklearn.metrics import classification_report

# Define the columns to load and their data types
selected_columns = ['TransactionID', 'TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2', 
                    'card3', 'card4', 'card5', 'card6', 'dist1', 'dist2']

dtypes = {
    'TransactionID': 'int64',
    'TransactionDT': 'int64',
    'TransactionAmt': 'float64',
    'ProductCD': 'object',
    'card1': 'int64',
    'card2': 'float64',
    'card3': 'float64',
    'card4': 'object',
    'card5': 'float64',
    'card6': 'object',
    'dist1': 'float64',
    'dist2': 'float64'
}


# Load dataset
df = dd.read_csv(r'C:\Users\SvenEggers\.kaggle\train_transaction.csv', usecols=selected_columns + ['isFraud'], dtype=dtypes).set_index('TransactionID')

# Identify numeric and categorical columns
numeric_columns = ['TransactionDT', 'TransactionAmt', 'card1', 'card2', 'card3', 'card5', 'dist1', 'dist2']
categorical_columns = ['ProductCD', 'card4', 'card6']

# Handle missing values for numeric columns
imputer_numeric = SimpleImputer(strategy='mean')
df[numeric_columns] = imputer_numeric.fit_transform(df[numeric_columns])

# Handle missing values for categorical columns
imputer_categorical = SimpleImputer(strategy='most_frequent')
df[categorical_columns] = imputer_categorical.fit_transform(df[categorical_columns])

# Convert categorical columns to categorical dtype
df[categorical_columns] = df[categorical_columns].categorize()

# Encode categorical variables
encoder = DummyEncoder()
df = encoder.fit_transform(df)

# Scale numeric features
scaler = StandardScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])


In [47]:
print(df.columns)

Index(['isFraud', 'TransactionDT', 'TransactionAmt', 'card1', 'card2', 'card3',
       'card5', 'dist1', 'dist2', 'ProductCD_C', 'ProductCD_H', 'ProductCD_R',
       'ProductCD_S', 'ProductCD_W', 'card4_american express',
       'card4_discover', 'card4_mastercard', 'card4_visa', 'card6_charge card',
       'card6_credit', 'card6_debit', 'card6_debit or credit'],
      dtype='object')


In [48]:
X = df.drop('isFraud', axis=1)
y = df['isFraud']

# Convert to Dask array
X = X.to_dask_array(lengths=True)
y = y.to_dask_array(lengths=True)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

# Initialize the model
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Use ParallelPostFit to work with Dask
model = ParallelPostFit(estimator=rf)

# Fit the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Compute the classification report
report = classification_report(y_test.compute(), y_pred.compute())

print(report)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99    113931
           1       0.84      0.45      0.58      4182

    accuracy                           0.98    118113
   macro avg       0.91      0.72      0.79    118113
weighted avg       0.98      0.98      0.97    118113



In [35]:
print(X.columns)

AttributeError: 'Array' object has no attribute 'columns'

In [49]:
import dask.dataframe as dd

# Define the path to the test dataset
test_dataset_path = r'C:\Users\SvenEggers\.kaggle\test_transaction.csv'

# Load the test dataset
test_df = dd.read_csv(test_dataset_path, usecols=['TransactionID'] + selected_columns[1:], dtype=dtypes).set_index('TransactionID')

print(test_df.columns)

Index(['TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2',
       'card3', 'card4', 'card5', 'card6', 'dist1', 'dist2'],
      dtype='object')


In [50]:
# Identify numeric and categorical columns
numeric_columns_test = ['TransactionDT', 'TransactionAmt', 'card1', 'card2', 'card3', 'card5', 'dist1', 'dist2']
categorical_columns_test = ['ProductCD', 'card4', 'card6']

# Handle missing values for numeric columns
imputer_numeric_test = SimpleImputer(strategy='mean')
test_df[numeric_columns_test] = imputer_numeric_test.fit_transform(test_df[numeric_columns_test])

# Handle missing values for categorical columns
imputer_categorical_test = SimpleImputer(strategy='most_frequent')
test_df[categorical_columns_test] = imputer_categorical_test.fit_transform(test_df[categorical_columns_test])

# Convert categorical columns to categorical dtype
test_df[categorical_columns_test] = test_df[categorical_columns_test].categorize()

# Encode categorical variables
encoder_test = DummyEncoder()
test_df = encoder_test.fit_transform(test_df)

# Scale numeric features
scaler_test = StandardScaler()
test_df[numeric_columns_test] = scaler_test.fit_transform(test_df[numeric_columns_test])

# Ensure the missing 'card6_debit or credit' column is added with zeros
if 'card6_debit or credit' not in test_df.columns:
    test_df['card6_debit or credit'] = 0




In [51]:
print(test_df.columns)

Index(['TransactionDT', 'TransactionAmt', 'card1', 'card2', 'card3', 'card5',
       'dist1', 'dist2', 'ProductCD_C', 'ProductCD_H', 'ProductCD_R',
       'ProductCD_S', 'ProductCD_W', 'card4_american express',
       'card4_discover', 'card4_mastercard', 'card4_visa', 'card6_charge card',
       'card6_credit', 'card6_debit', 'card6_debit or credit'],
      dtype='object')


In [52]:
#------------------------------------------

# Predict isFraud on the test dataset
test_predictions = model.predict(test_df)


# Define the path to save the new submission file
submission_file_path = r'C:\Users\SvenEggers\.kaggle\sample_submission_1.csv'

# Load the sample submission file to get the TransactionID
submission_df = dd.read_csv(r'C:\Users\SvenEggers\.kaggle\sample_submission.csv')

# Ensure the submission_df is indexed by TransactionID to align with predictions
submission_df = submission_df.set_index('TransactionID')

# Convert predictions to a Dask DataFrame and align with the submission DataFrame
predictions_df = dd.from_pandas(pd.DataFrame({
    'TransactionID': test_df.index.compute(),
    'isFraud': test_predictions.compute()
}), npartitions=1).set_index('TransactionID')

# Merge the predictions with the submission file
submission_df = submission_df.drop(columns='isFraud', errors='ignore')  # Drop existing isFraud if present
submission_df = submission_df.merge(predictions_df, left_index=True, right_index=True, how='left')

# Save the updated submission file, replacing if it exists
submission_df.to_csv(submission_file_path, single_file=True)

print(f'Submission file saved to {submission_file_path}')



Submission file saved to C:\Users\SvenEggers\.kaggle\sample_submission_1.csv


In [53]:
# Count the numbers of isFraud in the submission

# Define the path to the sample submission dataset
sample_submission_path = r'C:\Users\SvenEggers\.kaggle\sample_submission_1.csv'

# Load the sample submission dataset
sample_submission_df = pd.read_csv(sample_submission_path)

# Check the first few rows to ensure it loaded correctly
print(sample_submission_df.head())

# Count the number of 1s in the 'isFraud' column
is_fraud_count = sample_submission_df['isFraud'].sum()

print(f"The number of 1s in the 'isFraud' column: {is_fraud_count}")

   TransactionID  isFraud
0        3663549        0
1        3663550        0
2        3663551        0
3        3663552        0
4        3663553        0
The number of 1s in the 'isFraud' column: 1517
