# **Building Risk Analysis Model**


## **Mount Google Drive**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Load Data**

In [2]:
import pandas as pd

data = pd.read_csv('/content/drive/My Drive/data.csv')
data

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95657,TransactionId_89881,BatchId_96668,AccountId_4841,SubscriptionId_3829,CustomerId_3078,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-1000.0,1000,2019-02-13T09:54:09Z,2,0
95658,TransactionId_91597,BatchId_3503,AccountId_3439,SubscriptionId_2643,CustomerId_3874,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2019-02-13T09:54:25Z,2,0
95659,TransactionId_82501,BatchId_118602,AccountId_4841,SubscriptionId_3829,CustomerId_3874,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2019-02-13T09:54:35Z,2,0
95660,TransactionId_136354,BatchId_70924,AccountId_1346,SubscriptionId_652,CustomerId_1709,UGX,256,ProviderId_6,ProductId_19,tv,ChannelId_3,3000.0,3000,2019-02-13T10:01:10Z,2,0


# **Data Preparation**

## **Handle missing values**

In [3]:
missing_data = data.isnull().sum()
print(missing_data)

TransactionId           0
BatchId                 0
AccountId               0
SubscriptionId          0
CustomerId              0
CurrencyCode            0
CountryCode             0
ProviderId              0
ProductId               0
ProductCategory         0
ChannelId               0
Amount                  0
Value                   0
TransactionStartTime    0
PricingStrategy         0
FraudResult             0
dtype: int64


## **Converting the date to DateTime format**

In [4]:
# Convert TransactionStartTime to datetime format
data['TransactionStartTime'] = pd.to_datetime(data['TransactionStartTime'])

# **Model Selection and Training**

## **Split the Data**

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Prepare the data
X = data[['Amount', 'Value', 'PricingStrategy', 'FraudResult']]
y = data['FraudResult']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## **Choosing Models for Training**

By Choosing Logistic Regression and Random Forest as our models for this analysis

## **Train the Models**

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Train the models
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)

    print(f'{name} Performance:')
    print(f'Accuracy: {accuracy:.2f}')
    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1 Score: {f1:.2f}')
    print(f'ROC-AUC: {roc_auc:.2f}')
    print()

Logistic Regression Performance:
Accuracy: 1.00
Precision: 0.82
Recall: 0.25
F1 Score: 0.38
ROC-AUC: 0.62

Decision Tree Performance:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
ROC-AUC: 1.00

Random Forest Performance:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
ROC-AUC: 1.00

Gradient Boosting Performance:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
ROC-AUC: 1.00



## **Hyperparameter Tuning**

In [7]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameters for Logistic Regression
log_reg_params = {
    'C': [0.01, 0.1, 1, 10, 100]
}

# Define hyperparameters for Random Forest
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30]
}

# Grid Search for Logistic Regression
log_reg_grid = GridSearchCV(LogisticRegression(), log_reg_params, cv=5, scoring='accuracy')
log_reg_grid.fit(X_train, y_train)

# Grid Search for Random Forest
rf_grid = GridSearchCV(RandomForestClassifier(), rf_params, cv=5, scoring='accuracy')
rf_grid.fit(X_train, y_train)

# Best models
best_log_reg = log_reg_grid.best_estimator_
best_random_forest = rf_grid.best_estimator_

## **Model Evaluation**

In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Predictions
log_reg_preds = best_log_reg.predict(X_test)
rf_preds = best_random_forest.predict(X_test)

# Evaluation for Logistic Regression
log_reg_accuracy = accuracy_score(y_test, log_reg_preds)
log_reg_precision = precision_score(y_test, log_reg_preds)
log_reg_recall = recall_score(y_test, log_reg_preds)
log_reg_f1 = f1_score(y_test, log_reg_preds)
log_reg_roc_auc = roc_auc_score(y_test, best_log_reg.predict_proba(X_test)[:, 1])

# Evaluation for Random Forest
rf_accuracy = accuracy_score(y_test, rf_preds)
rf_precision = precision_score(y_test, rf_preds)
rf_recall = recall_score(y_test, rf_preds)
rf_f1 = f1_score(y_test, rf_preds)
rf_roc_auc = roc_auc_score(y_test, best_random_forest.predict_proba(X_test)[:, 1])

# Print the results
print(f"Logistic Regression - Accuracy: {log_reg_accuracy}, Precision: {log_reg_precision}, Recall: {log_reg_recall}, F1 Score: {log_reg_f1}, ROC-AUC: {log_reg_roc_auc}")
print(f"Random Forest - Accuracy: {rf_accuracy}, Precision: {rf_precision}, Recall: {rf_recall}, F1 Score: {rf_f1}, ROC-AUC: {rf_roc_auc}")

Logistic Regression - Accuracy: 0.998484294151466, Precision: 0.8181818181818182, Recall: 0.25, F1 Score: 0.38297872340425526, ROC-AUC: 0.9190928767171108
Random Forest - Accuracy: 1.0, Precision: 1.0, Recall: 1.0, F1 Score: 1.0, ROC-AUC: 1.0


In [11]:
# Create a DataFrame to display the results
results = {
    "Metric": ["Accuracy", "Precision", "Recall", "F1 Score", "ROC-AUC"],
    "Logistic Regression": [
        log_reg_accuracy, log_reg_precision, log_reg_recall, log_reg_f1, log_reg_roc_auc
    ],
    "Random Forest": [
        rf_accuracy, rf_precision, rf_recall, rf_f1, rf_roc_auc
    ]
}

results_df = pd.DataFrame(results)

# Display the DataFrame
print(results_df)

      Metric  Logistic Regression  Random Forest
0   Accuracy             0.998484            1.0
1  Precision             0.818182            1.0
2     Recall             0.250000            1.0
3   F1 Score             0.382979            1.0
4    ROC-AUC             0.919093            1.0
