In [3]:
# Install necessary packages
!pip install joblib

# Importing the necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

# Load dataset from local file
data = pd.read_csv('data/insurance_claims.csv')

# Preprocess the data
data = data.drop(['policy_number', 'policy_bind_date', 'incident_date', 'insured_zip'], axis=1)
data['collision_type'].fillna('NA', inplace=True)
data['property_damage'].fillna('NA', inplace=True)
data['police_report_available'].fillna('NA', inplace=True)

# Encoding categorical variables
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']).columns:
    data[column] = label_encoder.fit_transform(data[column])

# Scaling numerical features
scaler = StandardScaler()
numerical_columns = data.select_dtypes(include=['int64', 'float64']).columns
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# Splitting the data into train an test split
X = data.drop('fraud_reported', axis=1)
y = data['fraud_reported']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training and evaluate models
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Isolation Forest': IsolationForest(contamination=0.1)
}

results = {}
for model_name, model in models.items():
    if model_name == 'Isolation Forest':
        model.fit(X_train)
        y_pred = model.predict(X_test)
        y_pred = [1 if x == -1 else 0 for x in y_pred]
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
    
    results[model_name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred)
    }

results_df = pd.DataFrame(results).T
print(results_df)

# We are performing Hyperparameter tuning for Random Forest
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30]
}

grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=3)
grid_search.fit(X_train, y_train)
best_rf = grid_search.best_estimator_

# Evaluate tuned model
y_pred = best_rf.predict(X_test)
print(f"Tuned Random Forest Precision: {precision_score(y_test, y_pred)}")
print(f"Tuned Random Forest Recall: {recall_score(y_test, y_pred)}")
print(f"Tuned Random Forest F1 Score: {f1_score(y_test, y_pred)}")

# Saving the best model in pkl file
joblib.dump(best_rf, 'models/fraud_model.pkl')





  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values