In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
file_path = "charity_request_dataset.csv"

try:
    df = pd.read_csv(file_path)
    print("Dataset loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found. Please check the path.")
    exit()

# Print the actual column names for verification
print("Column names in dataset:", df.columns.tolist())

# Clean column names (strip spaces, lowercase, replace spaces with underscores)
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
print("Updated column names:", df.columns.tolist())

# Verify if target column exists
target_column = "is_necessary"  # Adjust based on your dataset
if target_column not in df.columns:
    print(f"Error: Target column '{target_column}' not found. Available columns: {df.columns.tolist()}")
    exit()

# Encode categorical columns
label_encoders = {}
for col in df.select_dtypes(include=["object"]).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Features and target
X = df.drop(columns=[target_column])
y = df[target_column]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a simple classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Display classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

print("Model training and evaluation completed successfully!")



Dataset loaded successfully!
Column names in dataset: ['Institute Type', 'Requested Item', 'Quantity Requested', 'Historical Need', 'Emergency Level', 'Previous Fraud Flag', 'Supplier Availability', 'Required']
Updated column names: ['institute_type', 'requested_item', 'quantity_requested', 'historical_need', 'emergency_level', 'previous_fraud_flag', 'supplier_availability', 'required']
Error: Target column 'is_necessary' not found. Available columns: ['institute_type', 'requested_item', 'quantity_requested', 'historical_need', 'emergency_level', 'previous_fraud_flag', 'supplier_availability', 'required']


KeyError: "['is_necessary'] not found in axis"

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Set random seed for reproducibility
np.random.seed(42)

# Number of records
num_records = 1000

# Generate synthetic dataset
data = {
    "donation_amount": np.random.uniform(10, 5000, num_records),  # Donation amount between 10 and 5000
    "num_donations": np.random.randint(1, 50, num_records),  # Number of donations made
    "charity_type": np.random.choice(["Health", "Education", "Environment", "Animal Welfare", "Disaster Relief"], num_records),
    "donor_age": np.random.randint(18, 80, num_records),  # Age of donor
    "donation_method": np.random.choice(["Online", "Offline", "Check"], num_records),
    "donor_location": np.random.choice(["Urban", "Suburban", "Rural"], num_records),
    "past_fraud_flag": np.random.choice([0, 1], num_records, p=[0.9, 0.1]),  # 10% fraud cases
    "is_necessary": np.random.choice([0, 1], num_records, p=[0.6, 0.4])  # 60% not necessary, 40% necessary
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Save dataset (optional)
df.to_csv("synthetic_charity_fraud_dataset.csv", index=False)

# Print first few rows
print("Generated Dataset Sample:\n", df.head())

# Encode categorical columns
label_encoders = {}
for col in df.select_dtypes(include=["object"]).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])  # Convert categories to numbers
    label_encoders[col] = le

# Define target variable
target_column = "is_necessary"

# Features and target
X = df.drop(columns=[target_column])
y = df[target_column]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train RandomForest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.2f}")

# Display classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))

print("\nModel training and testing completed successfully!")


Generated Dataset Sample:
    donation_amount  num_donations     charity_type  donor_age donation_method  \
0      1878.955193             47      Environment         54           Check   
1      4754.064389             12  Disaster Relief         52          Online   
2      3662.649770             16   Animal Welfare         40           Check   
3      2997.305836             24   Animal Welfare         23          Online   
4       788.533016             19           Health         21          Online   

  donor_location  past_fraud_flag  is_necessary  
0          Urban                0             1  
1       Suburban                0             0  
2          Rural                0             1  
3       Suburban                0             1  
4          Urban                0             0  

Model Accuracy: 0.58

Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.76      0.69       123
           1       0.44      0

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# 1. Load Dataset (Replace with actual dataset file path)
# Assuming CSV file with last column as target
df = pd.read_csv("your_dataset.csv")  # Change to actual file name

# Separate features and target variable
X = df.iloc[:, :-1]  # All columns except last
y = df.iloc[:, -1]   # Last column as target

# 2. Handle Missing Values
imputer = SimpleImputer(strategy="mean")  # Replace NaNs with column mean
X = imputer.fit_transform(X)

# 3. Handle Class Imbalance using SMOTE
smote = SMOTE(sampling_strategy="auto", random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# 4. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# 5. Train the Model (Random Forest)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 6. Predict and Evaluate
y_pred = model.predict(X_test)

# Accuracy and Classification Report
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))



FileNotFoundError: [Errno 2] No such file or directory: 'your_dataset.csv'