In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Load the dataset
file_path = 'student_data.csv'  # Change this to your dataset file path
df = pd.read_csv(file_path)

# Feature Engineering: Create total score from subject scores
df['Total_Score'] = df.iloc[:, 1:].sum(axis=1)  # Assuming first column is student ID or name

# Define features and target
X = df.drop(columns=['Total_Score'])  # Features
y = df['Total_Score']  # Target

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters and model evaluation
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print(f'Best Parameters: {grid_search.best_params_}')
print(f'MAE: {mae}')
print(f'RMSE: {rmse}')


Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
MAE: 0.35509426729484694
RMSE: 0.49614969459023495




In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Load and preprocess the dataset
file_path = 'Transactions.csv'  # Change this to your dataset file path
df = pd.read_csv(file_path)

# Display dataset columns to check for missing or unexpected names
print("Dataset Columns:", df.columns)

# Handle missing values
df.fillna(method='ffill', inplace=True)

# Verify if 'Transaction_Type' column exists before applying label encoding
label_encoders = {}
categorical_columns = [col for col in ['Transaction_Type'] if col in df.columns]  # Ensure column exists
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

# Feature Engineering: Create new derived features if necessary
if 'Transaction_Amount' in df.columns:
    df['Transaction_Amount_Log'] = np.log1p(df['Transaction_Amount'])  # Example feature

# Define features and target
if 'Fraudulent' in df.columns:
    X = df.drop(columns=['Fraudulent'])  # Features
    y = df['Fraudulent']  # Target

    # Split dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train a Decision Tree Classifier
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Evaluate model performance
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
else:
    print("Error: 'Fraudulent' column not found in the dataset.")

Dataset Columns: Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price'],
      dtype='object')
Error: 'Fraudulent' column not found in the dataset.


  df.fillna(method='ffill', inplace=True)
