## Lab for Model Training

In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [None]:
# 1. Generate Malformed Dataset
# Seed both the standard-library and the NumPy global random generators with
# the same value so any random draws made later in the notebook are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# Create a dataset
# The records are written one employee per row and then transposed into the
# column -> list-of-values mapping the rest of the notebook works with.
# Missing entries are deliberately a mix of np.nan and None.
column_names = ['Age', 'Salary ($)', 'Department', 'Performance', 'Promotion']
records = [
    # Age,   Salary,  Department, Performance, Promotion (binary target)
    (25,     50000,   'HR',       "Good",      0),
    (38,     60000,   'Sales',    "Poor",      1),
    (np.nan, 75000,   'HR',       "Excellent", 0),
    (45,     None,    'IT',       "Good",      1),
    (-1,     120000,  'IT',       "Poor",      1),
    (120,    45000,   'Finance',  None,        0),
    (34,     70000,   'Sales',    "Average",   0),
    (None,   1000000, None,       "Poor",      1),
    (29,     None,    'HR',       "Good",      0),
    (50,     62000,   'Finance',  "Invalid",   0),
]
# Each column must be a mutable list because later cells patch single entries.
data = {
    name: list(values)
    for name, values in zip(column_names, zip(*records))
}

In [None]:

# Introduce inconsistent data (note: no duplicate rows are actually created here)
# Each entry is (column, row position, replacement value).
inconsistencies = [
    ('Salary ($)', 7, 1000000),     # Extreme outlier (the row already holds this value)
    ('Department', 9, 'Finance '),  # Trailing space
    ('Performance', 9, 'Good '),    # Trailing space; this overwrites the "Invalid" label
]
for column, position, value in inconsistencies:
    data[column][position] = value


In [None]:

# Create DataFrame
# Keys of `data` become the column labels; every list supplies that column's rows.
df = pd.DataFrame.from_dict(data)

In [None]:
# Display Initial Dataset
# Shown before any cleaning so the raw defects (missing values, out-of-range
# ages, padded strings) can be compared with the cleaned frame further down.
print("Malformed Dataset:")
display(df)

In [None]:
# Save initial dataset as CSV
# index=False keeps the row index out of the file; the path is relative, so the
# file is written to the current working directory.
df.to_csv('malformed_data.csv', index=False)

# ------------------------------------
# 2. Data Cleaning and Transformation
# ------------------------------------

In [None]:
# Step 1: Handle Missing Values
# Impute the numeric gaps: mean for Age, median for Salary. The statistics are
# computed on the still-uncleaned columns, so the -1 / 120 ages and the
# 1,000,000 salary contribute to them.
#
# One DataFrame.fillna call with a per-column mapping replaces the original
# `df[col].fillna(..., inplace=True)` pattern. That pattern mutates a temporary
# Series: pandas 2.x warns about it and, with Copy-on-Write enabled, it leaves
# `df` unchanged.
numeric_fill_values = {
    'Age': df['Age'].mean(),
    'Salary ($)': df['Salary ($)'].median(),
}
df = df.fillna(value=numeric_fill_values)

# Rows lacking a Department or Performance label are removed rather than imputed.
df = df.dropna(subset=['Department', 'Performance'])

In [None]:
# Step 2: Clean Categorical Data
# Trim surrounding whitespace from both label columns first ...
for column in ('Department', 'Performance'):
    df[column] = df[column].str.strip()

# ... then fold the placeholder "Invalid" rating into "Average".
df['Performance'] = df['Performance'].replace("Invalid", "Average")

In [None]:
# Step 3: Handle Outliers
# Replace negative and extreme "Age" values by clamping into [AGE_MIN, AGE_MAX].
# Series.clip is vectorised and leaves NaN untouched. The previous per-row
# `max(0, min(x, 100))` lambda silently turned a missing age into 0, because
# Python's min/max comparisons against NaN are always False.
AGE_MIN, AGE_MAX = 0, 100
df['Age'] = df['Age'].clip(lower=AGE_MIN, upper=AGE_MAX)

In [None]:
# Remove rows with extreme outliers in "Salary ($)"
# Salaries at or above the threshold are treated as data-entry errors.
SALARY_OUTLIER_THRESHOLD = 500000

# .copy() makes the filtered frame independent of the original, so the column
# assignments in the following cells do not trigger SettingWithCopyWarning.
df = df.loc[df['Salary ($)'] < SALARY_OUTLIER_THRESHOLD].copy()

In [None]:
# Step 4: Encode Categorical Variables
# Fit one LabelEncoder per column and keep each of them. Re-fitting a single
# shared encoder (as before) discards the Department mapping, so the integer
# codes of that column could no longer be translated back with
# inverse_transform. The resulting codes are the same as before.
label_encoders = {}
for column in ('Department', 'Performance'):
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    label_encoders[column] = encoder

# Backward-compatible alias: the original code left `label_encoder` fitted on
# the Performance column.
label_encoder = label_encoders['Performance']

In [None]:
# Step 5: Normalize Numeric Data
# Standardise the two numeric columns; the fitted scaler is kept in `scaler`.
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split, so test rows influence the scaling statistics.
numeric_columns = ['Age', 'Salary ($)']
scaler = StandardScaler()
scaler.fit(df[numeric_columns])
df[numeric_columns] = scaler.transform(df[numeric_columns])

In [None]:
# Display Cleaned Dataset
# At this point the categorical columns hold integer codes and the numeric
# columns hold standardised values, so the frame no longer shows raw units.
print("Cleaned Dataset:")
display(df)

In [None]:
# -------------------------------------
# 3. Model Training and Evaluation
# -------------------------------------

In [None]:
# Split Data into Features and Target
# `y` is the Promotion column; `X` is every remaining column of the cleaned frame.
target_column = 'Promotion'
y = df[target_column]
X = df.drop(target_column, axis=1)

In [None]:
# Split Dataset into Train and Test
# stratify=y keeps the class ratio of the binary Promotion target the same in
# both partitions. With this few rows an unstratified split can leave one class
# under-represented or absent in the test set, which makes the metrics below
# meaningless.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Train a Random Forest Classifier
# All hyper-parameters stay at the library defaults; only the seed is fixed so
# repeated runs build the same forest.
forest_settings = dict(random_state=42)
model = RandomForestClassifier(**forest_settings)
model.fit(X=X_train, y=y_train)

In [None]:
# Make Predictions
# Predicted Promotion labels for the held-out rows, from the baseline forest.
y_pred = model.predict(X_test)

In [None]:
# Evaluate Model Performance
# The test set is tiny, so a class can be missing from y_test or y_pred.
# Passing the label set explicitly keeps the confusion matrix 2x2 and the
# report rows stable; zero_division=0 reports 0 instead of emitting
# UndefinedMetricWarning when a class has no predicted or true samples.
class_labels = [0, 1]  # the two Promotion outcomes

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=class_labels))

print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=class_labels, zero_division=0))

print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))

In [None]:
# --------------------------------
# 4. Save the Model (Optional)
# --------------------------------

In [None]:
import joblib

# Save the trained model for later use
# This persists `model`, the baseline forest fitted above; the tuned
# `best_model` produced later in the notebook is not written anywhere.
# The path is relative, so the file is written to the current working directory.
# NOTE(review): joblib files are pickle-based; only load them from trusted sources.
joblib.dump(model, 'trained_model.pkl')

print("\nModel saved as 'trained_model.pkl'")


In [None]:
# Feature Importance
# Rank the features by the importance the fitted baseline forest assigned to them.
importances = model.feature_importances_
feature_names = X.columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print("Feature Importance:")
display(feature_importance_df)

# Drop low-importance features (if any have near-zero importance)
IMPORTANCE_THRESHOLD = 0.01
is_low_importance = feature_importance_df['Importance'] < IMPORTANCE_THRESHOLD
low_importance_features = feature_importance_df.loc[is_low_importance, 'Feature']

# errors='ignore' makes the cell safe to re-run: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
# Note that `model` was fitted on the full column set, so it cannot score the
# reduced X_test without being refitted.
columns_to_drop = low_importance_features.tolist()
X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
X_test = X_test.drop(columns=columns_to_drop, errors='ignore')


In [None]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid
# 3 * 4 * 3 * 3 * 2 = 216 candidate settings, each fitted once per CV fold.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

# Grid Search
# Exhaustive 3-fold search over the grid, using every available core.
base_forest = RandomForestClassifier(random_state=42)
search_options = {'cv': 3, 'n_jobs': -1, 'verbose': 2}
grid_search = GridSearchCV(estimator=base_forest, param_grid=param_grid, **search_options)

grid_search.fit(X_train, y_train)

# Use the best parameters for the model
# best_estimator_ is the winning configuration refitted on all of X_train.
best_model = grid_search.best_estimator_

# Evaluate the tuned model
y_pred_tuned = best_model.predict(X_test)
tuned_accuracy = accuracy_score(y_test, y_pred_tuned)
print("\nTuned Model Accuracy Score:", tuned_accuracy)


In [None]:
pip install xgboost

In [None]:
import xgboost as xgb
print("XGBoost successfully installed!")

from xgboost import XGBClassifier

# Train an XGBoost model
# `use_label_encoder` is no longer passed: the option is deprecated and recent
# XGBoost releases ignore it with a warning. The Promotion target is already
# integer 0/1, so no label encoding is needed.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Evaluate XGBoost model
y_pred_xgb = xgb_model.predict(X_test)
print("\nXGBoost Model Accuracy Score:", accuracy_score(y_test, y_pred_xgb))


In [None]:
# Add interaction features
# Product of the two standardised numeric columns.
X['Age_Salary'] = X['Age'] * X['Salary ($)']

# Re-split the data after adding features
# stratify=y keeps the Promotion class ratio equal in both partitions, which
# matters with this few rows. The split is rebuilt from the full `X`, so any
# columns pruned from X_train/X_test earlier are present again.
# Models fitted before this cell were trained on a different feature set and
# must be refitted before they are used on the new X_test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)


In [None]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation
# The tuned estimator is cloned and refitted per fold on the full feature
# matrix; accuracy is recorded for each of the three folds.
n_folds = 3
cv_scores = cross_val_score(estimator=best_model, X=X, y=y, scoring='accuracy', cv=n_folds)
mean_accuracy = cv_scores.mean()
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean Accuracy:", mean_accuracy)




- **Overfitting:** The model may be overfitting: it reaches perfect accuracy (1.0) on some folds but fails on others (0.0 or 0.5).
- **Data issues:** Class imbalance, the very small dataset size, or noise in the data could be causing the inconsistent performance across folds.
- **Cross-validation splits:** With so few rows, each fold contains only a few samples, so uneven splits during cross-validation can produce imbalanced training and test sets.