# Machine Learning Template

**Author:** Your Name

**Date:** YYYY-MM-DD

**Problem Type:** Classification / Regression / Clustering

**Objective:** Describe the ML objective

---

## 1. Environment Setup

In [None]:
# Data manipulation
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score

# Models (import as needed)
# from sklearn.linear_model import LogisticRegression, LinearRegression
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from sklearn.svm import SVC
# from sklearn.neural_network import MLPClassifier

# Settings
%matplotlib inline
plt.style.use('seaborn-v0_8')
pd.set_option('display.max_columns', None)

# Set random seed for reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

## 2. Load and Explore Data

In [None]:
# Load dataset
# df = pd.read_csv('data/your_dataset.csv')
# print(f"Dataset shape: {df.shape}")
# df.head()

In [None]:
# Data info and statistics
# print(df.info())
# print("\n" + "="*50)
# print(df.describe())

## 3. Data Preprocessing

In [None]:
# Handle missing values
# print("Missing values:\n", df.isnull().sum())
# df_clean = df.dropna()  # or use fillna()

In [None]:
# Encode categorical variables
# label_encoders = {}
# for column in df_clean.select_dtypes(include=['object']).columns:
#     if column != 'target_column':  # Don't encode the target yet (same name as in the split cell)
#         le = LabelEncoder()
#         df_clean[column] = le.fit_transform(df_clean[column])
#         label_encoders[column] = le

In [None]:
# Split features and target
# X = df_clean.drop('target_column', axis=1)
# y = df_clean['target_column']

# print(f"Features shape: {X.shape}")
# print(f"Target shape: {y.shape}")

## 4. Feature Engineering

In [None]:
# Create new features
# X['new_feature'] = X['feature1'] * X['feature2']
# X['ratio_feature'] = X['feature1'] / (X['feature2'] + 1)

In [None]:
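# Feature selection (optional)
# NOTE: fitting the selector on the full X, y lets test-set information leak
# into the selected features. For an unbiased evaluation, fit it on the
# training split only (after Section 5) and apply transform() to the test split.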
# from sklearn.feature_selection import SelectKBest, f_classif
# selector = SelectKBest(f_classif, k=10)
# X_selected = selector.fit_transform(X, y)
# selected_features = X.columns[selector.get_support()]
# print("Selected features:", selected_features.tolist())

## 5. Train-Test Split

In [None]:
# Split the data
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
# )

# print(f"Training set size: {X_train.shape}")
# print(f"Test set size: {X_test.shape}")

In [None]:
# Scale features
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

## 6. Model Training

In [None]:
# Initialize and train model
# (uncomment the matching model import in Section 1 first, e.g. RandomForestClassifier)
# model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
# model.fit(X_train_scaled, y_train)
# print("Model training completed")

In [None]:
# Cross-validation
# cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='accuracy')
# print(f"Cross-validation scores: {cv_scores}")
# print(f"Mean CV score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

## 7. Model Evaluation

In [None]:
# Make predictions
# y_pred_train = model.predict(X_train_scaled)
# y_pred_test = model.predict(X_test_scaled)

# Calculate accuracy
# train_accuracy = accuracy_score(y_train, y_pred_train)
# test_accuracy = accuracy_score(y_test, y_pred_test)

# print(f"Training Accuracy: {train_accuracy:.4f}")
# print(f"Test Accuracy: {test_accuracy:.4f}")

In [None]:
# Classification report
# print("Classification Report:")
# print(classification_report(y_test, y_pred_test))

In [None]:
# Confusion matrix
# cm = confusion_matrix(y_test, y_pred_test)
# plt.figure(figsize=(8, 6))
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
# plt.title('Confusion Matrix')
# plt.ylabel('True Label')
# plt.xlabel('Predicted Label')
# plt.show()

## 8. Feature Importance

In [None]:
# Plot feature importance (for tree-based models)
# if hasattr(model, 'feature_importances_'):
#     feature_importance = pd.DataFrame({
#         'feature': X.columns,
#         'importance': model.feature_importances_
#     }).sort_values('importance', ascending=False)
#     
#     plt.figure(figsize=(10, 6))
#     sns.barplot(data=feature_importance.head(10), x='importance', y='feature')
#     plt.title('Top 10 Feature Importances')
#     plt.tight_layout()
#     plt.show()

## 9. Hyperparameter Tuning (Optional)

In [None]:
# Define parameter grid
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [10, 20, 30, None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# Grid search
# grid_search = GridSearchCV(
#     estimator=model,
#     param_grid=param_grid,
#     cv=5,
#     scoring='accuracy',
#     n_jobs=-1,
#     verbose=1
# )
# grid_search.fit(X_train_scaled, y_train)

# print(f"Best parameters: {grid_search.best_params_}")
# print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

## 10. Save Model

In [None]:
# Save the trained model
# (the 'output/' directory must already exist; joblib.dump does not create it)
# import joblib
# joblib.dump(model, 'output/trained_model.pkl')
# joblib.dump(scaler, 'output/scaler.pkl')
# print("Model saved successfully")

## 11. Conclusions

### Model Performance

- Metric 1: Value
- Metric 2: Value
- Metric 3: Value

### Key Insights

1. Insight 1
2. Insight 2
3. Insight 3

### Next Steps

- [ ] Try different algorithms
- [ ] Collect more data
- [ ] Feature engineering improvements
- [ ] Deploy model to production