In [2]:
# -----------------------------------------
# WEEK 6: MACHINE LEARNING CLASSIFICATION
# -----------------------------------------

# --- Step 1: Import Libraries ---
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# --- Step 2: Load Dataset ---
# The CSV is parsed with ';' as the field separator.
df = pd.read_csv('/content/students.csv', sep=';')


# --- Step 3: Understanding Target Variable ---
# G3 = Final Grade of the student (0–20)
# We’ll predict whether a student passes (G3 >= 10) or fails (G3 < 10)
# The comparison is done on the whole column at once instead of calling a
# Python lambda per row; the boolean mask is then cast to int so that
# pass = 1 and fail = 0. A missing G3 compares False and is labelled 0,
# exactly as the row-wise version labelled it.
df['pass'] = (df['G3'] >= 10).astype(int)

# --- Step 4: Select Features for Training ---
# Predictor columns fed to every model below; the target is the binary 'pass' column
features = ['G1', 'G2', 'studytime', 'failures', 'absences']
X, y = df[features], df['pass']

# --- Step 5: Split Dataset into Training and Testing Sets ---
# 20% of the rows are held out for testing; the fixed random_state makes the
# split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
print("Training and Testing Data Split Completed.\n")

# =========================================
# CLASS TASK: Decision Tree & Random Forest
# =========================================

print("----- CLASS TASK: TRAIN MODELS -----\n")

# --- Decision Tree Classifier ---
# fit() hands back the fitted estimator, so building and training are chained
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
dt_pred = dt.predict(X_test)
dt_acc = accuracy_score(y_test, dt_pred)
print(f"Decision Tree Accuracy: {dt_acc:.2f}")

# --- Random Forest Classifier ---
# Same train / predict / score sequence; rf_acc is read again further down
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_pred = rf.predict(X_test)
rf_acc = accuracy_score(y_test, rf_pred)
print(f"Random Forest Accuracy: {rf_acc:.2f}\n")

# --- Explanation ---
# Fixed summary text, emitted one line per print call
for summary_line in (
    "Summary of Class Task:",
    "- Decision Tree: A single model that splits data based on features like G1 and G2 to predict pass/fail.",
    "- Random Forest: Builds many Decision Trees and combines them for better accuracy and stability.\n",
):
    print(summary_line)

# =========================================
# ASSIGNMENT 6: Logistic Regression & Random Forest
# =========================================

print("----- ASSIGNMENT 6: APPLY MODELS & COMPARE ACCURACY -----\n")

# --- Logistic Regression ---
# max_iter=1000 caps the number of solver iterations; fit() returns the
# fitted estimator, so it is chained onto the constructor
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
log_pred = log_reg.predict(X_test)
log_acc = accuracy_score(y_test, log_pred)
print(f"Logistic Regression Accuracy: {log_acc:.2f}")

# --- Random Forest (already trained above) ---
# No retraining here: the score computed in the class task is simply reported again
print(f"Random Forest Accuracy (reused): {rf_acc:.2f}\n")

# --- Accuracy Comparison ---
# One print call; sep="\n" places each piece on its own line
print(
    "Accuracy Comparison:",
    f"  Logistic Regression: {log_acc:.2f}",
    f"  Random Forest:       {rf_acc:.2f}\n",
    sep="\n",
)

# --- Final Explanation ---
print("Explanation of Models:")
print("- Logistic Regression: Predicts probability of Students passing based on numeric relationships in data.")
print("- Random Forest: Uses multiple trees to make stronger, more reliable predictions.")
print("\nConclusion:")
# Each "performed better" message is printed only when that model's accuracy
# is strictly higher. Equal accuracies get their own branch, so a tie is no
# longer reported as a Logistic Regression win.
if rf_acc > log_acc:
    print("→ Random Forest performed better, meaning combining many trees gives more accurate results.")
elif log_acc > rf_acc:
    print("→ Logistic Regression performed better, meaning linear relationships explained the grades well.")
else:
    print("→ Both models reached the same accuracy, so this test split does not favour either one.")


Training and Testing Data Split Completed.

----- CLASS TASK: TRAIN MODELS -----

Decision Tree Accuracy: 0.92
Random Forest Accuracy: 0.90

Summary of Class Task:
- Decision Tree: A single model that splits data based on features like G1 and G2 to predict pass/fail.
- Random Forest: Builds many Decision Trees and combines them for better accuracy and stability.

----- ASSIGNMENT 6: APPLY MODELS & COMPARE ACCURACY -----

Logistic Regression Accuracy: 0.93
Random Forest Accuracy (reused): 0.90

Accuracy Comparison:
  Logistic Regression: 0.93
  Random Forest:       0.90

Explanation of Models:
- Logistic Regression: Predicts probability of Students passing based on numeric relationships in data.
- Random Forest: Uses multiple trees to make stronger, more reliable predictions.

Conclusion:
→ Logistic Regression performed better, meaning linear relationships explained the grades well.
