In [113]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor




import pandas as pd

In [114]:
df = pd.read_csv("df.csv")

In [115]:
features = ["Non-GCE Promote %", 
     "ELL %", 
     "Special Education %", 
     "FRPM_%",  
     "Chronically Absent Percent", 
     "AVG_ATT_RATE", 
     "Total Enrollment", 
     "Corp Per/Student", 
     "% Tested ELA", 
     "% Tested Math",
     "Pct American Indian",
     "Pct Asian",
     "Pct Black",
     "Pct Hispanic",
     "Pct Multiracial",
     "Pct Pacific Islander",
     "Pct White"]
Y = ["ELA Proficient %", "Math Proficient %"]
print(df[Y])
df = df[features + Y]

     ELA Proficient %  Math Proficient %
0            0.484034           0.611765
1            0.385740           0.339450
2            0.400356           0.521352
3            0.529885           0.528888
4            0.489105           0.463828
..                ...                ...
301          0.440678           0.367347
302          0.198052           0.098039
303          0.276332           0.168005
304          0.027778           0.027778
305          0.218978           0.182482

[306 rows x 2 columns]


In [116]:
X = df[features]
X_ela_train, X_ela_test, y_ela_train, y_ela_test = train_test_split(X, df['ELA Proficient %'], test_size=0.2, random_state=42)
X_math_train, X_math_test, y_math_train, y_math_test = train_test_split(X, df['Math Proficient %'], test_size=0.2, random_state=42)

scaler_ela = StandardScaler().fit(X_ela_train)
X_ela_train_scaled = scaler_ela.transform(X_ela_train)
X_ela_test_scaled  = scaler_ela.transform(X_ela_test)

scaler_math = StandardScaler().fit(X_math_train)
X_math_train_scaled = scaler_math.transform(X_math_train)
X_math_test_scaled  = scaler_math.transform(X_math_test)

In [117]:
linr_ela = LinearRegression().fit(X_ela_train_scaled, y_ela_train)
linr_math = LinearRegression().fit(X_math_train_scaled, y_math_train)
tree_ela = DecisionTreeRegressor().fit(X_ela_train, y_ela_train)
tree_math = DecisionTreeRegressor().fit(X_math_train, y_math_train)
rf_ela = RandomForestRegressor(random_state=42)
rf_math = RandomForestRegressor(random_state=42)

In [118]:
ela_pred_linr = linr_ela.predict(X_ela_test_scaled)
math_pred_linr = linr_math.predict(X_math_test_scaled)
ela_pred_tree = tree_ela.predict(X_ela_test)
math_pred_tree = tree_math.predict(X_math_test)
rf_ela.fit(X_ela_train, y_ela_train)
rf_math.fit(X_math_train, y_math_train)

In [119]:
print("ELA Linear Regression:")
print("  RMSE:", round(mean_squared_error(y_ela_test, ela_pred_linr, squared=False), 3))
print("  R2:", round(r2_score(y_ela_test, ela_pred_linr), 3))

print("\nELA Decision Tree:")
print("  RMSE:", round(mean_squared_error(y_ela_test, ela_pred_tree, squared=False), 3))
print("  R2:", round(r2_score(y_ela_test, ela_pred_tree), 3))

# Evaluate Math
print("\nMath Linear Regression:")
print("  RMSE:", round(mean_squared_error(y_math_test, math_pred_linr, squared=False), 3))
print("  R2:", round(r2_score(y_math_test, math_pred_linr), 3))

print("\nMath Decision Tree:")
print("  RMSE:", round(mean_squared_error(y_math_test, math_pred_tree, squared=False), 3))
print("  R2:", round(r2_score(y_math_test, math_pred_tree), 3))

ELA Linear Regression:
  RMSE: 0.078
  R2: 0.585

ELA Decision Tree:
  RMSE: 0.084
  R2: 0.521

Math Linear Regression:
  RMSE: 0.093
  R2: 0.562

Math Decision Tree:
  RMSE: 0.116
  R2: 0.323


In [121]:
import numpy as np

# --- Sample 5 random indices from test set ---
np.random.seed(42)
indices = np.random.choice(X_ela_test.index, size=5, replace=False)

for i in indices:
    # Get row as 2D
    ela_input = X_ela_test.loc[[i]]
    math_input = X_math_test.loc[[i]]

    ela_input_scaled = scaler_ela.transform(ela_input)
    math_input_scaled = scaler_math.transform(math_input)

    # Actual values
    actual_ela = y_ela_test.loc[i]
    actual_math = y_math_test.loc[i]

    # Predictions
    pred_ela_linr = linr_ela.predict(ela_input_scaled)[0]
    pred_math_linr = linr_math.predict(math_input_scaled)[0]

    pred_ela_tree = tree_ela.predict(ela_input)[0]
    pred_math_tree = tree_math.predict(math_input)[0]

    pred_ela_rf = rf_ela.predict(ela_input)[0]
    pred_math_rf = rf_math.predict(math_input)[0]

    # Print results
    print(f"Row {i}")
    print(f"  Actual ELA:  {round(actual_ela * 100, 2)}%")
    print(f"    Linear:    {round(pred_ela_linr * 100, 2)}%")
    print(f"    Tree:      {round(pred_ela_tree * 100, 2)}%")
    print(f"    Forest:    {round(pred_ela_rf * 100, 2)}%")
    
    print(f"  Actual Math: {round(actual_math * 100, 2)}%")
    print(f"    Linear:    {round(pred_math_linr * 100, 2)}%")
    print(f"    Tree:      {round(pred_math_tree * 100, 2)}%")
    print(f"    Forest:    {round(pred_math_rf * 100, 2)}%")
    print("-" * 50)


Row 223
  Actual ELA:  34.1%
    Linear:    40.5%
    Tree:      44.26%
    Forest:    42.35%
  Actual Math: 38.21%
    Linear:    41.79%
    Tree:      40.68%
    Forest:    42.68%
--------------------------------------------------
Row 221
  Actual ELA:  45.34%
    Linear:    49.37%
    Tree:      58.23%
    Forest:    47.96%
  Actual Math: 40.86%
    Linear:    53.27%
    Tree:      51.47%
    Forest:    55.13%
--------------------------------------------------
Row 182
  Actual ELA:  43.06%
    Linear:    42.46%
    Tree:      44.26%
    Forest:    42.43%
  Actual Math: 45.61%
    Linear:    44.49%
    Tree:      47.82%
    Forest:    44.56%
--------------------------------------------------
Row 104
  Actual ELA:  47.99%
    Linear:    45.69%
    Tree:      39.64%
    Forest:    45.07%
  Actual Math: 48.82%
    Linear:    47.63%
    Tree:      44.91%
    Forest:    48.17%
--------------------------------------------------
Row 9
  Actual ELA:  29.41%
    Linear:    32.89%
    Tree:   

In [None]:
best_input = pd.DataFrame([{
    "Non-GCE Promote %": 0.02,
    "ELL %": 0.08,
    "Special Education %": 0.21,
    "FRPM_%": 0.18,
    "Chronically Absent Percent": 0.13,
    "AVG_ATT_RATE": 0.93,
    "Total Enrollment": 6000,
    "Corp Per/Student": 10500,
    "% Tested ELA": 0.70,
    "% Tested Math": 0.70,
    "Pct American Indian": 0.005,
    "Pct Asian": 0.025,
    "Pct Black": 0.07,
    "Pct Hispanic": 0.11,
    "Pct Multiracial": 0.045,
    "Pct Pacific Islander": 0.005,
    "Pct White": 0.735
    
}])[features]
best_input_scaled_ela = scaler_ela.transform(best_input)
best_input_scaled_math = scaler_math.transform(best_input)

ela_prediction = linr_ela.predict(best_input_scaled_ela)[0]
math_prediction = linr_math.predict(best_input_scaled_math)[0]


print("Predicted ELA Proficiency %:", round(ela_prediction * 100, 2))
print("Predicted Math Proficiency %:", round(math_prediction * 100, 2))

ela_pred_tree = tree_ela.predict(best_input)[0]
math_pred_tree = tree_math.predict(best_input)[0]

print("Decision Tree ELA Proficiency %:", round(ela_pred_tree * 100, 2))
print("Decision Tree Math Proficiency %:", round(math_pred_tree * 100, 2))


ela_pred_rf = rf_ela.predict(best_input)[0]
math_pred_rf = rf_math.predict(best_input)[0]
print("Random Forest ELA Prediction:", round(ela_pred_rf * 100, 2))
print("Random Forest Math Prediction:", round(math_pred_rf * 100, 2))

In [None]:
weak_input = pd.DataFrame([{
    "Non-GCE Promote %": 0.14,
    "ELL %": 0.18,  # ↑ higher ELL now
    "Special Education %": 0.24,
    "FRPM_%": 0.82,
    "Chronically Absent Percent": 0.28,
    "AVG_ATT_RATE": 0.86,
    "Total Enrollment": 1400,
    "Corp Per/Student": 7800,
    "% Tested ELA": 0.44,
    "% Tested Math": 0.42,
    "Pct American Indian": 0.005,
    "Pct Asian": 0.015,
    "Pct Black": 0.12,
    "Pct Hispanic": 0.20,
    "Pct Multiracial": 0.04,
    "Pct Pacific Islander": 0.005,
    "Pct White": 0.6 
}])[features]

weak_input_scaled_ela = scaler_ela.transform(weak_input)
weak_input_scaled_math = scaler_math.transform(weak_input)

ela_pred_weak = linr_ela.predict(weak_input_scaled_ela)[0]
math_pred_weak = linr_math.predict(weak_input_scaled_math)[0]


print("Predicted ELA Proficiency %:", round(ela_pred_weak * 100, 2))
print("Predicted Math Proficiency %:", round(math_pred_weak * 100, 2))

ela_pred_tree = tree_ela.predict(weak_input)[0]
math_pred_tree = tree_math.predict(weak_input)[0]

print("Decision Tree ELA Proficiency %:", round(ela_pred_tree * 100, 2))
print("Decision Tree Math Proficiency %:", round(math_pred_tree * 100, 2))

ela_pred_rf = rf_ela.predict(weak_input)[0]
math_pred_rf = rf_math.predict(weak_input)[0]
print("Random Forest ELA Prediction:", round(ela_pred_rf * 100, 2))
print("Random Forest Math Prediction:", round(math_pred_rf * 100, 2))

In [None]:
user_input = {}
print("Enter values for each feature:")
for feature in features:
    val = input(f"{feature}: ")
    try:
        user_input[feature] = float(val)
    except:
        print(f"Invalid value for {feature}, defaulting to 0")
        user_input[feature] = 0.0

# 3. Create input DataFrame
input_df = pd.DataFrame([user_input])[features]

# 4. Scale input for linear regression
input_scaled_ela = scaler_ela.transform(input_df)
input_scaled_math = scaler_math.transform(input_df)

# 5. Predict without bounding
pred_ela_linr = linr_ela.predict(input_scaled_ela)[0]
pred_math_linr = linr_math.predict(input_scaled_math)[0]

pred_ela_tree = tree_ela.predict(input_df)[0]
pred_math_tree = tree_math.predict(input_df)[0]

pred_ela_rf = rf_ela.predict(input_df)[0]
pred_math_rf = rf_math.predict(input_df)[0]

# 6. Display results
print("\nPredicted ELA Proficiency %:")
print(f"  Linear Regression:  {round(pred_ela_linr * 100, 2)}%")
print(f"  Decision Tree:      {round(pred_ela_tree * 100, 2)}%")
print(f"  Random Forest:      {round(pred_ela_rf * 100, 2)}%")

print("\nPredicted Math Proficiency %:")
print(f"  Linear Regression:  {round(pred_math_linr * 100, 2)}%")
print(f"  Decision Tree:      {round(pred_math_tree * 100, 2)}%")
print(f"  Random Forest:      {round(pred_math_rf * 100, 2)}%")