In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder



from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


In [None]:

data = pd.read_csv('/content/GD_Python.csv')

print(data.head())


                               Academic Plan  Bachelors  Masters  Doctorates  \
0                  Applied Linguistics (PHD)          0        0           1   
1               Art Education - Major (BSED)          4        0           0   
2  Comparative Cultural Studies - Major (BA)          6        0           0   
3                  English - Literature (MA)          0        2           0   
4                       English - Major (BA)         13        0           0   

                  College Name Semester  Year Semester/Year  Grad or not  
0  College of Arts and Letters     Fall  2023     Fall 2023            1  
1  College of Arts and Letters     Fall  2023     Fall 2023            1  
2  College of Arts and Letters     Fall  2023     Fall 2023            1  
3  College of Arts and Letters     Fall  2023     Fall 2023            1  
4  College of Arts and Letters     Fall  2023     Fall 2023            1  


In [None]:
print("Data Description:")
print(data.describe(include='all'))

Data Description:
                           Academic Plan    Bachelors      Masters  \
count                               2174  2174.000000  2174.000000   
unique                               269          NaN          NaN   
top     University Studies - Major (BUS)          NaN          NaN   
freq                                  12          NaN          NaN   
mean                                 NaN    12.873965     2.768629   
std                                  NaN    27.203647     7.815245   
min                                  NaN     0.000000     0.000000   
25%                                  NaN     0.000000     0.000000   
50%                                  NaN     2.000000     0.000000   
75%                                  NaN    12.000000     2.000000   
max                                  NaN   222.000000   125.000000   

         Doctorates                               College Name Semester  \
count   2174.000000                                       2174    

In [None]:

# Group by 'Semester/Year' and sum the counts for Bachelors, Masters, and Doctorates
degree_totals = data.groupby('Semester/Year')[['Bachelors', 'Masters', 'Doctorates']].sum().reset_index()

# Display the results
print(degree_totals)


   Semester/Year  Bachelors  Masters  Doctorates
0      Fall 2018       1494      346         104
1      Fall 2019       1468      347         115
2      Fall 2020       1427      316          17
3      Fall 2021       1402      361         120
4      Fall 2022       1276      381         119
5      Fall 2023       1106      474         116
6    Spring 2018       3233      540          62
7    Spring 2019       3291      592          86
8    Spring 2020       3440      627          88
9    Spring 2021       3467      629         185
10   Spring 2022       3324      638          87
11   Spring 2023       3060      768         109


In [None]:
bachelor_data = data.groupby(['Semester/Year'])['Bachelors'].sum().reset_index()
masters_data = data.groupby(['Semester/Year'])['Masters'].sum().reset_index()
doctorate_data = data.groupby(['Semester/Year'])['Doctorates'].sum().reset_index()

In [None]:
def predict_graduates(grad_data, degree_type):
    grad_data['Time'] = np.arange(len(grad_data))
    X = grad_data[['Time']]  # Feature: Time index
    y = grad_data[degree_type]  # Target: Sum of graduates

    # Create and fit the linear regression model
    model = LinearRegression()
    model.fit(X, y)

    # Predict for the existing data and for the next time index (e.g., Fall 2024)
    y_pred = model.predict(X)  # Predictions for existing data
    future_time = np.array([[len(grad_data)]])  # Next time index
    predicted_grads = model.predict(future_time)

    # Calculate accuracy metrics
    r_squared = r2_score(y, y_pred)
    mae = mean_absolute_error(y, y_pred)
    mse = mean_squared_error(y, y_pred)

    return predicted_grads[0], r_squared, mae, mse


In [None]:
predicted_bachelors, r2_bachelors, mae_bachelors, mse_bachelors = predict_graduates(bachelor_data, 'Bachelors')
predicted_masters, r2_masters, mae_masters, mse_masters = predict_graduates(masters_data, 'Masters')
predicted_doctorates, r2_doctorates, mae_doctorates, mse_doctorates = predict_graduates(doctorate_data, 'Doctorates')



In [None]:
print(f"Predicted number of Bachelors for Fall 2024: {predicted_bachelors:.2f}, R-squared: {r2_bachelors:.2f}, MAE: {mae_bachelors:.2f}, MSE: {mse_bachelors:.2f}")
print(f"Predicted number of Masters for Fall 2024: {predicted_masters:.2f}, R-squared: {r2_masters:.2f}, MAE: {mae_masters:.2f}, MSE: {mse_masters:.2f}")
print(f"Predicted number of Doctorates for Fall 2024: {predicted_doctorates:.2f}, R-squared: {r2_doctorates:.2f}, MAE: {mae_doctorates:.2f}, MSE: {mse_doctorates:.2f}")

Predicted number of Bachelors for Fall 2024: 3845.33, R-squared: 0.67, MAE: 477.42, MSE: 313694.84
Predicted number of Masters for Fall 2024: 762.52, R-squared: 0.93, MAE: 35.08, MSE: 1530.66
Predicted number of Doctorates for Fall 2024: 115.80, R-squared: 0.04, MAE: 30.10, MSE: 1385.43


In [None]:

bachelor_data = data.groupby(['Semester/Year'])['Bachelors'].sum().reset_index()
masters_data = data.groupby(['Semester/Year'])['Masters'].sum().reset_index()
doctorate_data = data.groupby(['Semester/Year'])['Doctorates'].sum().reset_index()


In [None]:
def predict_graduates_with_random_forest(grad_data, degree_type):
    # Prepare the data for regression
    grad_data['Time'] = np.arange(len(grad_data))  # Create a time index for regression
    X = grad_data[['Time']]  # Feature: Time index
    y = grad_data[degree_type]  # Target: Sum of graduates

    # Create and fit the Random Forest regression model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X, y)

    # Predict for existing data and for the next time index (e.g., Fall 2024)
    y_pred = model.predict(X)  # Predictions for existing data
    future_time = np.array([[len(grad_data)]])  # Next time index
    predicted_grads = model.predict(future_time)

    # Calculate accuracy metrics
    r_squared = r2_score(y, y_pred)
    mae = mean_absolute_error(y, y_pred)
    mse = mean_squared_error(y, y_pred)

    return predicted_grads[0], r_squared, mae, mse


In [None]:
predicted_bachelors, r2_bachelors, mae_bachelors, mse_bachelors = predict_graduates_with_random_forest(bachelor_data, 'Bachelors')
predicted_masters, r2_masters, mae_masters, mse_masters = predict_graduates_with_random_forest(masters_data, 'Masters')
predicted_doctorates, r2_doctorates, mae_doctorates, mse_doctorates = predict_graduates_with_random_forest(doctorate_data, 'Doctorates')




In [None]:

# Output the predicted values and accuracy metrics
print(f"Predicted number of Bachelors for Spring 2024: {predicted_bachelors:.2f}, R-squared: {r2_bachelors:.2f}, MAE: {mae_bachelors:.2f}, MSE: {mse_bachelors:.2f}")
print(f"Predicted number of Masters for Spring 2024: {predicted_masters:.2f}, R-squared: {r2_masters:.2f}, MAE: {mae_masters:.2f}, MSE: {mse_masters:.2f}")
print(f"Predicted number of Doctorates for Spring 2024: {predicted_doctorates:.2f}, R-squared: {r2_doctorates:.2f}, MAE: {mae_doctorates:.2f}, MSE: {mse_doctorates:.2f}")


Predicted number of Bachelors for Spring 2024: 3845.33, R-squared: 0.67, MAE: 477.42, MSE: 313694.84
Predicted number of Masters for Spring 2024: 762.52, R-squared: 0.93, MAE: 35.08, MSE: 1530.66
Predicted number of Doctorates for Spring 2024: 115.80, R-squared: 0.04, MAE: 30.10, MSE: 1385.43


In [None]:
college_data = data.groupby('College Name').agg({
    'Bachelors': 'sum',
    'Masters': 'sum',
    'Doctorates': 'sum'
}).reset_index()

In [None]:

# Prepare data for prediction
college_data['Time'] = np.arange(len(college_data))  # Create a time index for regression


In [None]:
def predict_graduates_with_random_forest(college_data, degree_type):
    # Feature and target variable
    X = college_data[['Time']]  # Feature: Time index
    y = college_data[degree_type]  # Target: Sum of graduates

    # Create and fit the Random Forest regression model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X, y)

    # Predict for Spring 2025 (next time index)
    future_time = np.array([[len(college_data)]])  # Next time index
    predicted_grads = model.predict(future_time)

    # Calculate accuracy metrics
    y_pred = model.predict(X)  # Predictions for existing data
    r_squared = r2_score(y, y_pred)
    mae = mean_absolute_error(y, y_pred)
    mse = mean_squared_error(y, y_pred)

    return predicted_grads[0], r_squared, mae, mse


In [None]:
predicted_bachelors, r2_bachelors, mae_bachelors, mse_bachelors = predict_graduates_with_random_forest(college_data, 'Bachelors')
predicted_masters, r2_masters, mae_masters, mse_masters = predict_graduates_with_random_forest(college_data, 'Masters')
predicted_doctorates, r2_doctorates, mae_doctorates, mse_doctorates = predict_graduates_with_random_forest(college_data, 'Doctorates')




In [None]:
print("Random Forest Regressor")
# Output the predicted values and accuracy metrics
print(f"Predicted number of Bachelors for Spring 2025: {predicted_bachelors:.2f}, R-squared: {r2_bachelors:.2f}, MAE: {mae_bachelors:.2f}, MSE: {mse_bachelors:.2f}")
print(f"Predicted number of Masters for Spring 2025: {predicted_masters:.2f}, R-squared: {r2_masters:.2f}, MAE: {mae_masters:.2f}, MSE: {mse_masters:.2f}")
print(f"Predicted number of Doctorates for Spring 2025: {predicted_doctorates:.2f}, R-squared: {r2_doctorates:.2f}, MAE: {mae_doctorates:.2f}, MSE: {mse_doctorates:.2f}")


Random Forest Regressor
Predicted number of Bachelors for Spring 2025: 4300.72, R-squared: 0.87, MAE: 444.67, MSE: 397371.65
Predicted number of Masters for Spring 2025: 306.83, R-squared: 0.75, MAE: 312.89, MSE: 182481.62
Predicted number of Doctorates for Spring 2025: 18.58, R-squared: 0.77, MAE: 92.09, MSE: 15129.87
