In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Set seed for reproducibility
np.random.seed(42)

# Number of rows
num_rows = 10000

# Generating dummy data for each feature
data = {
    'college_name': [f'College_{i}' for i in range(num_rows)],
    'academic_reputation': np.random.uniform(1, 5, num_rows),   # Scale from 1 to 5
    'graduation_rates': np.random.uniform(0, 100, num_rows),    # Percentage 0-100
    'retention_rates': np.random.uniform(0, 100, num_rows),     # Percentage 0-100
    'faculty_resources': np.random.uniform(1, 10, num_rows),    # Scale from 1 to 10
    'student_to_faculty_ratio': np.random.uniform(5, 50, num_rows),  # Ratio 5-50
    'research_output': np.random.uniform(1, 100, num_rows),     # Scale from 1 to 100
    'financial_resources': np.random.uniform(1e6, 1e8, num_rows),   # Scale from 1 million to 100 million
    'student_satisfaction': np.random.uniform(1, 5, num_rows),  # Scale from 1 to 5
    'employment_outcomes': np.random.uniform(0, 100, num_rows), # Percentage 0-100
    # Assuming we have some ground truth scores for training
    'true_scores': np.random.uniform(0, 100, num_rows)          # Scale from 0 to 100
}

# Create DataFrame
df = pd.DataFrame(data)

# Split the data into training and testing sets
X = df.drop(columns=['college_name', 'true_scores'])
y = df['true_scores']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [22]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [23]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Initialize RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model on scaled training data
model.fit(X_train_scaled, y_train)

# Transform the test data using the fitted scaler
X_test_scaled = scaler.transform(X_test)

# Predict on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print(f'Mean Absolute Error (MAE): {mae}')
print(f'Root Mean Squared Error (RMSE): {rmse}')


Mean Absolute Error (MAE): 25.059787662110686
Root Mean Squared Error (RMSE): 29.11387279803502




In [25]:
# Assuming 'scaler' and 'model' are already defined and trained as per the previous steps

# 1. Drop unnecessary columns for prediction
df_features = df.drop(columns=['college_name', 'true_scores'])

# 2. Transform features using the fitted scaler
df_features_scaled = scaler.transform(df_features)

# 3. Predict using the trained model
df['predicted_score'] = model.predict(df_features_scaled)

# 4. Sort colleges by predicted score in descending order
ranked_colleges = df.sort_values(by='predicted_score', ascending=False)

# 5. Add a ranking column
ranked_colleges['rank'] = range(1, len(ranked_colleges) + 1)

# 6. Display the top 10 colleges
print(ranked_colleges.head(10))




ValueError: X has 10 features, but MinMaxScaler is expecting 9 features as input.

In [26]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Set seed for reproducibility
np.random.seed(42)

# Number of rows
num_rows = 10000

# Generating dummy data for each feature
data = {
    'college_name': [f'College_{i}' for i in range(num_rows)],
    'academic_reputation': np.random.uniform(1, 5, num_rows),   # Scale from 1 to 5
    'graduation_rates': np.random.uniform(0, 100, num_rows),    # Percentage 0-100
    'retention_rates': np.random.uniform(0, 100, num_rows),     # Percentage 0-100
    'faculty_resources': np.random.uniform(1, 10, num_rows),    # Scale from 1 to 10
    'student_to_faculty_ratio': np.random.uniform(5, 50, num_rows),  # Ratio 5-50
    'research_output': np.random.uniform(1, 100, num_rows),     # Scale from 1 to 100
    'financial_resources': np.random.uniform(1e6, 1e8, num_rows),   # Scale from 1 million to 100 million
    'student_satisfaction': np.random.uniform(1, 5, num_rows),  # Scale from 1 to 5
    'employment_outcomes': np.random.uniform(0, 100, num_rows), # Percentage 0-100
    # Assuming we have some ground truth scores for training
    'true_scores': np.random.uniform(0, 100, num_rows)          # Scale from 0 to 100
}

# Create DataFrame
df = pd.DataFrame(data)

# Split the data into training and testing sets
X = df.drop(columns=['college_name', 'true_scores'])
y = df['true_scores']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform on training data
X_train_scaled = scaler.fit_transform(X_train)

# Initialize RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model on scaled training data
model.fit(X_train_scaled, y_train)

# Transform the entire dataset using the fitted scaler
X_scaled = scaler.transform(X)

# Predict scores for the entire dataset
df['predicted_score'] = model.predict(X_scaled)

# Sort colleges by predicted score in descending order
ranked_colleges = df.sort_values(by='predicted_score', ascending=False)

# Add a ranking column
ranked_colleges['rank'] = range(1, len(ranked_colleges) + 1)

# Display the top 10 colleges
print(ranked_colleges.head(10))


      college_name  academic_reputation  graduation_rates  retention_rates  \
5525  College_5525             1.137507         13.043945         6.144792   
5166  College_5166             1.836389          4.734998        72.880620   
8347  College_8347             1.131302         77.858185        90.528494   
9540  College_9540             1.746932         92.613284        32.329210   
1930  College_1930             3.319935         68.984128        71.638656   
8092  College_8092             3.343452         12.780584        85.936160   
1468  College_1468             4.483014         21.237066        72.887029   
1826  College_1826             1.167805         33.971454        52.324424   
5154  College_5154             3.705593         50.392901        53.854544   
1635  College_1635             1.938568         38.110933         6.684722   

      faculty_resources  student_to_faculty_ratio  research_output  \
5525           7.537028                 23.720753        48.125396   
5

In [27]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Set seed for reproducibility
np.random.seed(42)

# Number of rows
num_rows = 10000

# Generating dummy data for each feature
data = {
    'college_name': [f'College_{i}' for i in range(num_rows)],
    'academic_reputation': np.random.uniform(1, 5, num_rows),   # Scale from 1 to 5
    'graduation_rates': np.random.uniform(0, 100, num_rows),    # Percentage 0-100
    'retention_rates': np.random.uniform(0, 100, num_rows),     # Percentage 0-100
    'faculty_resources': np.random.uniform(1, 10, num_rows),    # Scale from 1 to 10
    'student_to_faculty_ratio': np.random.uniform(5, 50, num_rows),  # Ratio 5-50
    'research_output': np.random.uniform(1, 100, num_rows),     # Scale from 1 to 100
    'financial_resources': np.random.uniform(1e6, 1e8, num_rows),   # Scale from 1 million to 100 million
    'student_satisfaction': np.random.uniform(1, 5, num_rows),  # Scale from 1 to 5
    'employment_outcomes': np.random.uniform(0, 100, num_rows), # Percentage 0-100
    # Assuming we have some ground truth scores for training
    'true_scores': np.random.uniform(0, 100, num_rows)          # Scale from 0 to 100
}

# Create DataFrame
df = pd.DataFrame(data)

# Split the data into training and testing sets
X = df.drop(columns=['college_name', 'true_scores'])
y = df['true_scores']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform on training data
X_train_scaled = scaler.fit_transform(X_train)

# Initialize RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model on scaled training data
model.fit(X_train_scaled, y_train)

# Transform the entire dataset using the fitted scaler
X_scaled = scaler.transform(X)

# Predict scores for the entire dataset
df['predicted_score'] = model.predict(X_scaled)

# Sort colleges by predicted score in descending order
ranked_colleges = df.sort_values(by='predicted_score', ascending=False)

# Add a ranking column
ranked_colleges['rank'] = range(1, len(ranked_colleges) + 1)

# Display only college_name and rank for the top 10 colleges
top_colleges = ranked_colleges[['college_name', 'rank']].head(10)
print(top_colleges)


      college_name  rank
5525  College_5525     1
5166  College_5166     2
8347  College_8347     3
9540  College_9540     4
1930  College_1930     5
8092  College_8092     6
1468  College_1468     7
1826  College_1826     8
5154  College_5154     9
1635  College_1635    10


In [28]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Set seed for reproducibility
np.random.seed(42)

# Number of rows
num_rows = 10000

# Generating dummy data for each feature
data = {
    'college_name': [f'College_{i}' for i in range(num_rows)],
    'academic_reputation': np.random.uniform(1, 5, num_rows),   # Scale from 1 to 5
    'graduation_rates': np.random.uniform(0, 100, num_rows),    # Percentage 0-100
    'retention_rates': np.random.uniform(0, 100, num_rows),     # Percentage 0-100
    'faculty_resources': np.random.uniform(1, 10, num_rows),    # Scale from 1 to 10
    'student_to_faculty_ratio': np.random.uniform(5, 50, num_rows),  # Ratio 5-50
    'research_output': np.random.uniform(1, 100, num_rows),     # Scale from 1 to 100
    'financial_resources': np.random.uniform(1e6, 1e8, num_rows),   # Scale from 1 million to 100 million
    'student_satisfaction': np.random.uniform(1, 5, num_rows),  # Scale from 1 to 5
    'employment_outcomes': np.random.uniform(0, 100, num_rows), # Percentage 0-100
    # Assuming we have some ground truth scores for training
    'true_scores': np.random.uniform(0, 100, num_rows)          # Scale from 0 to 100
}

# Create DataFrame
df = pd.DataFrame(data)

# Split the data into training and testing sets
X = df.drop(columns=['college_name', 'true_scores'])
y = df['true_scores']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform on training data
X_train_scaled = scaler.fit_transform(X_train)

# Initialize RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model on scaled training data
model.fit(X_train_scaled, y_train)

# Transform the entire dataset using the fitted scaler
X_scaled = scaler.transform(X)

# Predict scores for the entire dataset
df['predicted_score'] = model.predict(X_scaled)

# Sort colleges by predicted score in descending order
ranked_colleges = df.sort_values(by='predicted_score', ascending=False)

# Add a ranking column
ranked_colleges['rank'] = range(1, len(ranked_colleges) + 1)

# Display only college_name and rank for the top 10 colleges
top_colleges = ranked_colleges[['college_name', 'rank']].head(10)
print(top_colleges)

# Testing new data (10 colleges in your city)
new_data = {
    'college_name': ['New_College_1', 'New_College_2', 'New_College_3', 'New_College_4', 'New_College_5',
                     'New_College_6', 'New_College_7', 'New_College_8', 'New_College_9', 'New_College_10'],
    'academic_reputation': np.random.uniform(1, 5, 10),   # Scale from 1 to 5
    'graduation_rates': np.random.uniform(0, 100, 10),    # Percentage 0-100
    'retention_rates': np.random.uniform(0, 100, 10),     # Percentage 0-100
    'faculty_resources': np.random.uniform(1, 10, 10),    # Scale from 1 to 10
    'student_to_faculty_ratio': np.random.uniform(5, 50, 10),  # Ratio 5-50
    'research_output': np.random.uniform(1, 100, 10),     # Scale from 1 to 100
    'financial_resources': np.random.uniform(1e6, 1e8, 10),   # Scale from 1 million to 100 million
    'student_satisfaction': np.random.uniform(1, 5, 10),  # Scale from 1 to 5
    'employment_outcomes': np.random.uniform(0, 100, 10)   # Percentage 0-100
}

# Create DataFrame for new data
df_new = pd.DataFrame(new_data)

# Prepare new data for prediction (drop 'college_name')
X_new = df_new.drop(columns=['college_name'])

# Transform new data using the fitted scaler
X_new_scaled = scaler.transform(X_new)

# Predict scores for new data
df_new['predicted_score'] = model.predict(X_new_scaled)

# Sort new colleges by predicted score in descending order
ranked_new_colleges = df_new.sort_values(by='predicted_score', ascending=False)

# Add a ranking column for new colleges
ranked_new_colleges['rank'] = range(1, len(ranked_new_colleges) + 1)

# Display only college_name and rank for the new colleges
print("\nTop 10 Colleges in Your City (Predicted):\n")
print(ranked_new_colleges[['college_name', 'rank']].head(10))


      college_name  rank
5525  College_5525     1
5166  College_5166     2
8347  College_8347     3
9540  College_9540     4
1930  College_1930     5
8092  College_8092     6
1468  College_1468     7
1826  College_1826     8
5154  College_5154     9
1635  College_1635    10

Top 10 Colleges in Your City (Predicted):

     college_name  rank
2   New_College_3     1
4   New_College_5     2
3   New_College_4     3
5   New_College_6     4
6   New_College_7     5
9  New_College_10     6
8   New_College_9     7
1   New_College_2     8
7   New_College_8     9
0   New_College_1    10


In [29]:
import pandas as pd
import numpy as np

# Assuming you have already defined 'data' and created 'df' as in your example

# Define the number of rows
num_rows = 10000

# Generating dummy data for each feature
data = {
    'college_name': [f'College_{i}' for i in range(num_rows)],
    'academic_reputation': np.random.uniform(1, 5, num_rows),   # Scale from 1 to 5
    'graduation_rates': np.random.uniform(0, 100, num_rows),    # Percentage 0-100
    'retention_rates': np.random.uniform(0, 100, num_rows),     # Percentage 0-100
    'faculty_resources': np.random.uniform(1, 10, num_rows),    # Scale from 1 to 10
    'student_to_faculty_ratio': np.random.uniform(5, 50, num_rows),  # Ratio 5-50
    'research_output': np.random.uniform(1, 100, num_rows),     # Scale from 1 to 100
    'financial_resources': np.random.uniform(1e6, 1e8, num_rows),   # Scale from 1 million to 100 million
    'student_satisfaction': np.random.uniform(1, 5, num_rows),  # Scale from 1 to 5
    'employment_outcomes': np.random.uniform(0, 100, num_rows), # Percentage 0-100
    'true_scores': np.random.uniform(0, 100, num_rows)          # Scale from 0 to 100
}

# Create DataFrame
df = pd.DataFrame(data)

# Save DataFrame to CSV file
df.to_csv('dummy_college_data.csv', index=False)

print("DataFrame saved to CSV successfully.")


DataFrame saved to CSV successfully.


In [32]:
import pandas as pd
import numpy as np

# Define the number of rows
num_rows = 10  # Adjusted to 10 for 10 colleges

# Generating dummy data for each feature
data = {
    'college_name': [f'College_{i}' for i in range(num_rows)],
    'academic_reputation': np.random.uniform(1, 5, num_rows),   # Scale from 1 to 5
    'graduation_rates': np.random.uniform(0, 100, num_rows),    # Percentage 0-100
    'retention_rates': np.random.uniform(0, 100, num_rows),     # Percentage 0-100
    'faculty_resources': np.random.uniform(1, 10, num_rows),    # Scale from 1 to 10
    'student_to_faculty_ratio': np.random.uniform(5, 50, num_rows),  # Ratio 5-50
    'research_output': np.random.uniform(1, 100, num_rows),     # Scale from 1 to 100
    'financial_resources': np.random.uniform(1e6, 1e8, num_rows),   # Scale from 1 million to 100 million
    'student_satisfaction': np.random.uniform(1, 5, num_rows),  # Scale from 1 to 5
    'employment_outcomes': np.random.uniform(0, 100, num_rows), # Percentage 0-100
    'true_scores': np.random.uniform(0, 100, num_rows)          # Scale from 0 to 100
}

# Create DataFrame
df = pd.DataFrame(data)

# Save DataFrame to CSV file
df.to_csv('dummy_college_data_10.csv', index=False)

print("DataFrame saved to CSV successfully.")


DataFrame saved to CSV successfully.


In [30]:
import pandas as pd
import numpy as np

# Define new data dictionary
new_data = {
    'college_name': ['New_College_1', 'New_College_2', 'New_College_3', 'New_College_4', 'New_College_5',
                     'New_College_6', 'New_College_7', 'New_College_8', 'New_College_9', 'New_College_10'],
    'academic_reputation': np.random.uniform(1, 5, 10),   # Scale from 1 to 5
    'graduation_rates': np.random.uniform(0, 100, 10),    # Percentage 0-100
    'retention_rates': np.random.uniform(0, 100, 10),     # Percentage 0-100
    'faculty_resources': np.random.uniform(1, 10, 10),    # Scale from 1 to 10
    'student_to_faculty_ratio': np.random.uniform(5, 50, 10),  # Ratio 5-50
    'research_output': np.random.uniform(1, 100, 10),     # Scale from 1 to 100
    'financial_resources': np.random.uniform(1e6, 1e8, 10),   # Scale from 1 million to 100 million
    'student_satisfaction': np.random.uniform(1, 5, 10),  # Scale from 1 to 5
    'employment_outcomes': np.random.uniform(0, 100, 10)   # Percentage 0-100
}

# Create DataFrame for new data
df_new = pd.DataFrame(new_data)

# Save DataFrame to CSV file
df_new.to_csv('new_college_data.csv', index=False)

print("New DataFrame saved to CSV successfully.")


New DataFrame saved to CSV successfully.


In [31]:


import pandas as pd
import numpy as np

# Define new data dictionary including true_scores
new_data = {
    'college_name': ['New_College_1', 'New_College_2', 'New_College_3', 'New_College_4', 'New_College_5',
                     'New_College_6', 'New_College_7', 'New_College_8', 'New_College_9', 'New_College_10'],
    'academic_reputation': np.random.uniform(1, 5, 10),   # Scale from 1 to 5
    'graduation_rates': np.random.uniform(0, 100, 10),    # Percentage 0-100
    'retention_rates': np.random.uniform(0, 100, 10),     # Percentage 0-100
    'faculty_resources': np.random.uniform(1, 10, 10),    # Scale from 1 to 10
    'student_to_faculty_ratio': np.random.uniform(5, 50, 10),  # Ratio 5-50
    'research_output': np.random.uniform(1, 100, 10),     # Scale from 1 to 100
    'financial_resources': np.random.uniform(1e6, 1e8, 10),   # Scale from 1 million to 100 million
    'student_satisfaction': np.random.uniform(1, 5, 10),  # Scale from 1 to 5
    'employment_outcomes': np.random.uniform(0, 100, 10),  # Percentage 0-100
    'true_scores': np.random.uniform(0, 100, 10)          # Scale from 0 to 100
}

# Create DataFrame for new data
df_new = pd.DataFrame(new_data)

# Save DataFrame to CSV file
df_new.to_csv('utu.csv', index=False)

print("New DataFrame saved to CSV successfully.")


New DataFrame saved to CSV successfully.
