In [12]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from math import sqrt

## **1- Load the datasets**

In [13]:
# I will keep the datasets seperate so i wont risk the leakage of the lables into the
# data and eventuly I will have to split them to train the sci-kit model

X_test_url = 'https://raw.githubusercontent.com/jazzbits/veritas/main/X_test.csv'
X_train_url = 'https://raw.githubusercontent.com/jazzbits/veritas/main/X_train.csv'
y_train_url = 'https://raw.githubusercontent.com/jazzbits/veritas/main/y_train.csv'

X_test = pd.read_csv(X_test_url, index_col=0)
X_train = pd.read_csv(X_train_url, index_col=0)
y_train = pd.read_csv(y_train_url, index_col=0)

## **2- Data Exploration/Preprocessing**

In [14]:
# Drop the Application.Deadline column. We can extract relevany information
# for example month, day and year but It does not hold predictive values
X_train = X_train.drop('Application.Deadline', axis=1)

# looks like data at index 119 is inconsistent so I will remove it
X_train = X_train.drop([119])
y_train = y_train.drop([119])

# It also looks like the Column 'Earn' has non numeric data. the errors='coerce'
# option will replace any non numeric data with 'nan'
X_train['Earn'] = pd.to_numeric(X_train['Earn'], errors='coerce')


## **3. Outlier Detection**

In [15]:
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = ['Ownership', 'Citytype']


# for col in numerical_cols:
#     Q1 = X_train[col].quantile(0.25)
#     Q3 = X_train[col].quantile(0.75)
#     IQR = Q3 - Q1

#     lower_bound = Q1 - 1.5 * IQR
#     upper_bound = Q3 + 1.5 * IQR

#     outliers = X_train[(X_train[col] < lower_bound) | (X_train[col] > upper_bound)]

#     if not outliers.empty:
#         plt.figure(figsize=(10, 6))
#         plt.boxplot(X_train[col].dropna())
#         plt.title(f'Box Plot of {col}')
#         plt.ylabel(col)
#         plt.show()


# The following features have few outliers
# 'CrimeRate', 'FBI.CrimeRate', 'FBI.TotalCrime' has only few but
# 'Earn','Enrollment' there are lots of outliers.

# I will not remove the 'outliers' becasue it could be a variability or actual data especially
# we are dealing with Earnings and Enrollment. for example and Earnign of 120k is reasonable
# and cannot be removed from the data

# Also the 'Majors' features is supposed to be binary but i found instances of the number 2
# I will keep it as it might indicate a dual major

## **4. Impute missing values**

In [16]:
# Iam trying to get the count of the missing values in each column
nan_counts = X_train.isna().sum()

# Look like SAT and ACT are missing a lot of and almost everyone
# who missed the SAT missed the ACT.


columns_to_impute = ['SAT', 'ACT', 'Earn', 'AvgCost', 'ADMrate']
for col in columns_to_impute:
    X_train[col] = SimpleImputer(strategy='mean').fit_transform(X_train[[col]])

# correlation_matrix = X_train[numerical_cols].corr()
# plt.figure(figsize=(10, 8))
# sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=.5)
# plt.title("Correlation Matrix")
# plt.show()

# ACT and SAT are highly corrolated even after nan imputation.
# I will drop the ACT since it has more missing values
X_train = X_train.drop('ACT', axis=1)



## **5. One hot enconding**

In [17]:
# The dataset contains categorical columns
X_train = pd.get_dummies(X_train, columns=categorical_cols)

## **6. Model selection and fine tuning:**

In [18]:
# I will use  Random forest
model = RandomForestRegressor(random_state=42)

# I will use gridsearch in order to fine tune the hyperparameters for the Regressor
# and I will combine it with cross validation so i can evaluate the performace of the model

# 1- Define the hyperparameter values for the grid
grid_param = {
                'n_estimators': [100, 200, 300],
                'max_depth': [None, 10, 20, 30],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
              }

# 2- Configure the grid search
grid_search = GridSearchCV( estimator = model,  # RandomForest
			                      param_grid = grid_param,  # this is the grid of parameters I defined above
                            cv = 3, #Cross validation with 3 folds
			                      n_jobs = -1, # use all CPU cores
                            verbose = 0, # how much information gets printed out - 0 = moderate
                            scoring = 'neg_mean_squared_error' # Method used for evelualting the model performace
                           )

# 3- computation
grid_search.fit(X_train, y_train.values.ravel())

# 4- print the best parameters
print("The Best parameters from grid search:", grid_search.best_params_)

# 5-  I will use the model fine tuned with the best parameters from grid search
best_model = grid_search.best_estimator_

The Best parameters from grid search: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}


## **7. Model Performace Evaluation**




In [19]:
# 1- Get the model predicted values on the traing dataset
predictions = best_model.predict(X_train)

# 2- Get the mean squared error
mean_sq_err = mean_squared_error(y_train, predictions)
print("Mean Squared Error of the best model:", mean_sq_err)

# 3- Get the root mean squared error
root_mean_sq_err = sqrt(mean_sq_err)
print("       Root Mean Squared Error Score:", root_mean_sq_err)

# 4- Get the R² Score
r_square = r2_score(y_train, predictions)
print("                        The R² Score:", r_square)


Mean Squared Error of the best model: 4.066657940945877
       Root Mean Squared Error Score: 2.0165956314903286
                        The R² Score: 0.9028603927248733


## **8. Start prediction on the test dataset**

In [20]:
X_test = X_test.drop('Application.Deadline', axis=1)
X_test = X_test.drop('ACT', axis=1)
columns_to_impute = ['SAT', 'Earn', 'AvgCost', 'ADMrate','Enrollment']

for col in columns_to_impute:
    X_test[col] = SimpleImputer(strategy='mean').fit_transform(X_test[[col]])

X_test = pd.get_dummies(X_test, columns=categorical_cols)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [21]:
test_predictions = best_model.predict(X_test)
predictions_df = pd.DataFrame(test_predictions, index=X_test.index, columns=['prediction'])
print(predictions_df)

     prediction
14    52.528588
109   56.933520
159   44.331564
162   56.310250
253   52.385339
..          ...
98    48.006835
142   52.434833
62    54.330706
203   59.432126
33    52.558417

[90 rows x 1 columns]


In [22]:
# from google.colab import drive
# drive.mount('/content/drive')
# path = '/content/drive/My Drive/Colab Notebooks/Veritas/Adam_Farhat_14_DubaiAmericanAcademy.csv'
# predictions_df.to_csv(path, index=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
