#  Graduate Admissions

## Data pre-processing and exploratory analysis

In [None]:
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
%matplotlib inline
warnings.filterwarnings('ignore')

### Importing data

In [None]:
data_source = "./data/Admission_Predict_Ver1.1.csv"
df = pd.read_csv(data_source)

### Getting to know data

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.head()

### Features
- gre_score
- toefl_score
- university_rating
- sop
- lor
- cgpa
- research

### Target
- chance_of_admit

In [None]:
# Normalise the column labels: lower-case them and replace spaces with
# underscores, then rename the two labels that end in an underscore.
df.columns = [label.lower().replace(" ", "_") for label in df.columns]
# NOTE: index=str also converts every row label to a string.
df = df.rename(
    index=str,
    columns={"lor_": "lor", "chance_of_admit_": "chance_of_admit"},
)

In [None]:
# The "serial_no." column is not needed, so remove it.
df = df.drop('serial_no.', axis=1)

In [None]:
df.head()

In [None]:
# Save the DataFrame in its current state to data/df.pkl.
df.to_pickle(r'data/df.pkl')

## Exploratory data analysis

### Using a pairplot to visualize distribution of data and correlation among columns
- As we can see, toefl_score, gre_score and cgpa are linearly related to one another and to the target chance_of_admit
- That means applicants who scored higher on the GRE also tended to score higher on the TOEFL, and vice versa
- Applicants with a higher CGPA tend to score higher on the GRE and TOEFL
- Higher GRE and TOEFL scores mean a higher chance of admission

In [None]:
# sns.pairplot(df, hue = 'chance_of_admit') # commented out to speed up execution

### Correlation matrix
- Darker squares mean stronger correlation
- The best-case scenario is many features that correlate strongly with the target but only weakly with each other

In [None]:
# Pairwise correlation between the DataFrame's columns.
corr = df.corr()

fig = plt.figure()
ax = fig.add_subplot(111)
# Pin the colour scale to [-1, 1] so a given colour always means the same value.
cax = ax.matshow(corr, cmap='coolwarm', vmin=-1, vmax=1)
fig.colorbar(cax)

# Take tick positions and labels from the matrix that is actually drawn;
# labels from df.columns would be misaligned if corr() left any column out.
ticks = np.arange(len(corr.columns))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
# Rotate while setting the labels so the rotation applies to the final text.
ax.set_xticklabels(corr.columns, rotation=90)
ax.set_yticklabels(corr.columns)
plt.show()

## Train-Test Split

Split the data into the matrix of independent variables (X) and the dependent-variable vector (y)

In [None]:
# Features: every column except the last one.
X = df.iloc[:, :-1].values
# Target: the last column, selected relative to the end so it always matches
# the slice used for X instead of relying on a hard-coded position (7).
y = df.iloc[:, -1].values

Split data 

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

A couple of utility functions to plot the results from each model.

In [None]:
def plot_histogram(y_true,y_pred):
    """Overlay normalised histograms of the actual and the predicted values.

    Args:
        y_true: array-like of observed target values.
        y_pred: array-like of predicted values; drawn semi-transparent on top.
    """
    plt.figure(figsize=(12,4))
    # `normed` was removed from matplotlib's hist; `density` is its replacement.
    # plt.hist is used because plain NumPy arrays have no .hist() method.
    plt.hist(np.ravel(y_true), bins=22, density=True)
    plt.hist(np.ravel(y_pred), bins=22, density=True, alpha=0.5)
    
    
def plot_best_fit(y_true, y_pred, model_name):
    """Scatter predictions against actual values with a fitted line and bounds.

    A straight line is fitted to (actual, predicted). The absolute residuals
    from that line are then fitted with a quadratic in the actual value, and
    the line plus/minus that quadratic is drawn as dashed bounds.

    Args:
        y_true: 1-D array-like of observed values (x axis).
        y_pred: 1-D array-like of predicted values (y axis).
        model_name: label inserted into the y-axis caption.
    """
    plt.figure(figsize=(12,8))

    # Sort by the actual value so the dashed curves are drawn left to right
    # instead of zig-zagging between points in sample order.
    x_series = np.asarray(y_true, dtype=float).ravel()
    y_series = np.asarray(y_pred, dtype=float).ravel()
    order = np.argsort(x_series)
    x_series = x_series[order]
    y_series = y_series[order]

    # Scatter plot of predicted vs. actual values.
    plt.xlim(0,1.1)
    plt.ylim(0,1.1)
    plt.scatter(x_series, y_series, s=30, alpha=0.2, marker='o')

    # Least-squares line of best fit.
    slope, intercept = np.polyfit(x_series, y_series, 1)
    fitted = slope * x_series + intercept
    xl = x_series[[0, -1]]
    yl = slope * xl + intercept

    # Error bounds: quadratic fit of |residual| as a function of the actual value.
    spread_coeffs = np.polyfit(x_series, np.abs(fitted - y_series), 2)
    spread = np.polyval(spread_coeffs, x_series)

    plt.plot(xl, yl, 'r')
    plt.plot(x_series, fitted - spread, '--r')
    plt.plot(x_series, fitted + spread, '--r')

    plt.xlabel("Actual Admissions")
    # Label typo ("Predictoins") corrected.
    plt.ylabel("{} Predictions".format(model_name))

# Modeling


## Linear Regression

In [None]:
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error

# NOTE: these are plain references, not copies - any in-place change made
# through X_train_lin_reg / y_train_lin_reg also changes X_train / y_train.
X_train_lin_reg = X_train
y_train_lin_reg = y_train

Principal function that outputs the final optimized theta

In [None]:
def linear_regression(X, y, alpha, num_iters):
    """Fit a linear model to (X, y) by gradient descent.

    Args:
        X: 2-D feature array, one row per sample, without a bias column.
        y: target values, one per row of X.
        alpha: learning rate handed to gradient_descent.
        num_iters: number of gradient-descent iterations.

    Returns:
        The (theta, cost) pair produced by gradient_descent.
    """
    n_samples, n_features = X.shape[0], X.shape[1]
    # Prepend a column of ones so that theta[0] acts as the intercept.
    design = np.concatenate((np.ones((n_samples, 1)), X), axis=1)
    # Start from all-zero parameters and the predictions they produce.
    start_theta = np.zeros(n_features + 1)
    start_h = hypothesis(start_theta, design, n_features)
    fitted_theta, cost_history = gradient_descent(
        start_theta, alpha, num_iters, start_h, design, y, n_features
    )
    return fitted_theta, cost_history

Function that calculates and outputs the hypothesis value of the target variable

In [None]:
def hypothesis(theta, X, n):
    """Return the linear-model prediction for every row of X.

    Args:
        theta: n + 1 parameters in any shape that reshapes to (n + 1,),
            e.g. (n + 1,) or (1, n + 1).
        X: array of shape (m, n + 1); each row is one sample.
        n: number of features, i.e. one less than the number of columns of X.

    Returns:
        1-D float array of length m holding the dot product of theta with
        each row of X.
    """
    theta = np.asarray(theta, dtype=float).reshape(n + 1)
    X = np.asarray(X, dtype=float)
    # One matrix-vector product replaces the per-row Python loop and the
    # float() call on a one-element array, which recent NumPy deprecates.
    return np.matmul(X, theta).reshape(X.shape[0])

Function that performs the gradient descent algorithm

In [None]:
def gradient_descent(theta, alpha, num_iters, h, X, y, n):
    """Minimise the halved mean squared error with gradient descent.

    Every iteration updates all n + 1 parameters from the same residual
    (h - y), then recomputes the predictions and records the cost.

    Args:
        theta: initial parameters, n + 1 values; the caller's array is left
            unmodified.
        alpha: learning rate.
        num_iters: number of update steps.
        h: predictions for the initial theta, one per row of X.
        X: array of shape (m, n + 1); column 0 is treated as the bias column.
        y: target values, length m.
        n: number of features, excluding the bias term.

    Returns:
        (theta, cost): theta has shape (1, n + 1); cost is a 1-D array of
        length num_iters with cost[i] = sum((h - y)**2) / (2 * m) measured
        after update i.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).reshape(-1)
    # Work on a float copy so the caller's theta is not changed in place.
    theta = np.array(theta, dtype=float).reshape(n + 1)
    residual = np.asarray(h, dtype=float).reshape(-1) - y
    m = X.shape[0]
    cost = np.ones(num_iters)
    for i in range(num_iters):
        gradient = np.matmul(X.T, residual)
        # The intercept gradient is the plain residual sum, i.e. column 0 of X
        # is treated as all ones regardless of its actual content.
        gradient[0] = residual.sum()
        theta -= (alpha / m) * gradient
        residual = np.matmul(X, theta) - y
        cost[i] = (1 / m) * 0.5 * np.sum(np.square(residual))
    return theta.reshape(1, n + 1), cost

Feature scaling

In [None]:
# Standardise every feature column to zero mean and unit standard deviation.
# The slice assignments write into the existing arrays, so any other name
# bound to the same array objects sees the scaled values as well.
mean_train = X_train_lin_reg.mean(axis=0)
std_train = X_train_lin_reg.std(axis=0)
X_train_lin_reg[:] = (X_train_lin_reg - mean_train) / std_train

# The test features are transformed with the *training* mean/std so both sets
# share one scale and the theta learned on the training scale applies to them.
# The test-set statistics are still computed, but only for inspection.
mean_test = X_test.mean(axis=0)
std_test = X_test.std(axis=0)
X_test[:] = (X_test - mean_train) / std_train

Calling the principal function with learning_rate = 0.01

In [None]:
num_iters = 1000
theta, cost = linear_regression(X_train_lin_reg, y_train_lin_reg, 0.01, num_iters)

The reduction in the cost 

In [None]:
import matplotlib.pyplot as plt

# Plot the cost recorded at each gradient-descent iteration (1-based x axis).
cost = list(cost)
n_iterations = list(range(1, num_iters + 1))
plt.plot(n_iterations, cost)
plt.xlabel('No. of iterations')
plt.ylabel('Cost')

Predictions:

In [None]:
# Prepend the bias column of ones, then predict with the fitted parameters.
X_test_lin_reg = np.hstack((np.ones((len(X_test), 1)), X_test))
predictions_lin_reg = hypothesis(theta, X_test_lin_reg, X_test.shape[1])

In [None]:
print(np.sqrt(metrics.mean_squared_error(y_test, predictions_lin_reg)))

Plot predictions and test data 

In [None]:
plt.plot(y_test.ravel(), '-', predictions_lin_reg, '-')
plt.title('Linear regression')
plt.xlabel('Number of Test Samples') 
plt.ylabel('Chance of Admission')

Test accuracy:

In [None]:
# "Accuracy" here is 100 * (1 - mean absolute error) on the test set.
output_error = y_test - predictions_lin_reg
error_lin_reg = np.abs(output_error).mean()
accuracy_lin_reg = 100 * (1 - error_lin_reg)

print("Test Accuracy {}%".format(round(accuracy_lin_reg, 2)))

Save results to csv file

In [None]:
import os

# Actual value, prediction and absolute difference for every test sample.
output_col_order = ['ChanceOfAdmit', 'Predicted', 'Delta']
result_lin_reg = pd.DataFrame({'ChanceOfAdmit':y_test,
                               'Predicted':predictions_lin_reg,
                               'Delta': abs(y_test - predictions_lin_reg)})

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('results', exist_ok=True)
result_lin_reg[output_col_order].to_csv('results/LinearRegression.csv', index=False)

### Feature engineering # 1

We chose the 'gre score', 'toefl score' and 'cgpa' columns, which seemed the most important for getting admitted

In [None]:
# Feature subset #1: GRE score, TOEFL score and CGPA, as a NumPy array.
X_lin_reg_fe1 = df[['gre_score', 'toefl_score', 'cgpa']].values
# The target is the same vector used for the full model.
y_lin_reg_fe1 = y

Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split
X_train_lin_reg_fe1, X_test_lin_reg_fe1, y_train_lin_reg_fe1, y_test_lin_reg_fe1 = train_test_split(X_lin_reg_fe1, y_lin_reg_fe1, test_size=0.2, random_state = 42)

Feature scaling

In [None]:
# Standardise the three selected features using the training statistics.
mean_train_fe1 = X_train_lin_reg_fe1.mean(axis=0)
std_train_fe1 = X_train_lin_reg_fe1.std(axis=0)
X_train_lin_reg_fe1[:] = (X_train_lin_reg_fe1 - mean_train_fe1) / std_train_fe1

# Test statistics get their own *_fe1 names (they are no longer written into
# the unrelated mean_test / std_test arrays) and are kept for inspection only:
# the test features are scaled with the training mean/std so both sets share
# the scale that theta is learned on.
mean_test_fe1 = X_test_lin_reg_fe1.mean(axis=0)
std_test_fe1 = X_test_lin_reg_fe1.std(axis=0)
X_test_lin_reg_fe1[:] = (X_test_lin_reg_fe1 - mean_train_fe1) / std_train_fe1

Calling the principal function with learning_rate = 0.01

In [None]:
num_iters = 1000
theta_reg_fe1, cost_reg_fe1 = linear_regression(X_train_lin_reg_fe1, y_train_lin_reg_fe1, 0.01, num_iters)

Predictions:

In [None]:
# Prepend the bias column of ones, then predict with the fitted parameters.
# NOTE: the test array is rebound with the extra column, so running this cell
# a second time would prepend another one.
X_test_lin_reg_fe1 = np.hstack(
    (np.ones((len(X_test_lin_reg_fe1), 1)), X_test_lin_reg_fe1)
)
predictions_lin_reg_fe1 = hypothesis(
    theta_reg_fe1, X_test_lin_reg_fe1, X_test_lin_reg_fe1.shape[1] - 1
)

In [None]:
# RMSE on the test set, scored against the targets produced by this model's
# own train/test split rather than the y_test of the full-feature split.
print(np.sqrt(metrics.mean_squared_error(y_test_lin_reg_fe1, predictions_lin_reg_fe1)))

In [None]:
plt.plot(y_test.ravel(), '-', predictions_lin_reg_fe1, '-')
plt.title('Linear regression')
plt.xlabel('Number of Test Samples') 
plt.ylabel('Chance of Admission')

In [None]:
# Compare against the targets from this model's own split, not the y_test
# of the full-feature split.
output_error_fe1 = y_test_lin_reg_fe1 - predictions_lin_reg_fe1

# "Accuracy" here is 100 * (1 - mean absolute error) on the test set.
error_lin_reg_fe1 = np.mean(np.abs(output_error_fe1))
accuracy_lin_reg_fe1 = (1 - error_lin_reg_fe1) * 100

print("Test Accuracy " + str(round(accuracy_lin_reg_fe1,2)) + "%")

### Feature engineering # 2

We chose only the 'gre score' and 'cgpa' columns, which seemed the most important for getting admitted

In [None]:
# Feature subset #2: GRE score and CGPA only, as a NumPy array.
X_lin_reg_fe2 = df[['gre_score', 'cgpa']].values
# The target is the same vector used for the full model.
y_lin_reg_fe2 = y

Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split
X_train_lin_reg_fe2, X_test_lin_reg_fe2, y_train_lin_reg_fe2, y_test_lin_reg_fe2 = train_test_split(X_lin_reg_fe2, y_lin_reg_fe2, test_size=0.2, random_state = 42)

Feature scaling

In [None]:
# Standardise the two selected features using the training statistics.
mean_train_fe2 = X_train_lin_reg_fe2.mean(axis=0)
std_train_fe2 = X_train_lin_reg_fe2.std(axis=0)
X_train_lin_reg_fe2[:] = (X_train_lin_reg_fe2 - mean_train_fe2) / std_train_fe2

# Test statistics get their own *_fe2 names (they are no longer written into
# the unrelated mean_test / std_test arrays) and are kept for inspection only:
# the test features are scaled with the training mean/std so both sets share
# the scale that theta is learned on.
mean_test_fe2 = X_test_lin_reg_fe2.mean(axis=0)
std_test_fe2 = X_test_lin_reg_fe2.std(axis=0)
X_test_lin_reg_fe2[:] = (X_test_lin_reg_fe2 - mean_train_fe2) / std_train_fe2

Calling the principal function with learning_rate = 0.01

In [None]:
num_iters = 1000
theta_reg_fe2, cost_reg_fe2 = linear_regression(X_train_lin_reg_fe2, y_train_lin_reg_fe2, 0.01, num_iters)

In [None]:
import matplotlib.pyplot as plt

# Plot the cost recorded at each gradient-descent iteration (1-based x axis).
cost_reg_fe2 = list(cost_reg_fe2)
n_iterations = list(range(1, num_iters + 1))
plt.plot(n_iterations, cost_reg_fe2)
plt.xlabel('No. of iterations')
plt.ylabel('Cost')

Predictions:

In [None]:
# Prepend the bias column of ones, then predict with the fitted parameters.
# NOTE: the test array is rebound with the extra column, so running this cell
# a second time would prepend another one.
X_test_lin_reg_fe2 = np.hstack(
    (np.ones((len(X_test_lin_reg_fe2), 1)), X_test_lin_reg_fe2)
)
predictions_lin_reg_fe2 = hypothesis(
    theta_reg_fe2, X_test_lin_reg_fe2, X_test_lin_reg_fe2.shape[1] - 1
)

In [None]:
# RMSE on the test set, scored against the targets produced by this model's
# own train/test split rather than the y_test of the full-feature split.
print(np.sqrt(metrics.mean_squared_error(y_test_lin_reg_fe2, predictions_lin_reg_fe2)))

In [None]:
plt.plot(y_test.ravel(), '-', predictions_lin_reg_fe2, '-')
plt.title('Linear regression')
plt.xlabel('Number of Test Samples') 
plt.ylabel('Chance of Admission')

In [None]:
# Compare against the targets from this model's own split, not the y_test
# of the full-feature split.
output_error_fe2 = y_test_lin_reg_fe2 - predictions_lin_reg_fe2

# "Accuracy" here is 100 * (1 - mean absolute error) on the test set.
error_lin_reg_fe2 = np.mean(np.abs(output_error_fe2))
accuracy_lin_reg_fe2 = (1 - error_lin_reg_fe2) * 100

print("Test Accuracy " + str(round(accuracy_lin_reg_fe2,2)) + "%")