In [10]:
# Location of the admissions CSV that the helper functions in this notebook
# load with pandas (it is read with header=None, i.e. without a header row).
csv_filepath = "admissions_data.csv"

In [11]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
from fuzzywuzzy import process
from bokeh.resources import CDN
from bokeh.embed import components
from bokeh.models import Legend
from bokeh.plotting import *
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn import svm

In [12]:
def get_corrected_uni_name(college_name, min_score=80):
    """Map a free-text university name onto the closest name in the dataset.

    Candidate names are the distinct values of the first column of the CSV at
    ``csv_filepath``; the best candidate is chosen with
    ``fuzzywuzzy.process.extractOne``. The number of distinct candidates is
    printed.

    Args:
        college_name: Name typed by the user, or ``None``.
        min_score: Lowest fuzzywuzzy score the best candidate may have and
            still be accepted. Defaults to 80.

    Returns:
        The university name as spelled in the dataset, or ``None`` when
        ``college_name`` is ``None``, when the dataset offers no candidate,
        or when the best candidate scores below ``min_score``.
    """
    if college_name is None:
        # Nothing to match, so there is no reason to read the CSV at all.
        return None

    column_names = ['uniNames', 'majors','degrees', 'seasons', 'decisions', 'gpa', 'verbal_scores', 'quant_scores', 'awa_scores','toefl_scores']
    data = pd.read_csv(csv_filepath, names=column_names, header=None)

    # Distinct names in first-seen order. Missing values are dropped because
    # they are not usable match candidates.
    all_unis = data.uniNames.dropna().unique().tolist()

    print(len(all_unis))

    current_uni = process.extractOne(college_name, all_unis)

    # extractOne yields None when there is no candidate to score; treat that
    # the same way as a match that is too weak.
    if current_uni is None or current_uni[1] < min_score:
        return None
    return current_uni[0]

In [13]:
def get_student_example(user_info):
    """Build the min-max scaled feature row for one applicant.

    Args:
        user_info: Mapping with the keys 'major', 'gpa', 'verbal_score',
            'quant_score', 'awa_score' and 'toefl_score'. The score values
            must be convertible with ``float``.

    Returns:
        A NumPy array of shape (1, 5) holding, in order, the scaled GPA,
        verbal, quant, AWA and TOEFL values.

    Raises:
        KeyError: If one of the keys is missing from ``user_info``.
        ValueError: If a score cannot be converted to ``float``.
    """
    # The major is looked up (so a missing key still raises KeyError) but it
    # is not part of the feature vector.
    _major = user_info['major']

    # (key in user_info, lower bound, upper bound), listed in feature order.
    score_ranges = (
        ('gpa', 0.25, 4),
        ('verbal_score', 130.0, 170.0),
        ('quant_score', 131.0, 170.0),
        ('awa_score', 0.30, 6.0),
        ('toefl_score', 57.0, 120.0),
    )

    # Convert every score first, then scale: (value - low) / (high - low).
    raw_scores = [float(user_info[key]) for key, _low, _high in score_ranges]
    scaled_scores = [
        (score - low) / (high - low)
        for score, (_key, low, high) in zip(raw_scores, score_ranges)
    ]

    # A single row, as expected by the classifiers' predict methods.
    return np.array(scaled_scores).reshape(1, -1)

In [14]:
def prediction_using_knn(query, user_info):
    """Predict an admission decision with a k-nearest-neighbours classifier.

    The rows of the CSV at ``csv_filepath`` whose 'Major' equals the
    applicant's major and whose 'University' equals ``query`` form the data
    set. Accuracy is estimated with 25-fold cross-validation over all of
    those rows; the model that produces the prediction is fitted on an
    80 % training split.

    Args:
        query: University name, compared for equality with the 'University'
            column.
        user_info: Mapping with the keys 'major', 'gpa', 'verbal_score',
            'quant_score', 'awa_score' and 'toefl_score'.

    Returns:
        ``[admission_prediction, admission_decision_accuracy]`` where the
        first item is the array returned by ``predict`` for the applicant and
        the second is the mean cross-validated accuracy as a percentage
        rounded to two decimals. ``None`` is returned when 25 or fewer
        (replicated) rows match.
    """
    intended_major = user_info['major']
    # The shared helper does the same parsing and min-max scaling this
    # function used to repeat inline.
    example = get_student_example(user_info)

    names = ['University', 'Major', 'Degree', 'Season', 'class', 'GPA',
             'Verbal', 'Quant', 'AWA', 'TOEFL']

    db = pd.read_csv(csv_filepath, names=names, header=None)
    db.fillna(0, inplace=True)

    matching_rows = db[(db['Major'] == intended_major) & (db['University'] == query)]
    # NOTE(review): every matching row is replicated six times, so identical
    # rows can sit on both sides of the train/validation splits below and the
    # reported accuracy is likely optimistic. Kept as-is to preserve results.
    db1 = pd.concat([matching_rows] * 6)

    if len(db1) <= 25:
        # Too few rows for 25-fold cross-validation with 15 neighbours.
        return None

    # ---create design matrix X and target vector y---
    x1 = np.array(db1.iloc[:, 5:10])  # columns GPA..TOEFL (end index is exclusive)
    y1 = np.array(db1['class'])

    # --split into train and test (only the training part is used)--
    x1_train, _x1_test, y1_train, _y1_test = train_test_split(
        x1, y1, test_size=0.20, random_state=4)

    # --instantiate learning model (k = 15)--
    knn_for_admission = KNeighborsClassifier(n_neighbors=15)
    scores = cross_val_score(knn_for_admission, x1, y1, cv=25, scoring='accuracy')
    cross_validated_accuracy = scores.mean()
    print('--------------------------------------------------')
    print('--Cross Validated Accuracy_KNN', cross_validated_accuracy)
    print('--------------------------------------------------')

    # --fitting the model--
    knn_for_admission.fit(x1_train, y1_train)

    # ----Predicting the admission by model----
    admission_prediction = knn_for_admission.predict(example)
    print('--------------------------------------------------')

    print("Admission_Prediction_KNN: ", admission_prediction)
    print('--------------------------------------------------')
    admission_decision_accuracy = float("%.2f" % (cross_validated_accuracy*100))

    return [admission_prediction, admission_decision_accuracy]

In [15]:
def prediction_using_random_forest(query, intended_major, example):
    """Predict an admission decision with a random forest classifier.

    The rows of the CSV at ``csv_filepath`` whose 'Major' equals
    ``intended_major`` and whose 'University' equals ``query`` form the data
    set; the forest is fitted on an 80 % training split of them.

    Args:
        query: University name, compared for equality with the 'University'
            column.
        intended_major: Major, compared for equality with the 'Major' column.
        example: Feature row(s) for the applicant in the column order
            GPA, Verbal, Quant, AWA, TOEFL (as built by
            ``get_student_example``).

    Returns:
        ``[admission_prediction, accepted_percent]`` where the first item is
        the array returned by ``predict`` and the second is the predicted
        probability of 'Accepted' as a percentage. Probabilities of 0.9 or
        more for either class are capped, giving 88 or 12 respectively.

    Raises:
        ValueError: If no row matches ``query`` and ``intended_major``.
    """
    names = ['University', 'Major', 'Degree', 'Season', 'class', 'GPA',
             'Verbal', 'Quant', 'AWA', 'TOEFL']
    feature_columns = ['GPA', 'Verbal', 'Quant', 'AWA', 'TOEFL']
    separator = '--------------------------------------------------'

    db = pd.read_csv(csv_filepath, names=names, header=None)
    db.fillna(0, inplace=True)
    matching_rows = db[(db['Major'] == intended_major) & (db['University'] == query)]
    if matching_rows.empty:
        # train_test_split would raise ValueError on an empty set anyway;
        # raise the same type with a message that names the real cause.
        raise ValueError(
            'no admission records for major %r at university %r'
            % (intended_major, query))

    # NOTE(review): every matching row is replicated six times; this inflates
    # the sample without adding information. Kept as-is to preserve results.
    db1 = pd.concat([matching_rows] * 6)
    db1 = db1.astype({column: float for column in feature_columns})

    # 1-D target: a column vector makes scikit-learn emit a
    # DataConversionWarning on fit.
    y = db1['class'].to_numpy()
    X = db1.loc[:, 'GPA':'TOEFL'].to_numpy()

    # splitting (only the training part is used)
    X_train, _X_test, y_train, _y_test = train_test_split(
        X, y, test_size=0.2, random_state=4)

    # Create a random forest classifier. By convention, clf means 'classifier'.
    # max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself
    # is no longer accepted by current scikit-learn releases.
    # NOTE(review): random_state=None makes the forest differ between runs.
    clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                                 max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                                 min_samples_leaf=1, min_samples_split=2,
                                 min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=2,
                                 oob_score=False, random_state=None, verbose=0,
                                 warm_start=False)

    # Train the classifier to learn how the training features relate to the
    # training y (the admission decision).
    clf.fit(X_train, y_train)

    admission_prediction = clf.predict(example)
    print(separator)
    print('Prediction_with_random_forest', admission_prediction)
    print(separator)

    prediction = clf.predict_proba(example)
    print('Accuracy_random_forest', prediction)
    print(separator)

    # Look the probabilities up by label instead of by position, so a
    # training split that lacks one of the labels cannot cause an IndexError
    # or silently swap the two values.
    probability_by_class = dict(zip(clf.classes_, prediction[0].tolist()))
    top_accepted = probability_by_class.get('Accepted', 0.0)
    top_rejected = probability_by_class.get('Rejected', 0.0)

    # Very confident predictions are capped at 88 % / 12 %.
    if top_accepted >= 0.9:
        accepted_percent = 88
    elif top_rejected >= 0.9:
        accepted_percent = 12
    else:
        accepted_percent = top_accepted * 100

    print('accepted percentage:',accepted_percent)

    return [admission_prediction, accepted_percent]

In [16]:
def prediction_using_SVM(csv_filepathname, query, intended_major, example):
    """Predict an admission decision with a linear support vector classifier.

    The rows of the CSV at ``csv_filepathname`` whose 'Major' equals
    ``intended_major`` and whose 'University' equals ``query`` form the data
    set; the model is fitted on an 80 % training split and scored on the
    remaining 20 %.

    Args:
        csv_filepathname: Path of the header-less admissions CSV.
        query: University name, compared for equality with the 'University'
            column.
        intended_major: Major, compared for equality with the 'Major' column.
        example: Feature row(s) for the applicant in the column order
            GPA, Verbal, Quant, AWA, TOEFL.

    Returns:
        ``[admission_prediction, decision_accuracy]`` where the first item is
        the array returned by ``predict`` and the second is the accuracy on
        the test split as a fraction between 0 and 1 (not a percentage).

    Raises:
        ValueError: If no row matches ``query`` and ``intended_major``.
    """
    names = ['University', 'Major', 'Degree', 'Season', 'class', 'GPA',
             'Verbal', 'Quant', 'AWA', 'TOEFL']
    feature_columns = ['GPA', 'Verbal', 'Quant', 'AWA', 'TOEFL']
    separator = '--------------------------------------------------'

    df = pd.read_csv(csv_filepathname, names=names, header=None)
    # Same missing-value handling as the KNN and random-forest predictors;
    # without it a missing score makes SVC.fit fail.
    df = df.fillna(0)

    matching_rows = df[(df['Major'] == intended_major) & (df['University'] == query)]
    if matching_rows.empty:
        # train_test_split would raise ValueError on an empty set anyway;
        # raise the same type with a message that names the real cause.
        raise ValueError(
            'no admission records for major %r at university %r'
            % (intended_major, query))

    # NOTE(review): every matching row is replicated six times, so identical
    # rows can appear in both the training and the test split and the
    # reported accuracy is likely optimistic. Kept as-is to preserve results.
    dataframe = pd.concat([matching_rows] * 6)
    dataframe = dataframe.astype({column: float for column in feature_columns})

    # create design matrix X and target vector y
    # (1-D target: a column vector triggers a DataConversionWarning on fit)
    y = dataframe['class'].to_numpy()
    X = dataframe.loc[:, 'GPA':'TOEFL'].to_numpy()

    # splitting
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

    # instantiate learning model
    clf = svm.SVC(kernel='linear', C=0.001)

    clf.fit(X_train, y_train)

    # predict
    prediction = clf.predict(X_test)
    print(separator)

    decision_accuracy = clf.score(X_test, y_test)
    print('Accuracy_with_SVM', decision_accuracy)
    print(separator)

    admission_prediction = clf.predict(example)
    print('Prediction_with_SVM', admission_prediction)
    print(separator)

    # Contingency matrix (rows: true label, columns: predicted label).
    # Pinning the labels keeps its shape fixed even when the test split
    # happens to contain a single class.
    mat = metrics.confusion_matrix(y_test, prediction, labels=clf.classes_)
    print(mat)
    # Accuracy = correctly classified (the diagonal) / all test samples.
    accuracy = float(np.trace(mat)) / float(mat.sum())
    print(accuracy)
    print(separator)

    return [admission_prediction, decision_accuracy]

In [17]:
def get_admissions_prediction(college_name, student_info):
    """Run all three classifiers for one applicant and one university.

    Args:
        college_name: University name as spelled in the dataset (for example
            the value returned by ``get_corrected_uni_name``).
        student_info: Mapping with the keys 'major', 'gpa', 'verbal_score',
            'quant_score', 'awa_score' and 'toefl_score'.

    Returns:
        ``[knn_prediction, rf_prediction, svm_prediction]``, each being the
        return value of the corresponding ``prediction_using_*`` function.
        The KNN entry is ``None`` when too few rows match.

    Raises:
        ValueError: If ``college_name`` is ``None``.
    """
    if college_name is None:
        # get_corrected_uni_name returns None for an unmatched name. Without
        # this check no row matches and the failure only surfaces later as a
        # ValueError about an empty train/test split.
        raise ValueError('college_name is None: no university to predict for')

    student_details_example = get_student_example(student_info)
    intended_major = student_info['major']

    knn_prediction = prediction_using_knn(college_name, student_info)
    rf_prediction = prediction_using_random_forest(
        college_name, intended_major, student_details_example)
    svm_prediction = prediction_using_SVM(
        csv_filepath, college_name, intended_major, student_details_example)
    return [knn_prediction, rf_prediction, svm_prediction]

In [18]:
# Example applicant; the scores are strings and are converted with float()
# by the helper functions.
student_info = {
    'major': 'Computer Science',
    'gpa': '3.9',
    'verbal_score': '170',
    'quant_score': '170',
    'awa_score': '5.5',
    'toefl_score': '115'
}
college_name = get_corrected_uni_name('new york university')
print('corrected uni name', college_name)


if college_name is None:
    # No sufficiently close university name was found, so there is nothing
    # to predict for.
    results = []
else:
    results = get_admissions_prediction(college_name, student_info)

print('these are the final results')
for result in results:
    if result is None:
        # prediction_using_knn returns None when too few rows match.
        print('prediction unavailable: not enough data')
    else:
        print('prediction', result[0])
        print('accuracy', result[1])
    print('--------------------')

721
corrected uni name New York University (NYU)
--------------------------------------------------
--Cross Validated Accuracy_KNN 0.754
--------------------------------------------------
--------------------------------------------------
Admission_Prediction_KNN:  ['Rejected']
--------------------------------------------------
  clf.fit(X_train, y_train)
--------------------------------------------------
Prediction_with_random_forest ['Rejected']
--------------------------------------------------
Accuracy_random_forest [[0.484 0.516]]
--------------------------------------------------
accepted percentage: 48.4
--------------------------------------------------
Accuracy_with_SVM 0.6084788029925187
--------------------------------------------------
Prediction_with_SVM ['Accepted']
--------------------------------------------------
[[244   0]
 [157   0]]
0.6084788029925187
--------------------------------------------------
these are the final results
prediction ['Rejected']
accuracy 75.4