# Objective : Student Grant Recommendation

We have historical student performance data and their grant recommendation outcomes in the form of a comma separated value file named student_records.csv. Each data sample consists of the following attributes.

• Name (the student name)
• OverallGrade (overall grade obtained)
• Obedient (whether they were diligent during their course of stay)
• ResearchScore (marks obtained in their research work)
• ProjectScore (marks obtained in the project)
• Recommend (whether they got the grant recommendation)

The main objective is to build a predictive model based on this data such that we can predict for any future student whether they will be recommended for the grant based on their performance attributes.

In [None]:
import pandas as pd
#--turn of warning messages
pd.options.mode.chained_assignment = None  # default='warn'

#--get data
df = pd.read_csv('student_records.csv')
df

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore,Recommend
0,Henry,A,Y,90,85,Yes
1,John,C,N,85,51,Yes
2,David,F,N,10,17,No
3,Holmes,B,Y,75,71,No
4,Marvin,E,N,20,30,No
5,Simon,A,Y,92,79,Yes
6,Robert,B,Y,60,59,No
7,Trent,C,Y,75,33,No


In [None]:
#--get features and corresponding outcomes
feature_names = ['OverallGrade', 'Obedient', 'ResearchScore', 'ProjectScore']
training_features = df[feature_names]

outcome_name = ['Recommend']
outcome_labels = df[outcome_name]

print(training_features)
print("----------------")
print(outcome_labels)

  OverallGrade Obedient  ResearchScore  ProjectScore
0            A        Y             90            85
1            C        N             85            51
2            F        N             10            17
3            B        Y             75            71
4            E        N             20            30
5            A        Y             92            79
6            B        Y             60            59
7            C        Y             75            33
----------------
  Recommend
0       Yes
1       Yes
2        No
3        No
4        No
5       Yes
6        No
7        No


In [None]:
#--list down features based on type
numeric_feature_names = ['ResearchScore', 'ProjectScore']
categoricial_feature_names = ['OverallGrade', 'Obedient']

In [None]:
#--scale or normalize our two numeric score-based attributes
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

# fit scaler on numeric features
ss.fit(training_features[numeric_feature_names])

# scale numeric features now
training_features[numeric_feature_names] = ss.transform(training_features[numeric_feature_names])

# view updated feature-set
print(training_features)

  OverallGrade Obedient  ResearchScore  ProjectScore
0            A        Y       0.899583      1.376650
1            C        N       0.730648     -0.091777
2            F        N      -1.803390     -1.560203
3            B        Y       0.392776      0.772004
4            E        N      -1.465519     -0.998746
5            A        Y       0.967158      1.117516
6            B        Y      -0.114032      0.253735
7            C        Y       0.392776     -0.869179


In [None]:
#--Engineering Categorical Features
training_features = pd.get_dummies(training_features, columns=categoricial_feature_names)

# view newly engineering features
print(training_features)

# We have converted our categoricial data into numeric. 
# or we can say we have done feature engineering over categorical data.

   ResearchScore  ProjectScore  ...  Obedient_N  Obedient_Y
0       0.899583      1.376650  ...           0           1
1       0.730648     -0.091777  ...           1           0
2      -1.803390     -1.560203  ...           1           0
3       0.392776      0.772004  ...           0           1
4      -1.465519     -0.998746  ...           1           0
5       0.967158      1.117516  ...           0           1
6      -0.114032      0.253735  ...           0           1
7       0.392776     -0.869179  ...           0           1

[8 rows x 9 columns]


In [None]:
#--get list of new categorical features
categorical_engineered_features = list(set(training_features.columns) - set(numeric_feature_names))

print(categorical_engineered_features)

['OverallGrade_A', 'OverallGrade_B', 'Obedient_N', 'OverallGrade_F', 'OverallGrade_E', 'OverallGrade_C', 'Obedient_Y']


In [None]:
from sklearn.linear_model import LogisticRegression
import numpy as np
import warnings; warnings.simplefilter('ignore')  

#--fit the model  
lrg = LogisticRegression()
model = lrg.fit(training_features, outcome_labels)
#--view model parameters 
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
#--simple evaluation on training data
pred_labels = model.predict(training_features)
actual_labels = np.array(outcome_labels['Recommend'])

#--evaluate model performance
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

print('Accuracy:', float(accuracy_score(actual_labels, pred_labels))*100, '%')

print('Classification Stats:')
print(classification_report(actual_labels, pred_labels))

Accuracy: 100.0 %
Classification Stats:
              precision    recall  f1-score   support

          No       1.00      1.00      1.00         5
         Yes       1.00      1.00      1.00         3

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8



In [None]:
#--Model Deployment  -- optional in our case
from sklearn.externals import joblib
import os
#--save models to be deployed on server
if not os.path.exists('Model'):
    os.mkdir('Model')
if not os.path.exists('Scaler'):
    os.mkdir('Scaler') 
    
joblib.dump(model, r'Model/model.pickle') 
joblib.dump(ss, r'Scaler/scaler.pickle')

# Check both the folders under  C:\Program Files\Python36

['Scaler/scaler.pickle']

In [None]:
#--Prediction in Action
#--load model and scaler objects
model = joblib.load(r'Model/model.pickle')
scaler = joblib.load(r'Scaler/scaler.pickle')

# We have some sample new student records (for two students) 
# for which we want our model to predict if they will get the 
# grant recommendation. 
# Let’s retrieve and view this data using the following code.

#--data retrieval
new_data = pd.DataFrame([{'Name': 'Ninad', 'OverallGrade': 'F', 'Obedient': 'N', 'ResearchScore': 30, 'ProjectScore': 20},
                  {'Name': 'Thomas', 'OverallGrade': 'A', 'Obedient': 'Y', 'ResearchScore': 78, 'ProjectScore': 80}])

print(new_data)

     Name OverallGrade Obedient  ResearchScore  ProjectScore
0   Ninad            F        N             30            20
1  Thomas            A        Y             78            80


In [None]:
# w.r.t new data
# We will now carry out the tasks relevant to 
# data preparation—feature extraction, engineering, and scaling 
# in the following code snippet.

#--data preparation   
new_training_features = new_data[feature_names]

#--scaling ..
scaler.fit(new_training_features[numeric_feature_names])
new_training_features[numeric_feature_names] = scaler.transform(new_training_features[numeric_feature_names])

#--engineering categorical variables .. 
new_training_features = pd.get_dummies(new_training_features, columns=categoricial_feature_names)


new_training_features

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_F,Obedient_N,Obedient_Y
0,-1.0,-1.0,0,1,1,0
1,1.0,1.0,1,0,0,1


In [None]:
# add missing categorical feature columns
current_categorical_engineered_features = set(new_training_features.columns) - set(numeric_feature_names)

missing_features = set(categorical_engineered_features) - current_categorical_engineered_features

for feature in missing_features:
    # add zeros since feature is absent in these data samples
    new_training_features[feature] = [0] * len(new_training_features)
    

# view final feature set
print(new_training_features)

   ResearchScore  ProjectScore  ...  OverallGrade_E  OverallGrade_B
0           -1.0          -1.0  ...               0               0
1            1.0           1.0  ...               0               0

[2 rows x 9 columns]


In [None]:
# We have our complete feature set ready for both the new students. 
# Let’s put our model to the test and get the predictions 
# with regard to grant recommendations!

pred_labels1 = model.predict(new_training_features)
new_data['Recommend'] = pred_labels1
new_data

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore,Recommend
0,Ninad,F,N,30,20,No
1,Thomas,A,Y,78,80,Yes
