## Student Intervention System

In [2]:
# Import Libraries
import numpy as np
import pandas as pd
from time import time
from sklearn.metrics import f1_score,precision_score,recall_score

# Load the student data and separate the label column from the features.
student_data = pd.read_csv('dataset/student-data.csv')
target = student_data['passed']
# Use the explicit `columns=` keyword: passing the axis positionally
# (`drop('passed', 1)`) is rejected by pandas 2.0 and later. Rebinding the
# name instead of `inplace=True` keeps the statement easy to chain and reason
# about.
student_data = student_data.drop(columns='passed')

# Data Exploration

In [3]:
# Summarise the dataset: size, number of features and pass/fail balance.
n_students, n_features = student_data.shape
passed_mask = target == 'yes'
n_pass = int(passed_mask.sum())
n_fail = n_students - n_pass
graduation = (n_pass / n_students) * 100

# (template, value) pairs, printed in order.
summary_lines = [
    ("Total Number of students : {}", n_students),
    ("Number of features : {}", n_features),
    ("Number of passed students : {}", n_pass),
    ("Number of failed students : {}", n_fail),
    ("Students percentage who got graduation : {:.2f}%", graduation),
]
for template, value in summary_lines:
    print(template.format(value))


Total Number of students : 395
Number of features : 30
Number of passed students : 265
Number of failed students : 130
Students percentage who got graduation : 67.09%


# Preprocess features

As you can see, several columns hold non-numerical values. Since our models only understand numbers, we need to convert everything to numbers. Columns such as 'higher' and 'internet', whose values are 'yes'/'no', can be converted to 1/0. Other columns such as 'sex' and 'school', which hold categorical data, can be converted to dummy (indicator) columns using the `pd.get_dummies` function.

In [4]:
def preprocess_features(X):
    """Convert every non-numeric column of ``X`` into numeric columns.

    * Text columns have their 'yes'/'no' values mapped to 1/0.
    * Text columns that still hold non-numeric values afterwards are expanded
      into one indicator column per category with ``pd.get_dummies``; the new
      columns are named ``<column>_<value>``.
    * All other columns are passed through unchanged.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw feature table.

    Returns
    -------
    pandas.DataFrame
        Encoded features with the same index as ``X``; columns keep the
        original left-to-right order.
    """
    yes_no = {'yes': 1, 'no': 0}

    def is_textual(series):
        # Covers classic object columns as well as dedicated string dtypes.
        return (pd.api.types.is_object_dtype(series)
                or pd.api.types.is_string_dtype(series))

    encoded_parts = []
    # DataFrame.iteritems() was removed in pandas 2.0; items() is the
    # long-standing equivalent.
    for col, col_data in X.items():
        if is_textual(col_data):
            # Element-wise mapping lets pandas infer an integer dtype for pure
            # yes/no columns without relying on the deprecated implicit
            # downcast performed by Series.replace.
            col_data = col_data.astype(object).map(
                lambda value: yes_no.get(value, value))
        if is_textual(col_data):
            # Still textual after the yes/no mapping -> categorical column.
            col_data = pd.get_dummies(col_data, prefix=col)
        encoded_parts.append(col_data)

    if not encoded_parts:
        # pd.concat rejects an empty list; keep returning an empty frame
        # that carries the input index.
        return pd.DataFrame(index=X.index)
    # A single concat avoids re-copying the growing frame on every column.
    return pd.concat(encoded_parts, axis=1)

In [5]:
# Encode the feature table and turn the 'yes'/'no' labels into 1/0.
X = preprocess_features(student_data)
# An explicit mapping yields an integer Series directly, instead of relying on
# the implicit object -> int downcast of Series.replace (deprecated in recent
# pandas releases).
y = target.map({'yes': 1, 'no': 0})
# Any label other than 'yes'/'no' would have become NaN above; fail early
# rather than passing a corrupted target to the classifiers.
assert y.notna().all(), "target contains labels other than 'yes'/'no'"


# Splitting the Dataset

So far, we have converted the binary and categorical columns to numerical columns. Now we have to split the dataset into a training set and a testing set. Holding out labelled data that the model has never seen lets us measure how well it generalizes.

In [9]:
from sklearn.model_selection import train_test_split
# Initial 70/30 hold-out split. Seeded so that re-running the notebook
# reproduces the same partition, and stratified on the label so both parts
# keep the overall pass/fail ratio (same settings as the split used in the
# benchmarking loop further down, which re-splits the data itself).
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [10]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
# Candidate models to benchmark, as a list of {'name', 'clf'} records.
# Estimators that accept a seed are seeded for reproducibility.
classifiers = [
    {'name': label, 'clf': estimator}
    for label, estimator in (
        ("GaussianNB", GaussianNB()),
        ("Decision Tree", DecisionTreeClassifier(random_state=42)),
        ("Logistic Regression",
         LogisticRegression(random_state=42, solver='lbfgs', max_iter=1000)),
        ("SVM", SVC(random_state=42, gamma='auto')),
    )
]


In [11]:
from sklearn.metrics import accuracy_score
from time import perf_counter, time

# Benchmark every classifier on two training-set sizes and record timing,
# accuracy and F1 for each run as one dict in `model_list`.
TRAIN_SIZES = [200, 300]

# A split depends only on the training size (fixed seed, stratified on the
# label), so compute each one once instead of once per classifier.
splits = {
    train_size: train_test_split(X, y, train_size=train_size,
                                 random_state=42, stratify=y)
    for train_size in TRAIN_SIZES
}

model_list = []
for classifier in classifiers:
    for train_size in TRAIN_SIZES:
        # These four names stay at notebook level on purpose: after the loop
        # they hold the split of the last training size, which the grid-search
        # and evaluation cells below read.
        X_train, X_test, y_train, y_test = splits[train_size]

        # Setting up a model
        clf = classifier['clf']
        model = {}
        model['Name'] = classifier['name']

        # perf_counter() is a high-resolution monotonic clock; time() is too
        # coarse on some platforms and recorded very fast calls as 0.0.
        start = perf_counter()
        clf.fit(X_train, y_train)
        model['Time(training)'] = round(perf_counter() - start, 4)

        # Prediction (only the test-set prediction is timed)
        predictions_train = clf.predict(X_train)
        start = perf_counter()
        predictions_test = clf.predict(X_test)
        model['Time(prediction)'] = round(perf_counter() - start, 4)

        # Accuracy score - comparing the predicted output to the actual output
        model['Score(train)'] = accuracy_score(y_train, predictions_train)
        model['Score(test)'] = accuracy_score(y_test, predictions_test)

        # Sample size
        model['Train_size'] = len(X_train)
        model['Test_size'] = len(X_test)

        # F1 score, with "passed" (1) as the positive class
        model['F1_score(train)'] = f1_score(y_train, predictions_train, pos_label=1)
        model['F1_score(test)'] = f1_score(y_test, predictions_test, pos_label=1)
        model_list.append(model)


In [12]:
# One row per (classifier, training size) run, one column per recorded metric.
df = pd.DataFrame(data=model_list)
df.columns

Index(['F1_score(test)', 'F1_score(train)', 'Name', 'Score(test)',
       'Score(train)', 'Test_size', 'Time(prediction)', 'Time(training)',
       'Train_size'],
      dtype='object')

In [13]:
# Reorder the columns for reading: identification first, then sizes,
# timings, accuracy and finally F1.
cols = [
    'Name',
    'Train_size', 'Test_size',
    'Time(training)', 'Time(prediction)',
    'Score(train)', 'Score(test)',
    'F1_score(train)', 'F1_score(test)',
]
df = df.loc[:, cols]


In [14]:
# Rows 0-1: first classifier in `classifiers`, both training sizes.
df.iloc[:2]

Unnamed: 0,Name,Train_size,Test_size,Time(training),Time(prediction),Score(train),Score(test),F1_score(train),F1_score(test)
0,GaussianNB,200,195,0.0071,0.0039,0.745,0.641026,0.814545,0.740741
1,GaussianNB,300,95,0.004,0.0,0.74,0.684211,0.813397,0.776119


In [15]:
# Rows 2-3: second classifier in `classifiers`, both training sizes.
df.iloc[2:4]

Unnamed: 0,Name,Train_size,Test_size,Time(training),Time(prediction),Score(train),Score(test),F1_score(train),F1_score(test)
2,Decision Tree,200,195,0.005,0.004,1.0,0.646154,1.0,0.743494
3,Decision Tree,300,95,0.004,0.004,1.0,0.610526,1.0,0.683761


In [16]:
# Rows 4-5: third classifier in `classifiers`, both training sizes.
df.iloc[4:6]

Unnamed: 0,Name,Train_size,Test_size,Time(training),Time(prediction),Score(train),Score(test),F1_score(train),F1_score(test)
4,Logistic Regression,200,195,0.1508,0.0018,0.775,0.666667,0.844291,0.768683
5,Logistic Regression,300,95,0.1059,0.004,0.776667,0.642105,0.845266,0.746269


In [17]:
# Rows 6-7: fourth classifier in `classifiers`, both training sizes.
df.iloc[6:8]

Unnamed: 0,Name,Train_size,Test_size,Time(training),Time(prediction),Score(train),Score(test),F1_score(train),F1_score(test)
6,SVM,200,195,0.0091,0.0061,0.785,0.712821,0.861736,0.821656
7,SVM,300,95,0.0131,0.004,0.793333,0.684211,0.866379,0.805195


# Choosing the best model

The Decision Tree performs worst compared to the other models, and we can see that it is overfitted (perfect training scores but the lowest test scores).

Naive Bayes performs better than Logistic Regression and on par with SVM. Its computational time is lower than that of the other models, so our costs
would not increase much when we have more data. However, it does not have hyperparameters that we could tune, so
it leaves us no room to increase the performance further.

Logistic Regression performs better than the Decision Tree but slightly worse than SVM.

SVM performs better than the other models and on par with Naive Bayes. Its computation would cost us more as the amount of data grows.

Between Naive Bayes and SVM, we will go with SVM, since it has hyperparameters that can be tuned to increase the performance of the model.

In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedShuffleSplit

In [64]:
# Make the data lineage explicit: this is the same seeded, stratified
# 300-sample training split that the last benchmark iteration produced, so
# the cell no longer depends on variables leaking out of that loop.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=300, random_state=42, stratify=y)

# Hyperparameter search space. `degree` is only used by the polynomial
# kernel, so it is varied for 'poly' alone; crossing it with 'linear' and
# 'rbf' would refit identical models five times each (60 candidates instead
# of 28).
parameters = [
    {'C': [0.01, 1, 3, 5], 'kernel': ['linear', 'rbf']},
    {'C': [0.01, 1, 3, 5], 'kernel': ['poly'], 'degree': list(range(1, 6))},
]

clf = SVC(random_state=42,gamma = 'auto')

# Make an f1 scoring function using 'make_scorer'; "passed" (1) is the
# positive class.
f1_scorer = make_scorer(f1_score, pos_label=1)

# 5-fold cross-validated search over the grid, optimising F1.
grid_obj = GridSearchCV(clf, parameters, cv=5, scoring=f1_scorer)

# Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(X_train, y_train)

# Best estimator, refit on the whole training split by GridSearchCV.
clf = grid_obj.best_estimator_

clf

SVC(C=3, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=1, gamma='auto', kernel='poly',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

# Evaluating the tuned model

In [65]:
# Score the tuned SVM on the held-out test split.
predictions_test = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, predictions_test)
print("Accuracy score : {}".format(test_accuracy))

Accuracy score : 0.6947368421052632
