In [34]:
# data processing
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, Normalizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("fivethirtyeight")

In [2]:
data = pd.read_csv('remove_outliers.csv')
data

Unnamed: 0.1,Unnamed: 0,Course,Pr1,Pr2,Pr3,Pr4,Pr5,Oapr,Course_Label
0,5,BS AGRI,52.90,77.12,29.96,75.28,53.29,61.37,12
1,6,BS AGRI,68.50,54.75,36.55,70.61,43.49,62.48,12
2,7,BS AGRI,74.80,66.85,89.19,93.38,72.90,84.19,12
3,8,BS AGRI,58.56,66.85,72.66,79.96,33.69,66.95,12
4,9,BS AGRI,58.56,87.40,80.93,34.54,9.69,62.48,12
...,...,...,...,...,...,...,...,...,...
1865,2400,BSNE,76.83,35.70,93.41,79.96,79.78,80.63,38
1866,2402,BSNE,84.91,71.99,72.66,24.91,63.10,76.98,38
1867,2403,BSNE,83.47,66.85,85.06,70.61,83.75,84.19,38
1868,2404,BSNE,52.90,54.75,89.19,75.28,87.72,73.34,38


In [3]:
data.drop(['Unnamed: 0'],axis=1,inplace=True)

In [4]:
data.dtypes

Course           object
Pr1             float64
Pr2             float64
Pr3             float64
Pr4             float64
Pr5             float64
Oapr            float64
Course_Label      int64
dtype: object

In [5]:
data.head()

Unnamed: 0,Course,Pr1,Pr2,Pr3,Pr4,Pr5,Oapr,Course_Label
0,BS AGRI,52.9,77.12,29.96,75.28,53.29,61.37,12
1,BS AGRI,68.5,54.75,36.55,70.61,43.49,62.48,12
2,BS AGRI,74.8,66.85,89.19,93.38,72.9,84.19,12
3,BS AGRI,58.56,66.85,72.66,79.96,33.69,66.95,12
4,BS AGRI,58.56,87.4,80.93,34.54,9.69,62.48,12


# Exploratory Data Analysis

### FEATURE SELECTION

In [6]:
# X TARGET
FEATURES = [
     'Pr1',
     'Pr2',
     'Pr3',
     'Pr4',
     'Pr5',
     'Oapr'
]

In [7]:
# Y TARGET
TARGET = 'Course'

In [8]:
X = data[FEATURES]
Y = data[TARGET]

In [9]:
Y.value_counts()

BSN           271
BS BIO        121
BEED          112
AB POLSCI      94
BS CE          91
BSIT           77
BSCS           72
BS ME          70
BPE            64
BSED           62
AB ENG         62
BSSW           60
BAHISTORY      54
BS EE          43
AB FIL         39
BS CD          38
BSNE           35
BS ACCTNG      35
BS ND          33
BSHM           33
ABMC-BROAD     32
BS HE          30
BS COE         29
BS CHEM        29
BS SE          28
BSIE           28
BS MATH        27
AB JOURN       25
BS ECE         23
BS ENE         23
BS GE          22
BS STAT        18
BS AGRI        18
ABMC-JOURN     13
BSED SS        13
BSED VEDUC     12
BS ABE         12
BS PHYSICS     11
BS F            7
BPED            4
Name: Course, dtype: int64

In [10]:
Y=Y.astype("string")

In [11]:
normalized_X = Normalizer().fit_transform(X)
print(normalized_X.shape)

(1870, 6)


In [12]:
normalized_X

array([[0.35732898, 0.52093026, 0.20237384, 0.50850142, 0.35996335,
        0.41454214],
       [0.48672697, 0.3890263 , 0.25970614, 0.50171958, 0.30901834,
        0.44395184],
       [0.37808164, 0.33789783, 0.45081687, 0.4719955 , 0.36847796,
        0.42554403],
       ...,
       [0.42955548, 0.34402521, 0.43773798, 0.36337501, 0.43099642,
        0.43326076],
       [0.29347496, 0.30373826, 0.49480211, 0.41763318, 0.48664695,
        0.40687058],
       [0.50746705, 0.45026685, 0.36569813, 0.4120604 , 0.14264471,
        0.46460795]])

In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

(1496, 6) (374, 6) (1496,) (374,)


# CLASSIFIER ALGORITHM

### RANDOM FOREST

In [14]:
from sklearn.ensemble import RandomForestClassifier

In [15]:
rfc = RandomForestClassifier(random_state = 0, n_estimators = 200)
rfc.fit(X_train, Y_train)

In [16]:
pred_rfc = rfc.predict(X_test)

In [17]:
#K FOLD FOR IT
cv = KFold(n_splits=2, random_state=0, shuffle=True)
# evaluate model
rfc_auc = cross_val_score(rfc, X_train, Y_train, scoring='roc_auc_ovo', cv=cv, n_jobs=-1)
rfc_accuracy = cross_val_score(rfc, X_train, Y_train, scoring='accuracy', cv=cv, n_jobs=-1)

In [18]:
from sklearn.metrics import accuracy_score, roc_auc_score

In [19]:
pd.DataFrame([[accuracy_score(Y_test, pred_rfc), '-'],[np.mean(rfc_accuracy), np.mean(rfc_auc)]], columns = ['Accuracy', 'AUC'], index = ['Prediction', 'K-fold'])

Unnamed: 0,Accuracy,AUC
Prediction,0.149733,-
K-fold,0.122326,0.600951


### NAIVE BAYES

In [20]:
from sklearn.naive_bayes import GaussianNB

In [21]:
naive = GaussianNB()
naive.fit(X_train, Y_train)

In [22]:
pred_naive = naive.predict(X_test)

In [23]:
#KFOLD 
cv = KFold(n_splits=2, random_state=0, shuffle=True)
#evaluate model
naive_auc = cross_val_score(naive, X_train, Y_train, scoring='roc_auc_ovo', cv=cv, n_jobs=-1)
naive_accuracy = cross_val_score(naive, X_train, Y_train, scoring='accuracy', cv=cv, n_jobs=-1)

In [24]:
#WITH FIRST AND SECOND TRY GRADES
pd.DataFrame([[accuracy_score(Y_test, pred_naive), '-'],[np.mean(naive_accuracy), np.mean(naive_auc)]], columns = ['Accuracy', 'AUC'], index = ['Prediction', 'K-fold'])

Unnamed: 0,Accuracy,AUC
Prediction,0.114973,-
K-fold,0.117647,0.645419


# METRICS

In [25]:
from sklearn.metrics import confusion_matrix, classification_report

In [26]:
model_pred = {'Random Forest Classifier' : pred_rfc, 'Naive Bayes' : pred_naive}

In [27]:
for v,k in model_pred.items():
    print(f'{v} :\n{confusion_matrix(Y_test, model_pred[v])}\n\n\n')

Random Forest Classifier :
[[ 1  0  0 ...  1  1  0]
 [ 0  0  1 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 ...
 [ 3  0  0 ... 22  0  0]
 [ 0  0  0 ...  1  0  0]
 [ 0  0  1 ...  3  1  1]]



Naive Bayes :
[[ 0  3  0 ...  1  0  1]
 [ 1  0  0 ...  1  0  0]
 [ 0  0  0 ...  1  0  0]
 ...
 [ 1  0  0 ... 18  0  0]
 [ 1  0  0 ...  1  0  0]
 [ 0  0  0 ...  1  0  0]]





### Classification Report

In [28]:
for v,k in model_pred.items():
    print(f'{v} :\n{classification_report(Y_test, model_pred[v])}\n\n\n')

Random Forest Classifier :
              precision    recall  f1-score   support

      AB ENG       0.08      0.09      0.09        11
      AB FIL       0.00      0.00      0.00         5
    AB JOURN       0.00      0.00      0.00         4
   AB POLSCI       0.11      0.09      0.10        22
  ABMC-BROAD       0.00      0.00      0.00         8
  ABMC-JOURN       0.00      0.00      0.00         3
   BAHISTORY       0.33      0.31      0.32        13
        BEED       0.20      0.33      0.25        24
         BPE       0.12      0.13      0.13        15
      BS ABE       0.00      0.00      0.00         1
   BS ACCTNG       0.00      0.00      0.00         6
     BS AGRI       0.50      0.14      0.22         7
      BS BIO       0.00      0.00      0.00        23
       BS CD       0.00      0.00      0.00         7
       BS CE       0.33      0.22      0.27        18
     BS CHEM       0.00      0.00      0.00         3
      BS COE       0.00      0.00      0.00         8


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### TRAINED MODEL

In [38]:
import pickle
from sklearn.metrics import confusion_matrix, classification_report

In [39]:
#RANDOM FOREST
pickle.dump(rfc, open('TrainedModel_Forest.pkl', 'wb'))

In [40]:
#NAIVE BAYES
pickle.dump(naive, open('TrainedModel_Naive.pkl', 'wb'))

### BATCH PREDICTION

In [41]:
test_model_forest = pickle.load(open('TrainedModel_Forest.pkl', 'rb'))

In [42]:
test_model_naive = pickle.load(open('TrainedModel_Naive.pkl', 'rb'))

In [62]:
# RANDOM FOREST RANGING FROM 60% TO 100% ACCURACY
new_Xdata = X.sample(5)
new_Ydata = Y[new_Xdata.index.values]
pred = test_model_forest.predict(new_Xdata)

print(new_Xdata,'\n\n', new_Ydata)
print('\n\n', pred)

res_cm = confusion_matrix(new_Ydata, pred)
print('\n\nConfusion Matrix:\n\n', res_cm)
res_acc = accuracy_score(new_Ydata, pred)
print('\n\nAccuracy Score:', res_acc)

        Pr1    Pr2    Pr3    Pr4    Pr5   Oapr
119   70.75  77.12  36.55  79.96  63.10  72.43
456   49.95  71.99  72.66  79.96  33.69  64.72
1153  96.08  99.70  97.72  84.63  91.68  97.46
1715  80.58  35.70  36.55  70.61  53.29  66.95
1737  87.60  96.76  85.06  92.22  63.10  91.56 

 119          BSIT
456     AB POLSCI
1153        BS ME
1715         BEED
1737         BEED
Name: Course, dtype: string


 ['BSIT' 'AB POLSCI' 'BS ME' 'BEED' 'BEED']


Confusion Matrix:

 [[1 0 0 0]
 [0 2 0 0]
 [0 0 1 0]
 [0 0 0 1]]


Accuracy Score: 1.0


In [65]:
# RANDOM FOREST RANGING
naive_Xdata = X.sample(5)
naive_Ydata = Y[naive_Xdata.index.values]
naive_pred = test_model_naive.predict(naive_Xdata)

print(naive_Xdata,'\n\n', naive_Ydata)
print('\n\n', pred)

naive_res_cm = confusion_matrix(naive_Ydata, naive_pred)
print('\n\nConfusion Matrix:\n\n', res_cm)
naive_res_acc = accuracy_score(naive_Ydata, naive_pred)
print('\n\nAccuracy Score:', naive_res_acc)

        Pr1    Pr2    Pr3    Pr4    Pr5   Oapr
614   89.71  77.12  85.06  84.63  79.78  89.65
1291  47.00  35.70  72.66  79.96  63.10  60.09
475   80.58  90.94  94.84  84.63  83.75  90.13
1044  68.50  90.94  29.96  70.61  79.78  73.34
360   68.50  61.11  76.79  63.45  33.69  68.07 

 614     BS ACCTNG
1291          BPE
475     AB POLSCI
1044       BS ECE
360        AB FIL
Name: Course, dtype: string


 ['BSIT' 'AB POLSCI' 'BS ME' 'BEED' 'BEED']


Confusion Matrix:

 [[1 0 0 0]
 [0 2 0 0]
 [0 0 1 0]
 [0 0 0 1]]


Accuracy Score: 0.0
