# ***Classification Problem***

The original dataset is from Kaggle. Please download it from the link below.

https://www.kaggle.com/aljarah/xAPI-Edu-Data

# Importing the Libraries

In [None]:
#importing the libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as sc
import numpy as np
#Libraries for importing functions for evaluating the performance
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
# importing the libraries for grid search
from sklearn.model_selection import GridSearchCV

# Importing the Dataset

In [None]:
# Load the student dataset; expects a local copy saved as "edu.csv" in the working directory
dataset = pd.read_csv("edu.csv")

# Exploring the Dataset

In [None]:
# Preview the first five rows of the dataset
dataset.head()

In [None]:
# Print the dataset summary: column names, non-null counts, dtypes and the number of rows
dataset.info()

As you can see, we have 480 rows and 17 columns.
Unfortunately, it's a mixture of both numerical and categorical values :(

In [None]:
# Count the missing values in every column and list the columns, highest count first
null_counts = dataset.isnull().sum().sort_values(ascending=False)
dataset_null_values_info = null_counts.to_frame(name='Count of the Null Values')
dataset_null_values_info.index.name = 'Column Name'
print(dataset_null_values_info)

Great! We don't have any missing values. The dataset is clean.

Target Variable is Class 

Rest all others are driver variables (16 columns)

In [None]:
# Print the distinct values (with their frequencies) of every column that is not int64
categorical_columns = dataset[dataset.select_dtypes(exclude=['int64']).columns]
for column_name, column_values in categorical_columns.items():
    print("******%s******" % column_name)
    print(column_values.value_counts())
    print('\n')


Please check how the categorical columns are divided

In [None]:
# Summary statistics for the numerical features
dataset.describe()

In [None]:
# Number of records in each category of the target column
dataset['Class'].value_counts()

As you can see above, we have 3 categories in the target variable. 

Excited to see the distribution in the form of visualization. Check the code below!

# Data Visualization

In [None]:
# Distribution of the categories in the target variable.
# Labels and bar heights are both taken from the same value_counts() result:
# unique() returns labels in order of first appearance while value_counts()
# sorts by frequency, so pairing the two can attach the wrong label to a bar.
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
class_counts = dataset['Class'].value_counts()
sns.barplot(x=class_counts.index, y=class_counts.values)
plt.xlabel("Class Categories")
plt.ylabel("Number of Records")
plt.title("Distribution of Class");

It seems that the largest share of the records in the dataset belongs to the category Medium - 'M'.

The "Low" and "High" categories have more or less the same number of records.

The dataset is therefore reasonably balanced: no class is rare, and Medium is the largest class with Low and High on either side.

Ok! Now lets explore the other categorical columns in the dataset

In [None]:
# Record counts per Topic, split by Class, on an explicit 10x6 inch figure
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=dataset, x='Topic', hue='Class', ax=ax)
ax.legend()

In [None]:
# Record counts per gender, split by Class, on an explicit 10x6 inch figure
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=dataset, x='gender', hue='Class', ax=ax)
fig.tight_layout()


In [None]:
# Record counts per PlaceofBirth, split by Class, on an explicit 10x6 inch figure
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=dataset, x='PlaceofBirth', hue='Class', ax=ax)
fig.tight_layout()

In [None]:
# Record counts per StageID, split by Class, on an explicit 10x6 inch figure
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=dataset, x='StageID', hue='Class', ax=ax)
fig.tight_layout()

In [None]:
# Record counts per GradeID, split by Class, on an explicit 10x6 inch figure
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=dataset, x='GradeID', hue='Class', ax=ax)
fig.tight_layout()

In [None]:
# Record counts per Relation, split by Class, on an explicit 10x6 inch figure
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=dataset, x='Relation', hue='Class', ax=ax)
fig.tight_layout()

Find the visualization of the correlation matrix for all the numerical variables (Only 4 columns) below to get some insight

In [None]:
# Correlation matrix for the numerical columns
# (RaisedHands, VisitedResources, AnnouncementsView and Discussion).
# The numeric columns are selected explicitly: pandas >= 2.0 raises when
# DataFrame.corr() is called on a frame that still contains string columns.
corrmat = dataset.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(9, 8))
sns.heatmap(corrmat, ax=ax, linewidths=0.1);

It seems there is a high correlation between the visitedResources and RaisedHands

Ok! Enough of Visualization, lets work on the data now!

# Data Pre-Processing

Updating the target variable from categorical values to a numerical representation

low - 0, medium - 1, high - 2

1) low = "L"

2) medium = "M"

3) high = "H"

In [None]:
low = "L"
medium = "M"
high = "H"
# Encode the target as ordered integers: low -> 0, medium -> 1, high -> 2.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: with pandas copy-on-write
# a chained in-place call modifies a temporary and leaves `dataset` unchanged.
class_codes = {low: 0, medium: 1, high: 2}
# Values missing from the mapping (e.g. already-encoded integers when the cell
# is re-run) pass through unchanged, so the cell stays safe to execute twice.
dataset['Class'] = dataset['Class'].map(lambda label: class_codes.get(label, label))
dataset['Class'].value_counts()

Good, the target variable column is now converted into numerical values

X contains the driver variables

Y contains the target variable

In [None]:
# Features: every column except the last one; target: the last column
X = dataset.iloc[:, :-1].to_numpy()
Y = dataset.iloc[:, -1].to_numpy()

Check X and Y values Below

In [None]:
# Inspect the feature matrix before any encoding
X

In [None]:
# Inspect the target vector
Y

Now convert all the categorical values in the driver variables into numerical values

In [None]:
# Encoding the categorical data
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

# Column positions that already hold numbers and must be left untouched
already_numeric = {9, 10, 11, 12}
categorical_positions = [
    idx for idx in range(len(dataset.columns) - 1) if idx not in already_numeric
]
# Replace every categorical column of X, in place, by its integer label codes
for idx in categorical_positions:
    X[:, idx] = LabelEncoder().fit_transform(X[:, idx])

Check the values in X now!

In [None]:
# Inspect the feature matrix after label encoding
X

X now only contains the numerical values

Find the Correlation between the variables. Check the visualization below for more insights!

In [None]:
# Correlation matrix across all (label-encoded) feature columns of X
feature_names = list(dataset.columns[:-1])
df = pd.DataFrame(X.astype(float), columns=feature_names)
corrmat = df.corr()
f, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corrmat, cmap="YlGnBu", linewidths=0.1, ax=ax)

PlaceofBirth and Nationality are highly correlated!

There is also a correlation between the columns RaisedHands, Visited Resources and AnnouncementsView.

Now find the correlation between each variable in the driver variables and the target variable using Pearsons Correlation

In [None]:
# Pearson correlation between every driver variable and the target variable.
# The rows are collected in a list and turned into a DataFrame once:
# DataFrame.append (used previously) was removed in pandas 2.0.
driver_names = dataset.columns[:X.shape[1]]
correlation_rows = []
for i, name in enumerate(driver_names):
    # X was taken from a mixed-type frame, so its columns are cast to float for SciPy
    corr, _ = sc.pearsonr(X[:, i].astype(float), Y)
    # Store the magnitude only, so that sorting ranks strong negative correlations too
    correlation_rows.append({"Names": name, "CorrValue": abs(corr)})
    print('Pearsons correlation: ', name, ' %.3f' % corr)

temp = pd.DataFrame(correlation_rows, columns=['Names', 'CorrValue'])
print(temp.sort_values('CorrValue', ascending=False))

In [None]:
# Bar chart of the absolute Pearson correlation of each driver variable with the target.
# The unused X_grid array and the unused handle returned by plt.bar were dropped.
plt.rcParams['figure.figsize'] = (10,5)
plt.bar(temp['Names'], temp['CorrValue'], color = 'green')
# Rotate the column names so they stay readable
plt.xticks(rotation='vertical')
plt.title('Correlation between the driver variables and the target variable')
plt.xlabel('Column names')
plt.ylabel('Pearsons Correlation');

RaisedHands, VisitedResources and StudentAbsenceDays are the most important columns to predict the outcome "the score" of the student.

Further, apply a one-hot encoder to the categorical columns that have more than two categories

In [None]:
import warnings
# Silence every warning from here on; note that this also hides deprecation
# notices raised by the cells below
warnings.simplefilter('ignore')

In [None]:
# One-hot encode the multi-category feature columns of X (positions 1-6).
# OneHotEncoder(categorical_features=...) was removed in scikit-learn 0.22;
# ColumnTransformer is the supported way to encode only a subset of columns.
from sklearn.compose import ColumnTransformer

onehotencoder_x = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(categories='auto'), [1, 2, 3, 4, 5, 6])],
    remainder='passthrough',  # keep the other columns, appended after the encoded ones
    sparse_threshold=0,       # always return a dense array (replaces the former .toarray())
)
# Cast to float first so the passthrough columns do not keep X's object dtype
X = onehotencoder_x.fit_transform(X.astype(float))

In [None]:
# Inspect the feature matrix after one-hot encoding
X

Apply Feature Scaling

In [None]:
# Feature scaling - only X (the target is left as is)
from sklearn.preprocessing import StandardScaler
# NOTE(review): the scaler is fitted on all of X here and the train/test split
# only happens in a later cell, so the test rows contribute to the scaling
# statistics - consider fitting the scaler on the training split only.
standardscalar_x = StandardScaler()
X = standardscalar_x.fit_transform(X)

In [None]:
# Inspect the feature matrix after scaling
X

# Split the dataset

In [None]:
# Split into training and test sets: 20% of the rows are held out for testing,
# and random_state = 0 fixes the shuffle so the split is repeatable
from sklearn.model_selection import train_test_split
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size = 0.2, random_state = 0)

# Machine Learning Classification Algorithms

# 1. Logistic Regression 

In [None]:
# Fit a Logistic Regression model (default hyper-parameters, fixed seed) on the training split
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(train_X,train_Y)

Predict the Y value using the classifier

In [None]:
# Predict the class of every test sample with the fitted Logistic Regression model
pred_Y = classifier.predict(test_X)

Evaluate the model

In [None]:
# Evaluation: confusion matrix of the held-out labels (test_Y) against the
# Logistic Regression predictions (pred_Y)
cm_logistic_regression = confusion_matrix(test_Y, pred_Y)
print(cm_logistic_regression)

Check the confusion matrix above. The diagonal values are the numbers of correctly classified samples per class: 22, 31 and 12. So 65 of the 96 test samples are predicted right and 31 are predicted wrong.

In [None]:
# Per-class precision, recall, F1-score and support for the Logistic Regression predictions
print(classification_report(test_Y, pred_Y))

In [None]:
# Accuracy of the Logistic Regression predictions on the test split
logistic_regression_accuracy = accuracy_score(test_Y, pred_Y)
print('Accuracy score for Logistic Regression: ', logistic_regression_accuracy)

Now we can try Grid Search to increase the performance of the model

In [None]:
# Grid search over the penalty type and the inverse regularization strength C.
# The solver is set explicitly: the default 'lbfgs' solver does not support the
# 'l1' penalty, so half of the grid would fail to fit. 'saga' supports both
# penalties; max_iter is raised so it can converge, and the seed is fixed
# because saga shuffles the data.
parameters = {"penalty": ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
grid_search = GridSearchCV(LogisticRegression(solver='saga', max_iter=5000, random_state=0), parameters)
grid_search.fit(train_X,train_Y)
# Predictions come from the best parameter combination, refitted on the full training split
grid_predictions = grid_search.predict(test_X)

In [None]:
# Confusion matrix of the held-out labels against the grid-search (tuned Logistic Regression) predictions
print(confusion_matrix(test_Y,grid_predictions))

In [None]:
# Per-class precision, recall, F1-score and support for the tuned Logistic Regression
print(classification_report(test_Y,grid_predictions))

In [None]:
# The label promises the best parameters, so print best_params_ (the winning grid
# combination) rather than the repr of the fitted estimator object
print("Best Parameters for Logistic Regression: ", grid_search.best_params_)

In [None]:
# Best score found by the grid search (computed on the training split by cross-validation, not on the test split)
print("Best Score for Logistic Regression: ", grid_search.best_score_)

# 2. K-NN

In [None]:
# Fit K-NN on the training split: 5 neighbours, Minkowski metric with p = 2 (i.e. Euclidean distance)
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(train_X,train_Y)

In [None]:
# Predict the class of every test sample with the fitted K-NN model
pred_Y = classifier.predict(test_X)

In [None]:
# Evaluation: confusion matrix of the held-out labels (test_Y) against the K-NN predictions (pred_Y)
cm_knn = confusion_matrix(test_Y, pred_Y)
print(cm_knn)

In [None]:
# Per-class precision, recall, F1-score and support for the K-NN predictions
print(classification_report(test_Y, pred_Y))

In [None]:
# Accuracy of the K-NN predictions on the test split
knn_accuracy = accuracy_score(test_Y, pred_Y)
print('Accuracy score for K-NN: ', knn_accuracy)

# 3. SVM

In [None]:
# Fit a support vector classifier with the RBF kernel (fixed seed) on the training split
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(train_X,train_Y)

In [None]:
# Predict the class of every test sample with the fitted SVM
pred_Y = classifier.predict(test_X)

In [None]:
# Evaluation: confusion matrix of the held-out labels (test_Y) against the SVM predictions (pred_Y)
cm_svm = confusion_matrix(test_Y, pred_Y)
print(cm_svm)

In [None]:
# Per-class precision, recall, F1-score and support for the SVM predictions
print(classification_report(test_Y, pred_Y))

In [None]:
# Accuracy of the SVM predictions on the test split
svm_accuracy = accuracy_score(test_Y, pred_Y)
print('Accuracy score for SVM: ', svm_accuracy)

Now we can try Grid Search to increase the performance of the model

In [None]:
# Grid search to increase the performance:
# compare the cross-validated score of the four SVM kernels
parameters = {'kernel':('linear', 'rbf', 'poly','sigmoid')}
clf = GridSearchCV(SVC(), parameters)
clf.fit(train_X,train_Y)
# Show only the per-kernel score and rank as a table (best kernel first)
# instead of dumping the whole cv_results_ dictionary
kernel_ranking = pd.DataFrame(clf.cv_results_)[
    ['param_kernel', 'mean_test_score', 'std_test_score', 'rank_test_score']
].sort_values('rank_test_score')
kernel_ranking

Find the ranking of the kernels above. It seems that sigmoid ranks first. 

Let's try the RBF kernel with grid search

In [None]:
# Grid search over C and gamma for the RBF kernel
# (the kernel is named explicitly; 'rbf' is also what SVC() uses by default)
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
}
grid_search = GridSearchCV(estimator=SVC(kernel='rbf'), param_grid=param_grid, refit=True, verbose=2)
grid_search.fit(train_X, train_Y)
grid_predictions = grid_search.predict(test_X)

In [None]:
# Confusion matrix of the held-out labels against the grid-search (tuned SVM) predictions
print(confusion_matrix(test_Y,grid_predictions))

In [None]:
# Per-class precision, recall, F1-score and support for the tuned SVM
print(classification_report(test_Y,grid_predictions))

In [None]:
# The label promises the best parameters, so print best_params_ (the winning grid
# combination) rather than the repr of the fitted estimator object
print("Best Parameters for SVM: ", grid_search.best_params_)

In [None]:
# Best score found by the grid search (computed on the training split by cross-validation, not on the test split)
print("Best Score for SVM: ", grid_search.best_score_)

Wow! There was a lot of improvement!

# 4. Naive Bayes

In [None]:
# Fit a Gaussian Naive Bayes classifier (default settings) on the training split
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(train_X,train_Y)

In [None]:
# Predict the class of every test sample with the fitted Naive Bayes model
pred_Y = classifier.predict(test_X)

In [None]:
# Evaluation: confusion matrix of the held-out labels (test_Y) against the Naive Bayes predictions (pred_Y)
cm_naive_bayes = confusion_matrix(test_Y, pred_Y)
print(cm_naive_bayes)

In [None]:
# Per-class precision, recall, F1-score and support for the Naive Bayes predictions
print(classification_report(test_Y, pred_Y))

In [None]:
# Accuracy of the Naive Bayes predictions on the test split
naive_bayes_accuracy = accuracy_score(test_Y, pred_Y)
print('Accuracy score for Naive-Bayes: ', naive_bayes_accuracy)

# 5. Decision Tree Classification

In [None]:
# Fit a decision tree (entropy split criterion, fixed seed) on the training split
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(train_X,train_Y)

In [None]:
# Predict the class of every test sample with the fitted decision tree
pred_Y = classifier.predict(test_X)

In [None]:
# Evaluation: confusion matrix of the held-out labels (test_Y) against the decision tree predictions (pred_Y)
cm_decision_tree = confusion_matrix(test_Y, pred_Y)
print(cm_decision_tree)

In [None]:
# Per-class precision, recall, F1-score and support for the decision tree predictions
print(classification_report(test_Y, pred_Y))

In [None]:
# Accuracy of the decision tree predictions on the test split
decision_tree_accuracy = accuracy_score(test_Y, pred_Y)
print('Accuracy score for Decision Tree: ', decision_tree_accuracy)

Now we can try Grid Search to increase the performance of the model

In [None]:
# Grid search to increase the performance: split criterion, tree depth (2-29)
# and minimum samples per leaf (5-19).
# The estimator is seeded (as in the untuned decision tree cell) so that the
# selected parameters and the reported scores are reproducible between runs.
parameters = {"criterion": ["gini", "entropy"], "max_depth": list(range(2,30,1)), 
              "min_samples_leaf": list(range(5,20,1))}
grid_search = GridSearchCV(DecisionTreeClassifier(random_state = 0), parameters)
grid_search.fit(train_X,train_Y)
# Predictions come from the best parameter combination, refitted on the full training split
grid_predictions = grid_search.predict(test_X)

In [None]:
# Confusion matrix of the held-out labels against the grid-search (tuned decision tree) predictions
print(confusion_matrix(test_Y,grid_predictions))

In [None]:
# Per-class precision, recall, F1-score and support for the tuned decision tree
print(classification_report(test_Y,grid_predictions))

In [None]:
# The label promises the best parameters, so print best_params_ (the winning grid
# combination) rather than the repr of the fitted estimator object
print("Best Parameters for Decision Tree: ", grid_search.best_params_)

In [None]:
# Best score found by the grid search (computed on the training split by cross-validation, not on the test split)
print("Best Score for Decision Tree: ", grid_search.best_score_)

# 6. Random Forest Classification

In [None]:
# Fit a random forest (10 trees, entropy split criterion, fixed seed) on the training split
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(train_X,train_Y)

In [None]:
# Predict the class of every test sample with the fitted random forest
pred_Y = classifier.predict(test_X)

In [None]:
# Evaluation: confusion matrix of the held-out labels (test_Y) against the random forest predictions (pred_Y)
cm_random_forest = confusion_matrix(test_Y, pred_Y)
print(cm_random_forest)

In [None]:
# Per-class precision, recall, F1-score and support for the random forest predictions
print(classification_report(test_Y, pred_Y))

In [None]:
# Accuracy of the random forest predictions on the test split
random_forest_accuracy = accuracy_score(test_Y, pred_Y)
print('Accuracy score for Random Forest: ', random_forest_accuracy)

Now we can try Grid Search to increase the performance of the model

In [None]:
# Grid search to increase the performance: bootstrap on/off, tree depth (2-9)
# and minimum samples per leaf (5-19).
# The estimator is seeded (as in the untuned random forest cell) so that the
# selected parameters and the reported scores are reproducible between runs.
parameters = {"bootstrap":[True, False], "max_depth": list(range(2,10,1)), 
                  "min_samples_leaf": list(range(5,20,1))}
grid_search = GridSearchCV(RandomForestClassifier(random_state = 0), parameters)
grid_search.fit(train_X,train_Y)
# Predictions come from the best parameter combination, refitted on the full training split
grid_predictions = grid_search.predict(test_X)

In [None]:
# Confusion matrix of the held-out labels against the grid-search (tuned random forest) predictions
print(confusion_matrix(test_Y,grid_predictions))

In [None]:
# Per-class precision, recall, F1-score and support for the tuned random forest
print(classification_report(test_Y,grid_predictions))

In [None]:
# The label promises the best parameters, so print best_params_ (the winning grid
# combination) rather than the repr of the fitted estimator object
print("Best Parameters for Random Forest: ", grid_search.best_params_)

In [None]:
# Best score found by the grid search (computed on the training split by cross-validation, not on the test split)
print("Best Score for Random Forest: ", grid_search.best_score_)

# TPOT Implementation

TPOT is implemented to find the best model

In [None]:
# Implementation of TPOT to find the best model
from tpot import TPOTClassifier

# Search for a pipeline on the training split with a 2-minute time budget,
# then print the score of the result on the test split.
# NOTE(review): no seed is passed, so the selected pipeline may differ between runs.
tpot = TPOTClassifier(verbosity=2, max_time_mins=2)
tpot.fit(train_X, train_Y)
print(tpot.score(test_X, test_Y))

The TPOT score is 80% for Random Forest. 

# Machine Learning Models Conclusion

The results of all the models and the TPOT implementation prove that Random Forest has been the best model!

Please look into the Graphs below for the results

In [None]:
from IPython.display import Image
# Display the stored chart AccuracyScore.png; no cell shown here writes this file,
# so it presumably has to sit next to the notebook - TODO confirm
Image("AccuracyScore.png")

Random Forest Classification Wins the race!

Please find the graph below which predicted the scores after the Grid Search. The best score for the models.

In [None]:
# Display the stored chart BestScore.png (a static image file; not produced by a cell shown here)
Image("BestScore.png")

In [None]:
# Display the stored chart AccBestScore.png (a static image file; not produced by a cell shown here)
Image("AccBestScore.png")

Hence, the results prove that the best model is Random Forest Classification

Let's get into Deep Learning for further improvement

# Neural Networks

Now it's time for Neural Networks! Get ready!

Necessary library files for Artificial Neural Networks

In [None]:
# Artificial Neural Network

# Installing Theano
# pip install --upgrade --no-deps git+git://github.com/Theano/Theano.git

# Installing Tensorflow
# Install Tensorflow from the website: https://www.tensorflow.org/versions/r0.12/get_started/os_setup.html

# Installing Keras
# pip install --upgrade keras

In [None]:
# Importing the Keras libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

In [None]:
# One-hot encode the integer class labels: one indicator column per class,
# matching the 3-unit output layer of the network built below
Y = pd.get_dummies(Y)

In [None]:
# Inspect the one-hot encoded target
Y

In [None]:
# Redo the train/test split with the one-hot encoded Y; same test_size and
# random_state as the earlier split
from sklearn.model_selection import train_test_split
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [None]:
# Initialising the ANN as an empty Sequential model; the layers are added in the cells below
classifier = Sequential()

In [None]:
# Input layer + first hidden layer (256 ReLU units) followed by 50% dropout.
# `units` and `kernel_initializer` are the Keras 2+ names of the Keras 1
# arguments `output_dim` and `init`, which current Keras releases reject.
# The input width is read from the training data instead of hard-coding 66,
# so the cell keeps working if the encoded feature count changes.
classifier.add(Dense(units = 256, kernel_initializer = 'random_uniform', activation = 'relu', input_dim = train_X.shape[1]))
classifier.add(Dropout(0.5))

In [None]:
# Adding four further hidden layers, each 256 ReLU units followed by 50% dropout.
# Written as a loop instead of four copy-pasted pairs; `units` and
# `kernel_initializer` are the Keras 2+ names of the Keras 1 arguments
# `output_dim` and `init`.
for _ in range(4):
    classifier.add(Dense(units = 256, kernel_initializer = 'random_uniform', activation = 'relu'))
    classifier.add(Dropout(0.5))

In [None]:
# Adding the output layer: one unit per class.
# Each student belongs to exactly one of the three classes, so softmax is used:
# it yields a probability distribution over the classes, whereas the previous
# hard_sigmoid scored every output independently.
# `units` and `kernel_initializer` are the Keras 2+ names of `output_dim` and `init`.
classifier.add(Dense(units = 3, kernel_initializer = 'random_uniform', activation = 'softmax'))

In [None]:
# Compiling the ANN.
# The targets are one-hot vectors over three mutually exclusive classes, so the
# loss is categorical cross-entropy. With binary cross-entropy, Keras 2 resolves
# the 'accuracy' metric to a per-output binary accuracy, which overstates the
# score compared with the class-level accuracy reported for the other models.
classifier.compile(optimizer = 'Adamax', loss = 'categorical_crossentropy', metrics = ['accuracy'])

In [None]:
# Fitting the ANN to the Training set.
# `epochs` is the Keras 2+ name of the Keras 1 argument `nb_epoch`.
# The one-hot targets are passed as a float32 array: pandas >= 2.0 returns
# boolean columns from get_dummies, and a plain numeric array is accepted by
# every Keras version.
classifier.fit(train_X, np.asarray(train_Y, dtype = 'float32'), batch_size = 10, epochs = 100)

In [None]:
# Predicting the Test set results: one row of per-class network outputs for each test sample
y_pred = classifier.predict(test_X)

In [None]:
# Loss and metric values on the test set, as computed by Keras
results = classifier.evaluate(test_X, np.asarray(test_Y, dtype = 'float32'))
# Making the Confusion Matrix from the most probable class of every sample
from sklearn.metrics import confusion_matrix
true_classes = np.asarray(test_Y).argmax(axis=1)
predicted_classes = y_pred.argmax(axis=1)
cm = confusion_matrix(true_classes, predicted_classes)
# The matrix was previously computed but never shown. Print it together with
# the class-level accuracy, which is measured the same way as for the other models.
print(cm)
print('Accuracy score for Neural Network: ', accuracy_score(true_classes, predicted_classes))
results

The results for the Neural Network:

The accuracy score is in the range between 0.84 and 0.90

Check the results above!

# Conclusion

When we compare all the Machine learning models with the neural network model, we got the best score for neural networks!

In [None]:
# Display the stored chart FinalScore.png (a static image file; not produced by a cell shown here)
Image("FinalScore.png")

The Student Dataset works best with Neural Networks.

# Neural Network Wins!