In [1]:
#libraries required: numpy, scikit-learn
#the install below is run through sys.executable, i.e. the interpreter of the running kernel

import sys
!{sys.executable} -m pip install numpy scikit-learn

In [2]:
#importing modules
import numpy as np                                              #for loading dataset and handling arrays

from sklearn.ensemble import RandomForestClassifier as rfc      #for using random forest classifier model to predict
from sklearn import tree                                        #for using decision tree model to predict
from sklearn.linear_model import LogisticRegression as lr       #for using logistic regression model to predict
from sklearn import svm                                         #for using support vector machines model to predict
from sklearn.neighbors import KNeighborsClassifier              #for using k-nearest neighbour model to predict
from sklearn.model_selection import train_test_split            #for dividing dataset into training and testing 
from sklearn.metrics import accuracy_score                      #for calculating the accuracy score of models
from sklearn.metrics import precision_score                     #for calculating the precision score of models
from sklearn.metrics import recall_score                        #for calculating the recall score of models
from sklearn.metrics import f1_score                            #for calculating the f1 score of models

import time                                                     #for calculating time of training and testing of model

In [3]:
def LogisticRegression(dataset_path = "dataset.csv", test_size = 0.2, random_state = None, max_iter = 1000):
    """Train a logistic regression classifier on a CSV dataset and print its test metrics.

    The file is read as a comma-delimited numeric table. Every column except
    the last is used as a feature and the last column is used as the label.
    The rows are split into a training and a testing part, the model is fitted
    on the training part, and accuracy, F1, precision, recall and the elapsed
    time are printed for the testing part.

    Parameters:
        dataset_path: path of the CSV file to load (default "dataset.csv").
        test_size: forwarded to train_test_split; the default 0.2 holds out
            20% of the rows for testing.
        random_state: forwarded to train_test_split. None (default) gives a
            different random split on every call; pass an int to make the
            split, and therefore the printed metrics, repeatable.
        max_iter: iteration limit forwarded to the solver. Defaults to 1000
            because a run with scikit-learn's own default stopped at the
            iteration limit before converging. If a convergence warning still
            appears, raise this value or scale the features first.

    Returns:
        None; all results are printed.
    """

    #loading dataset
    data = np.loadtxt(dataset_path, delimiter = ",")

    #separate features and labels: all columns but the last are features, the last one is the label
    x = data[: , :-1]
    y = data[: , -1]

    #separating training features, testing features, training labels & testing labels
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size = test_size, random_state = random_state
    )

    print("Training a logistic regression model on given dataset")
    #perf_counter is a monotonic, high-resolution clock meant for measuring short durations
    start = time.perf_counter()
    classifier = lr(max_iter = max_iter)                        #give the solver enough iterations to converge
    print("Logistic regression classifier created.")
    print("Beginning model training.")
    classifier.fit(x_train, y_train)                            #train the model
    print("Model training completed.")
    predictions = classifier.predict(x_test)                    #predict labels for the testing data
    print("Predictions on testing data computed.")
    end = time.perf_counter()

    #the metric functions are called with their default arguments
    #NOTE(review): presumably the labels are binary - confirm against the dataset
    accuracy = 100.0 * accuracy_score(y_test, predictions)
    print("The accuracy of your logistic regression model on testing data is: " + str(accuracy) + " %")
    f1score = f1_score(y_test, predictions)
    print("The f1-score of your logistic regression model on testing data is: " + str(f1score))
    precision = precision_score(y_test, predictions)
    print("The precision of your logistic regression model on testing data is: " + str(precision))
    recall = recall_score(y_test, predictions)
    print("The recall of your logistic regression model on testing data is: " + str(recall))

    #elapsed time covers classifier creation, training and prediction (not loading or scoring)
    runtime = end - start
    print("Total time taken for training and testing by logistic regression model is: " + str(runtime) + " s")

In [4]:
def DecisionTree(dataset_path = "dataset.csv", test_size = 0.2, random_state = None):
    """Train a decision tree classifier on a CSV dataset and print its test metrics.

    The file is read as a comma-delimited numeric table. Every column except
    the last is used as a feature and the last column is used as the label.
    The rows are split into a training and a testing part, the model is fitted
    on the training part, and accuracy, F1, precision, recall and the elapsed
    time are printed for the testing part.

    Parameters:
        dataset_path: path of the CSV file to load (default "dataset.csv").
        test_size: forwarded to train_test_split; the default 0.2 holds out
            20% of the rows for testing.
        random_state: forwarded to both train_test_split and the decision
            tree. None (default) leaves both unseeded; pass an int to make
            the printed metrics repeatable.

    Returns:
        None; all results are printed.
    """

    #loading dataset
    data = np.loadtxt(dataset_path, delimiter = ",")

    #separate features and labels: all columns but the last are features, the last one is the label
    x = data[: , :-1]
    y = data[: , -1]

    #separating training features, testing features, training labels & testing labels
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size = test_size, random_state = random_state
    )

    print("Training a decision tree model on given dataset")
    #perf_counter is a monotonic, high-resolution clock meant for measuring short durations
    start = time.perf_counter()
    classifier = tree.DecisionTreeClassifier(random_state = random_state)
    print("Decision tree classifier created.")
    print("Beginning model training.")
    classifier.fit(x_train, y_train)                            #train the model
    print("Model training completed.")
    predictions = classifier.predict(x_test)                    #predict labels for the testing data
    print("Predictions on testing data computed.")
    end = time.perf_counter()

    #the metric functions are called with their default arguments
    #NOTE(review): presumably the labels are binary - confirm against the dataset
    accuracy = 100.0 * accuracy_score(y_test, predictions)
    print("The accuracy of your decision tree model on testing data is : " + str(accuracy) + " %")
    f1score = f1_score(y_test, predictions)
    print("The f1-score of your decision tree model on testing data is: " + str(f1score))
    precision = precision_score(y_test, predictions)
    print("The precision of your decision tree model on testing data is: " + str(precision))
    recall = recall_score(y_test, predictions)
    print("The recall of your decision tree model on testing data is: " + str(recall))

    #elapsed time covers classifier creation, training and prediction (not loading or scoring)
    runtime = end - start
    print("Total time taken for training and testing by decision tree model is: " + str(runtime) + " s")

In [5]:
def RandomForestClassifer(dataset_path = "dataset.csv", test_size = 0.2, random_state = None):
    """Train a random forest classifier on a CSV dataset and print its test metrics.

    The file is read as a comma-delimited numeric table. Every column except
    the last is used as a feature and the last column is used as the label.
    The rows are split into a training and a testing part, the model is fitted
    on the training part, and accuracy, F1, precision, recall and the elapsed
    time are printed for the testing part.

    The function name keeps its original spelling so existing calls keep working.

    Parameters:
        dataset_path: path of the CSV file to load (default "dataset.csv").
        test_size: forwarded to train_test_split; the default 0.2 holds out
            20% of the rows for testing.
        random_state: forwarded to both train_test_split and the random
            forest. None (default) leaves both unseeded; pass an int to make
            the printed metrics repeatable.

    Returns:
        None; all results are printed.
    """

    #loading dataset
    data = np.loadtxt(dataset_path, delimiter = ",")

    #separate features and labels: all columns but the last are features, the last one is the label
    x = data[: , :-1]
    y = data[: , -1]

    #separating training features, testing features, training labels & testing labels
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size = test_size, random_state = random_state
    )

    print("Training a random forest model on given dataset")
    #perf_counter is a monotonic, high-resolution clock meant for measuring short durations
    start = time.perf_counter()
    classifier = rfc(random_state = random_state)               #using random forest classifier model
    print("Random Forest classifier created.")
    print("Beginning model training.")
    classifier.fit(x_train, y_train)                            #train the model
    print("Model training completed.")
    predictions = classifier.predict(x_test)                    #predict labels for the testing data
    print("Predictions on testing data computed.")
    end = time.perf_counter()

    #the metric functions are called with their default arguments
    #NOTE(review): presumably the labels are binary - confirm against the dataset
    accuracy = 100.0 * accuracy_score(y_test, predictions)      #accuracy as a percentage
    print("The accuracy of your random forest model on testing data is: " + str(accuracy) + " %")
    f1score = f1_score(y_test, predictions)
    print("The f1-score of your random forest model on testing data is: " + str(f1score))
    precision = precision_score(y_test, predictions)
    print("The precision of your random forest model on testing data is: " + str(precision))
    recall = recall_score(y_test, predictions)
    print("The recall of your random forest model on testing data is: " + str(recall))

    #elapsed time covers classifier creation, training and prediction (not loading or scoring)
    runtime = end - start
    print("Total time taken for training and testing by random forest model is: " + str(runtime) + " s")

In [6]:
def SupportVectorMachines(dataset_path = "dataset.csv", test_size = 0.2, random_state = None):
    """Train a support vector classifier on a CSV dataset and print its test metrics.

    The file is read as a comma-delimited numeric table. Every column except
    the last is used as a feature and the last column is used as the label.
    The rows are split into a training and a testing part, the model is fitted
    on the training part, and accuracy, F1, precision, recall and the elapsed
    time are printed for the testing part.

    Parameters:
        dataset_path: path of the CSV file to load (default "dataset.csv").
        test_size: forwarded to train_test_split; the default 0.2 holds out
            20% of the rows for testing.
        random_state: forwarded to train_test_split. None (default) gives a
            different random split on every call; pass an int to make the
            split repeatable.

    Returns:
        None; all results are printed.
    """

    #loading dataset
    data = np.loadtxt(dataset_path, delimiter = ",")

    #separate features and labels: all columns but the last are features, the last one is the label
    x = data[: , :-1]
    y = data[: , -1]

    #separating training features, testing features, training labels & testing labels
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size = test_size, random_state = random_state
    )

    print("Training a support vector machine model on given dataset")
    #perf_counter is a monotonic, high-resolution clock meant for measuring short durations
    start = time.perf_counter()
    classifier = svm.SVC()                                      #support vector classifier with default settings
    print("Support Vector Machines created.")
    print("Beginning model training.")
    classifier.fit(x_train, y_train)                            #train the model
    print("Model training completed.")
    predictions = classifier.predict(x_test)                    #predict labels for the testing data
    print("Predictions on testing data computed.")
    end = time.perf_counter()

    #the metric functions are called with their default arguments
    #NOTE(review): presumably the labels are binary - confirm against the dataset
    accuracy = 100.0 * accuracy_score(y_test, predictions)
    print("The accuracy of your support vector machines model on testing data is: " + str(accuracy) + " %")
    f1score = f1_score(y_test, predictions)
    print("The f1-score of your support vector machines model on testing data is: " + str(f1score))
    precision = precision_score(y_test, predictions)
    print("The precision of your support vector machines model on testing data is: " + str(precision))
    recall = recall_score(y_test, predictions)
    print("The recall of your support vector machines model on testing data is: " + str(recall))

    #elapsed time covers classifier creation, training and prediction (not loading or scoring)
    runtime = end - start
    print("Total time taken for training and testing by support vector machines model is: " + str(runtime) + " s")

In [7]:
def KNearestNeighbour(dataset_path = "dataset.csv", test_size = 0.2, random_state = None):
    """Train a k-nearest neighbours classifier on a CSV dataset and print its test metrics.

    The file is read as a comma-delimited numeric table. Every column except
    the last is used as a feature and the last column is used as the label.
    The rows are split into a training and a testing part, the model is fitted
    on the training part, and accuracy, F1, precision, recall and the elapsed
    time are printed for the testing part.

    Parameters:
        dataset_path: path of the CSV file to load (default "dataset.csv").
        test_size: forwarded to train_test_split; the default 0.2 holds out
            20% of the rows for testing.
        random_state: forwarded to train_test_split. None (default) gives a
            different random split on every call; pass an int to make the
            split repeatable.

    Returns:
        None; all results are printed.
    """

    #loading dataset
    data = np.loadtxt(dataset_path, delimiter = ",")

    #separate features and labels: all columns but the last are features, the last one is the label
    x = data[: , :-1]
    y = data[: , -1]

    #separating training features, testing features, training labels & testing labels
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size = test_size, random_state = random_state
    )

    print("Training a k nearest neighbours model on given dataset")
    #perf_counter is a monotonic, high-resolution clock meant for measuring short durations
    start = time.perf_counter()
    classifier = KNeighborsClassifier()                         #k-nearest neighbours with default settings
    print("K-Nearest Neighbours created.")
    print("Beginning model training.")
    classifier.fit(x_train, y_train)                            #train the model
    print("Model training completed.")
    predictions = classifier.predict(x_test)                    #predict labels for the testing data
    print("Predictions on testing data computed.")
    end = time.perf_counter()

    #the metric functions are called with their default arguments
    #NOTE(review): presumably the labels are binary - confirm against the dataset
    accuracy = 100.0 * accuracy_score(y_test, predictions)
    print("The accuracy of your k-nearest neighbours model on testing data is: " + str(accuracy) + " %")
    f1score = f1_score(y_test, predictions)
    print("The f1-score of your k-nearest neighbours model on testing data is: " + str(f1score))
    precision = precision_score(y_test, predictions)
    print("The precision of your k-nearest neighbours model on testing data is: " + str(precision))
    recall = recall_score(y_test, predictions)
    print("The recall of your k-nearest neighbours model on testing data is: " + str(recall))

    #elapsed time covers classifier creation, training and prediction (not loading or scoring)
    runtime = end - start
    print("Total time taken for training and testing by k-nearest neighbours model is: " + str(runtime) + " s")

In [8]:
#train and evaluate the logistic regression model; the function prints its metrics and elapsed time
LogisticRegression()

Training a logistic regression model on given dataset
Logistic regression classifier created.
Beginning model training.
Model training completed.
Predictions on testing data computed.
The accuracy of your logistic regression model on testing data is: 74.02597402597402 %
The f1-score of your logistic regression model on testing data is: 0.5833333333333334
The precision of your logistic regression model on testing data is: 0.6222222222222222
The recall of your logistic regression model on testing data is: 0.5490196078431373
Total time taken for training and testing by logistic regression model is: 0.05338430404663086 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [9]:
#train and evaluate the decision tree model; the function prints its metrics and elapsed time
DecisionTree()

Training a decision tree model on given dataset
Decision tree classifier created.
Beginning model training.
Model training completed.
Predictions on testing data computed.
The accuracy of your decision tree model on testing data is : 67.53246753246754 %
The f1-score of your decision tree model on testing data is: 0.5689655172413792
The precision of your decision tree model on testing data is: 0.6111111111111112
The recall of your decision tree model on testing data is: 0.532258064516129
Total time taken for training and testing by decision tree model is: 0.0156252384185791 s


In [10]:
#train and evaluate the random forest model; the function prints its metrics and elapsed time
RandomForestClassifer()

Training a random forest model on given dataset
Random Forest classifier created.
Beginning model training.
Model training completed.
Predictions on testing data computed.
The accuracy of your random forest model on testing data is: 80.51948051948052 %
The f1-score of your random forest model on testing data is: 0.6875000000000001
The precision of your random forest model on testing data is: 0.7857142857142857
The recall of your random forest model on testing data is: 0.6111111111111112
Total time taken for training and testing by random forest model is: 0.36982035636901855 s


In [11]:
#train and evaluate the support vector machines model; the function prints its metrics and elapsed time
SupportVectorMachines()

Training a support vector machine model on given dataset
Support Vector Machines created.
Beginning model training.
Model training completed.
Predictions on testing data computed.
The accuracy of your support vector machines model on testing data is: 67.53246753246754 %
The f1-score of your support vector machines model on testing data is: 0.40476190476190477
The precision of your support vector machines model on testing data is: 0.5151515151515151
The recall of your support vector machines model on testing data is: 0.3333333333333333
Total time taken for training and testing by support vector machines model is: 0.02213430404663086 s


In [12]:
#train and evaluate the k-nearest neighbours model; the function prints its metrics and elapsed time
KNearestNeighbour()

Training a k nearest neighbours model on given dataset
K-Nearest Neighbours created.
Beginning model training.
Model training completed.
Predictions on testing data computed.
The accuracy of your k-nearest neighbours model on testing data is: 72.07792207792207 %
The f1-score of your k-nearest neighbours model on testing data is: 0.5742574257425742
The precision of your k-nearest neighbours model on testing data is: 0.7073170731707317
The recall of your k-nearest neighbours model on testing data is: 0.48333333333333334
Total time taken for training and testing by k-nearest neighbours model is: 0.015622854232788086 s
