| Universidade de Évora<br/>Ano Letivo 2020/2021<br/>Aprendizagem Automática - Trabalho 2 |
| ------------ |
| Trabalho realizado por:<br/>José Lopes nº 37861<br/>Dinis Matos nº 42738 |

Este trabalho tem como objetivo construir um modelo preditivo que identifique os alunos em risco de abandonar o curso. Os dados analisados provêm todos da universidade, por exemplo: os vários cursos em que os alunos estão inscritos, o número de pessoas inscritas nas cadeiras desses cursos e as notas que obtiveram.

Foram utilizados 4 tipos diferentes de modelos preditivos: Random Forest, Gradient Tree Boosting, Decision Tree e Weighted Average Probabilities (Soft Voting). O mais bem-sucedido foi o Random Forest, que obteve scores de 95% (max_depth=30) e 93% (max_depth=5). Seguiu-se o Weighted Average Probabilities (Soft Voting) com 92%, depois o Gradient Tree Boosting com 91% e, por último, o Decision Tree com 90%. Ou seja, a ordem dos 5 Setups do código, do melhor para o pior, é: 1º/2º/5º/3º/4º.

O "output" (as previsões) é guardado no ficheiro "result.csv", que é criado ou substituído a cada execução.


In [None]:
# Imports
import pandas as pd
import sklearn as sk
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from itertools import product
from sklearn.ensemble import VotingClassifier

def removeDuplicatesFromDict(dictionary):
    """Return a copy of ``dictionary`` keeping only the first key per distinct value.

    Entries are visited in the dictionary's iteration order; an entry is
    dropped when an equal value has already been kept under an earlier key.

    Hashable values are tracked in a set, so the common case runs in linear
    time instead of rescanning a list for every entry. Unhashable values fall
    back to a linear scan over the values kept so far.

    Args:
        dictionary: Mapping to filter. It is not modified.

    Returns:
        A new ``dict`` with the surviving ``key: value`` pairs, in their
        original relative order.
    """
    seen = set()
    new_dict = {}
    for key, value in dictionary.items():
        try:
            is_duplicate = value in seen
            if not is_duplicate:
                seen.add(value)
        except TypeError:
            # Unhashable value (list, dict, ...): it cannot be stored in the
            # set, so compare it against every value kept so far.
            is_duplicate = value in new_dict.values()
        if not is_duplicate:
            new_dict[key] = value
    return new_dict

def getTrainAndTest():
    """Load ``train.csv`` and ``test.csv`` and encode ``Program`` as integers.

    ``Program`` is cast to string in both files. Each distinct value is then
    replaced by the row label of its first occurrence in ``train.csv``, and
    the same codes are applied to ``test.csv``. A ``Program`` value that
    appears only in ``test.csv`` is encoded as ``-1`` instead of raising
    ``KeyError``, so one unseen programme no longer aborts the whole run.

    Returns:
        A tuple ``(x_train, y_train, x_test)``:
        ``x_train`` is the training frame without the ``Failure`` column,
        ``y_train`` is the ``Failure`` column, and ``x_test`` is the encoded
        test frame.
    """
    # Codes taken from train.csv are row labels of a freshly read frame
    # (0, 1, 2, ...), so -1 can never collide with a real code.
    unknown_program = -1

    # Open train
    train = pd.read_csv('train.csv')
    train = train.astype({"Program": "string"})

    # Map every Program to the row label where it first appears.
    program_codes = {}
    for row_label, program in train.Program.items():
        program_codes.setdefault(program, row_label)

    # Encode train
    train["Program"] = [program_codes[program] for program in train.Program]

    # Split the target from the features
    y_train = train.Failure
    x_train = train.drop('Failure', axis=1)

    # Open test
    x_test = pd.read_csv('test.csv')
    x_test = x_test.astype({"Program": "string"})

    # Encode test with the training codes; unseen programmes get the sentinel.
    x_test["Program"] = [
        program_codes.get(program, unknown_program) for program in x_test.Program
    ]

    return x_train, y_train, x_test

In [None]:
# Setup 1: Random Forests (n_estimators=100, max_depth>=20) (0.95833)
def getPredict(x_train, y_train, x_test):
    """Train a random forest and return its predictions for ``x_test``.

    The forest uses 100 trees, a maximum depth of 30 and a fixed
    ``random_state`` of 0.

    Args:
        x_train: Training features.
        y_train: Training labels.
        x_test: Features to predict.

    Returns:
        The labels predicted for ``x_test``.
    """
    forest = RandomForestClassifier(
        random_state=0,
        max_depth=30,
        n_estimators=100,
    )
    # fit() returns the fitted estimator, so the calls can be chained.
    return forest.fit(x_train, y_train).predict(x_test)

In [None]:
# Entry point: load the data, predict and write the submission file.
def main():
    """Run the full pipeline and save the predictions to ``result.csv``.

    Loads the data through ``getTrainAndTest``, obtains predictions from
    whichever ``getPredict`` is currently defined, and writes a CSV with the
    columns ``Id`` (taken from the test frame) and ``Failure`` (the
    predictions).
    """
    features, labels, test_features = getTrainAndTest()
    predictions = getPredict(features, labels, test_features)

    # Build the submission frame and write it without the index column.
    submission = pd.DataFrame({'Id': test_features.Id, 'Failure': predictions})
    submission.to_csv('result.csv', index=False)

    print("File result.csv created/changed with the prediction\n")

# Call main() only when this module's __name__ is "__main__", i.e. when it is
# executed directly rather than imported.
if __name__ == "__main__":
    main()

In [None]:
# Setup 2: Random Forests (n_estimators=100, max_depth=5) (0.93706)
def getPredict(x_train, y_train, x_test):
    """Train a shallow random forest and return its predictions for ``x_test``.

    The forest uses 100 trees, a maximum depth of 5 and a fixed
    ``random_state`` of 0.

    Args:
        x_train: Training features.
        y_train: Training labels.
        x_test: Features to predict.

    Returns:
        The labels predicted for ``x_test``.
    """
    settings = {"n_estimators": 100, "max_depth": 5, "random_state": 0}
    model = RandomForestClassifier(**settings)
    model.fit(x_train, y_train)
    return model.predict(x_test)

In [None]:
# Setup 3: Gradient Tree Boosting (n_estimators=100, max_depth=100, random_state=0) (0.91666)
def getPredict(x_train, y_train, x_test):
    """Train a gradient boosting classifier and return predictions for ``x_test``.

    The classifier uses 100 boosting stages, a maximum tree depth of 100 and a
    fixed ``random_state`` of 0.

    Args:
        x_train: Training features.
        y_train: Training labels.
        x_test: Features to predict.

    Returns:
        The labels predicted for ``x_test``.
    """
    booster = GradientBoostingClassifier(
        n_estimators=100,
        max_depth=100,
        random_state=0,
    )
    fitted = booster.fit(x_train, y_train)
    predictions = fitted.predict(x_test)
    return predictions

In [None]:
# Setup 4: Decision Tree (max_depth=20) (0.90410)
def getPredict(x_train, y_train, x_test):
    """Train a single decision tree and return its predictions for ``x_test``.

    The tree is limited to a depth of 20. No ``random_state`` is passed to the
    classifier.

    Args:
        x_train: Training features.
        y_train: Training labels.
        x_test: Features to predict.

    Returns:
        The labels predicted for ``x_test``.
    """
    tree = DecisionTreeClassifier(max_depth=20)
    # fit() returns the fitted estimator, so predict() can be chained on it.
    return tree.fit(x_train, y_train).predict(x_test)

In [None]:
# Setup 5: Weighted Average Probabilities (Soft Voting) (0.92307)
def getPredict(x_train, y_train, x_test):
    """Train a soft-voting ensemble and return its predictions for ``x_test``.

    The ensemble combines a decision tree (max depth 20), a 7-nearest-neighbours
    classifier and an RBF-kernel SVC with probability estimates enabled.
    Their predicted probabilities are averaged with weights 2, 1 and 2
    respectively.

    Args:
        x_train: Training features.
        y_train: Training labels.
        x_test: Features to predict.

    Returns:
        The labels predicted for ``x_test``.
    """
    DT = DecisionTreeClassifier(max_depth=20)
    KNN = KNeighborsClassifier(n_neighbors=7)
    SV = SVC(kernel='rbf', probability=True)
    VC = VotingClassifier(
        estimators=[('dt', DT), ('knn', KNN), ('svc', SV)],
        voting='soft',
        weights=[2, 1, 2],
    )

    # VotingClassifier.fit trains its own clones of the base estimators, so
    # fitting DT, KNN and SV individually beforehand would be discarded work.
    VC.fit(x_train, y_train)

    return VC.predict(x_test)