In [1]:
import pandas as pd
from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn import tree

In [2]:
# Directory used for both the input workbooks and the exported metrics file.
# Paths are built by plain string concatenation, so the trailing slash is required.
FILES_PATH = "../data/"

In [3]:
# Load the four Dataset2 workbooks (25%, 50%, 75%, 100%) in that order,
# binding each resulting frame to its own name.
dataset_2_25, dataset_2_50, dataset_2_75, dataset_2_100 = (
    pd.read_excel(FILES_PATH + workbook_name)
    for workbook_name in (
        "Dataset2_25%.xlsx",
        "Dataset2_50%.xlsx",
        "Dataset2_75%.xlsx",
        "Dataset2_100%.xlsx",
    )
)

In [4]:
# Map each percentage label to its loaded frame; insertion order (25% -> 100%)
# is the order in which the datasets are later evaluated.
all_data = dict(
    zip(
        ("25%", "50%", "75%", "100%"),
        (dataset_2_25, dataset_2_50, dataset_2_75, dataset_2_100),
    )
)

In [9]:
# Full list of candidate feature column names for Dataset2.
all_features_2 = [
    # assignment and quiz activity
    "assign_view", "assign_submit",
    "quiz_attempt", "quiz_submit", "quiz_view",
    # forum and resource activity
    "forum_part", "forum_view",
    "resource_view", "folder_view", "url_view",
    # AM / PM indicators
    "AM+", "AM-", "PM+", "PM-",
]

In [10]:
# Column names fed to the classifier in the cross-validation cell.
features_selected = [
    "assign_view", "assign_submit",
    "quiz_attempt", "quiz_submit", "quiz_view",
    "PM+", "PM-",
    "TDS", "TDA", "ADS",
]

In [28]:
# Evaluate a random forest on every dataset in `all_data` with 5 shuffled
# 80/20 splits of the training rows, and collect the mean accuracy, precision,
# recall and F1 of each dataset into `final_result` (one row per dataset,
# default integer index, in `all_data` order).

METRIC_NAMES = ["accuracy", "precision", "recall", "f1"]

# Rows [0, TRAIN_ROWS) are the training portion; the rest is held out.
# The original note calls this an 80% / 20% split; that only holds if every
# workbook has about 2885 rows -- TODO confirm against the data.
TRAIN_ROWS = 2308

# One seed for both the splitter and the forest so a re-run reproduces the
# exact same table (the forest was previously unseeded).
RANDOM_STATE = 0

summary_records = []
for dataset in all_data.keys():
    train_set = all_data[dataset].iloc[:TRAIN_ROWS]
    # Held-out rows; kept for later evaluation but not used in this cell.
    test_set = all_data[dataset].iloc[TRAIN_ROWS:]

    # Cross-validation setup: the splitter only needs the number of rows,
    # so the frame is passed directly instead of a copied value array.
    ss = ShuffleSplit(n_splits=5, test_size=0.20, random_state=RANDOM_STATE)

    fold_records = []
    for train_index, test_index in ss.split(train_set):
        train = train_set.iloc[train_index]
        test = train_set.iloc[test_index]

        model = RandomForestClassifier(
            max_depth=4,
            criterion="entropy",
            random_state=RANDOM_STATE,
        )
        model.fit(train[features_selected], train["status"])
        pred = model.predict(test[features_selected])

        fold_records.append({
            "accuracy": accuracy_score(test["status"], pred),
            "precision": precision_score(test["status"], pred),
            "recall": recall_score(test["status"], pred),
            "f1": f1_score(test["status"], pred),
        })

    # Build each frame once from the collected records rather than growing it
    # with pd.concat inside the loop.
    evaluation = pd.DataFrame(fold_records, columns=METRIC_NAMES)
    summary_records.append(
        {metric: evaluation[metric].mean() for metric in METRIC_NAMES}
    )

final_result = pd.DataFrame(summary_records, columns=METRIC_NAMES)

In [29]:
# Label each summary row with its dataset key; rows were produced in the
# iteration order of `all_data`, so the labels line up positionally.
final_result.index = list(all_data)

In [30]:
# Display the table of mean cross-validation metrics, one row per dataset label.
final_result

Unnamed: 0,accuracy,precision,recall,f1
25%,0.825108,0.826575,0.994762,0.902849
50%,0.825541,0.826632,0.995255,0.903096
75%,0.827273,0.827511,0.996318,0.90406
100%,0.82684,0.828597,0.993677,0.903615


In [31]:
# Write the summary table as JSON into the same directory the data is read from.
metrics_json_path = FILES_PATH + "RF_METRICAS_2_fs.json"
final_result.to_json(metrics_json_path)