In [1]:
import pandas as pd
from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn import linear_model

In [2]:
FILES_PATH = "../data/"

In [3]:
dataset_2_25 = pd.read_excel(FILES_PATH + "Dataset2_25%.xlsx")
dataset_2_50 = pd.read_excel(FILES_PATH + "Dataset2_50%.xlsx")
dataset_2_75 = pd.read_excel(FILES_PATH + "Dataset2_75%.xlsx")
dataset_2_100 = pd.read_excel(FILES_PATH + "Dataset2_100%.xlsx")

In [4]:
all_data = {
    "25%" : dataset_2_25,
    "50%" : dataset_2_50,
    "75%" : dataset_2_75,
    "100%" : dataset_2_100
}

In [5]:
all_features_2 = ["assign_view", "assign_submit", "quiz_attempt", "quiz_submit"
                ,"quiz_view", "forum_part", "forum_view", "resource_view",
                "folder_view", "url_view", "AM+", "AM-", "PM+", "PM-"]

In [6]:
features_selected = ["assign_view", "assign_submit", "quiz_attempt", "quiz_submit"
                    ,"quiz_view","PM+", "PM-", "TDS", "TDA", "ADS"]

In [11]:
final_result = pd.DataFrame(columns=["accuracy", "precision", "recall", "f1"])

for dataset in all_data.keys():
    evaluation = pd.DataFrame(columns=["accuracy", "precision", "recall", "f1"])
    #80% para entrenamiento y 20% para evaluación
    train_set = all_data[dataset].iloc[:2308]
    test_set  = all_data[dataset].iloc[2308:]

    #preparando la validacion cruzada
    y = train_set.copy().pop("status")
    X = train_set.copy().values
    ss = ShuffleSplit(n_splits=5, test_size=0.20, random_state=0)
    Kfold = {}
    for i, (train_index, test_index) in enumerate(ss.split(X)):
        Kfold[i] = {
            "train" : train_index,
            "test" : test_index
        }

    for i in Kfold.keys():
        train = train_set.iloc[Kfold[i]["train"]]
        test = train_set.iloc[Kfold[i]["test"]]
        model = linear_model.LinearRegression()
        model.fit(train[features_selected], train["status"])
        pred = model.predict(test[features_selected])
        print(pred)
        y_pred_clasificacion = (pred > 0.8).astype(int)
        accuracy = accuracy_score(test["status"], y_pred_clasificacion)
        precision = precision_score(test["status"], y_pred_clasificacion)
        recall = recall_score(test["status"],y_pred_clasificacion)
        f1 = f1_score(test["status"],y_pred_clasificacion)
        evaluation = pd.concat([evaluation, pd.DataFrame({"accuracy" : accuracy, 
                                                            "precision" : precision, 
                                                            "recall": recall, 
                                                            "f1": f1}, index = [0])], ignore_index=True)

    final_result = pd.concat([final_result, pd.DataFrame({
        "accuracy" : evaluation["accuracy"].mean(),
        "precision" : evaluation["precision"].mean(),
        "recall" : evaluation["recall"].mean(),
        "f1" : evaluation["f1"].mean()
    }, index = [0])], ignore_index =True)

[0.77189204 0.88238179 0.71706959 0.44315645 0.86729086 0.83050726
 0.78071212 0.92915413 0.77571744 0.76578161 0.9131592  0.81234859
 0.7989816  0.83772714 0.81055854 0.89706155 0.64473026 0.91351821
 0.7439404  0.82499689 0.91826852 0.72248949 0.82661694 0.87003928
 0.77397942 0.75513328 0.94166129 0.76591355 0.83744762 0.82797081
 0.84048005 0.8783292  0.86697388 0.79265867 0.51969461 0.92062949
 0.89691729 0.90275648 0.85964099 0.88148782 0.87991126 0.85462158
 0.89330163 0.82495656 0.8267141  0.88612754 0.63916596 0.82937642
 0.71665512 0.72046277 0.81429762 0.82889351 0.72449021 0.65687582
 0.75110417 0.92389826 0.85646635 0.83769406 0.90686729 0.77542318
 0.82625337 0.73273517 0.89351637 0.54686705 0.96804087 0.92318078
 0.82881372 0.78061881 0.63307155 0.90251569 0.73179231 0.82175804
 0.78569374 0.84770176 0.87434    0.85342627 0.76346259 0.9433289
 0.91033543 0.58183655 0.8031758  0.78059269 0.76610993 0.81942459
 0.79421955 0.80336051 0.94410234 0.75730002 0.70543727 0.75174

In [12]:
final_result.index = all_data.keys()

In [13]:
final_result

Unnamed: 0,accuracy,precision,recall,f1
25%,0.619048,0.864745,0.633755,0.730874
50%,0.597835,0.879188,0.589057,0.705269
75%,0.59697,0.869063,0.597769,0.707809
100%,0.583983,0.859594,0.587616,0.697706


In [14]:
final_result.to_json(FILES_PATH + "LR_METRICAS_2_fs.json")