In [1]:
import pandas as pd
from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn import linear_model

In [2]:
FILES_PATH = "../data/"

In [3]:
dataset_1_25 = pd.read_excel(FILES_PATH + "Dataset1_25%.xlsx")
dataset_1_50 = pd.read_excel(FILES_PATH + "Dataset1_50%.xlsx")
dataset_1_75 = pd.read_excel(FILES_PATH + "Dataset1_75%.xlsx")
dataset_1_100 = pd.read_excel(FILES_PATH + "Dataset1_100%.xlsx")

In [4]:
all_data = {
    "25%" : dataset_1_25,
    "50%" : dataset_1_50,
    "75%" : dataset_1_75,
    "100%" : dataset_1_100
}

In [5]:
all_features_1 = ["Tarea",	"Glosario",	"Cuestionario",	"Foro",	"Carpeta",	"Recurso",	"URL"]

In [6]:
features_selected = ["Tarea", "Cuestionario", "Carpeta"]

In [20]:
final_result = pd.DataFrame(columns=["accuracy", "precision", "recall", "f1"])

for dataset in all_data.keys():
    evaluation = pd.DataFrame(columns=["accuracy", "precision", "recall", "f1"])
    #80% para entrenamiento y 20% para evaluación
    train_set = all_data[dataset].iloc[:2308]
    test_set  = all_data[dataset].iloc[2308:]

    #preparando la validacion cruzada
    y = train_set.copy().pop("status")
    X = train_set.copy().values
    ss = ShuffleSplit(n_splits=5, test_size=0.20, random_state=0)
    Kfold = {}
    for i, (train_index, test_index) in enumerate(ss.split(X)):
        Kfold[i] = {
            "train" : train_index,
            "test" : test_index
        }

    for i in Kfold.keys():
        train = train_set.iloc[Kfold[i]["train"]]
        test = train_set.iloc[Kfold[i]["test"]]
        model = linear_model.LinearRegression()
        model.fit(train[all_features_1], train["status"])
        pred = model.predict(test[all_features_1])
        print(pred)
        y_pred_clasificacion = (pred > 0.8).astype(int)
        accuracy = accuracy_score(test["status"], y_pred_clasificacion)
        precision = precision_score(test["status"], y_pred_clasificacion)
        recall = recall_score(test["status"],y_pred_clasificacion)
        f1 = f1_score(test["status"],y_pred_clasificacion)
        evaluation = pd.concat([evaluation, pd.DataFrame({"accuracy" : accuracy, 
                                                            "precision" : precision, 
                                                            "recall": recall, 
                                                            "f1": f1}, index = [0])], ignore_index=True)

    final_result = pd.concat([final_result, pd.DataFrame({
        "accuracy" : evaluation["accuracy"].mean(),
        "precision" : evaluation["precision"].mean(),
        "recall" : evaluation["recall"].mean(),
        "f1" : evaluation["f1"].mean()
    }, index = [0])], ignore_index =True)

[ 0.82086963  0.80478589  0.78870215  0.82086963  0.81384308  0.8139766
  0.81359608  0.42815606  0.80019054  0.81321071  0.82916093  0.77633613
  0.82010374  0.80555178  0.81091303  0.82086963  0.76090944  0.78688175
  0.82086963  0.8139766   0.81091303  0.80402     0.77814074  0.81429279
  0.82118582  0.81244481  0.77515772  0.81014714  0.81167892  0.80708357
  0.8139766   0.82195171  0.81812225  0.79559518  0.81474249  0.81474249
  0.82086963  0.81091303  0.82086963  0.91140862  0.80938124  0.9208993
  0.82086963  0.81321071  0.82866055  0.82427854  0.80938124  0.81474249
  0.80172232  0.82086963  0.81690665  0.84970651  0.80861535  0.80325411
  0.82086963  0.8227176   0.81321071  0.74766613  0.81812225  0.83256498
  0.82086963  0.80272603  0.81167892  0.66139478  0.80861535  0.81474249
  0.81014714  0.81046333  0.80172232  0.84925681 -0.07995748  0.81167892
  0.79559518  0.79712697  0.82086963  0.81550838  0.82010374  0.80563233
  0.81244481  0.79555144  0.80248821  0.82086963  0.8

In [21]:
final_result.index = all_data.keys()

In [22]:
final_result

Unnamed: 0,accuracy,precision,recall,f1
25%,0.664502,0.841331,0.728835,0.778857
50%,0.561039,0.838568,0.577324,0.67627
75%,0.546753,0.836072,0.557135,0.661097
100%,0.534632,0.820567,0.552311,0.657822


In [23]:
final_result.to_json(FILES_PATH + "LR_METRICAS_1.json")