In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Never truncate wide frames: render every column when a DataFrame is displayed.
pd.options.display.max_columns = None

# Read the preprocessed dataset from its comma-separated file, then preview
# the first 20 rows as this cell's output.
data = pd.read_csv("Data/preprocessed_data.csv", sep=",")
data.head(20)

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target,Admission grade bins,Age group,Unemployment rate bins
0,1,17,5,171,1,1,122.0,1.0,19,12,5,9,127.3,1,0,0,1,1.0,0,20.0,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout,,18-22,10-15%
1,1,15,1,9254,1,1,160.0,1.0,1,3,3,3,142.5,1,0,0,0,1.0,0,19.0,0,0,6,6,6,14.0,0,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate,,18-22,10-15%
2,1,1,5,9070,1,1,122.0,1.0,37,37,9,9,124.8,1,0,0,0,1.0,0,19.0,0,0,6,0,0,0.0,0,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout,,18-22,10-15%
3,1,17,2,9773,1,1,122.0,1.0,38,37,5,3,119.6,1,0,0,1,0.0,0,20.0,0,0,6,8,6,13.428571,0,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate,,18-22,5-10%
4,2,39,1,8014,0,1,100.0,1.0,37,38,9,9,141.5,0,0,0,1,0.0,0,34.0,0,0,6,9,5,12.333333,0,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate,,28-35,10-15%
5,2,39,1,9991,0,19,133.1,1.0,37,37,9,7,114.8,0,0,1,1,1.0,0,34.0,0,0,5,10,5,11.857143,0,0,5,17,5,11.5,5,16.2,0.3,-0.92,Graduate,,28-35,15%+
6,1,1,1,9500,1,1,142.0,1.0,19,38,7,10,128.4,1,0,0,1,0.0,1,18.0,0,0,7,9,7,13.3,0,0,8,8,8,14.345,0,15.5,2.8,-4.06,Graduate,,18-22,15%+
7,1,18,4,9254,1,1,119.0,1.0,37,37,9,9,113.1,1,0,0,0,1.0,0,22.0,0,0,5,5,0,0.0,0,0,5,5,0,0.0,0,15.5,2.8,-4.06,Dropout,,18-22,15%+
8,1,1,3,9238,1,1,137.0,62.0,1,1,9,9,129.3,0,0,0,1,0.0,1,21.0,1,0,6,8,6,13.875,0,0,6,7,6,14.142857,0,16.2,0.3,-0.92,Graduate,,18-22,15%+
9,1,1,1,9238,1,1,138.0,1.0,1,19,4,7,123.0,1,0,1,0,0.0,0,18.0,0,0,6,9,5,11.4,0,0,6,14,2,13.5,0,8.9,1.4,3.51,Dropout,,18-22,5-10%


In [10]:
from mlxtend.frequent_patterns import apriori, association_rules
import time

# Association-rule mining on the categorical attributes of `data`:
#   1. cast the hand-picked categorical columns to strings,
#   2. one-hot encode them into a boolean item matrix,
#   3. mine frequent itemsets with Apriori,
#   4. derive rules filtered by confidence, display them and save them to CSV.

# Mining thresholds, named so they can be tuned in one place.
MIN_SUPPORT = 0.05      # minimum support passed to apriori
MAX_ITEMSET_LEN = 2     # largest itemset size apriori will generate
MIN_CONFIDENCE = 0.70   # minimum confidence a rule needs to be kept

# Handpick categorical columns from dataset, may bin numerical variables and perform apriori on them later
categorical_columns = [
    'Marital status', 'Application mode', 'Application order', 'Course',
    'Daytime/evening attendance', 'Previous qualification', 'Nacionality',
    "Mother's qualification", "Father's qualification", "Mother's occupation",
    "Father's occupation", 'Displaced', 'Educational special needs', 'Debtor',
    'Tuition fees up to date', 'Gender', 'Scholarship holder', 'International', 'Target',
    'Age group', 'Unemployment rate bins'
]

# Numeric columns that are not turned into items; they are removed from the
# encoded frame because apriori only accepts True/False (or 0/1) values.
numeric_columns = [
    "Previous qualification (grade)", "Admission grade",
    "Age at enrollment", "Curricular units 1st sem (credited)",
    "Curricular units 1st sem (enrolled)", "Curricular units 1st sem (evaluations)",
    "Curricular units 1st sem (approved)", "Curricular units 1st sem (grade)",
    "Curricular units 1st sem (without evaluations)", "Curricular units 2nd sem (credited)",
    "Curricular units 2nd sem (enrolled)", "Curricular units 2nd sem (evaluations)",
    "Curricular units 2nd sem (approved)", "Curricular units 2nd sem (grade)",
    "Curricular units 2nd sem (without evaluations)", "Unemployment rate",
    "Inflation rate", "GDP"
]

# Convert categorical columns to string so numeric codes are treated as labels.
# A plain astype(str) would also turn missing values into the literal string
# "nan", which get_dummies would then encode as a real item (e.g. "Age group_nan").
# Restoring NaN afterwards keeps missing values missing; the operation is
# idempotent, so re-running the cell gives the same result.
for col in categorical_columns:
    data[col] = data[col].astype(str).where(data[col].notna())

# One-hot encode every string-typed column. dtype=bool makes the generated
# dummy columns boolean regardless of the pandas version, which is the input
# type mlxtend's apriori is designed for. Missing values produce no item
# (get_dummies ignores NaN unless dummy_na=True). Numeric columns pass through
# get_dummies untouched and are dropped here.
encoded_data = pd.get_dummies(data, dtype=bool).drop(columns=numeric_columns)

# perf_counter is the clock intended for measuring short elapsed durations.
start_time = time.perf_counter()

# Applying the Apriori algorithm with minimum support threshold
frequent_itemsets = apriori(
    encoded_data,
    min_support=MIN_SUPPORT,
    max_len=MAX_ITEMSET_LEN,
    use_colnames=True,
    verbose=1,
)

# Generating the association rules with minimum confidence threshold
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=MIN_CONFIDENCE)

# The reported time covers both itemset mining and rule generation.
print("--- %s seconds ---" % (time.perf_counter() - start_time))

# Displaying the resulting rules, most confident first
sorted_rules = rules.sort_values(by='confidence', ascending=False)
display(sorted_rules)

# Persist the sorted rules (without the DataFrame index) for later inspection.
sorted_rules.to_csv("Data/found_rules_lol.csv", index=False)

Processing 156 combinations | Sampling itemset size 2
--- 0.009927034378051758 seconds ---


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
30,(Nacionality_1.0),(International_0),0.956187,0.97376,0.956187,1.0,1.026947,0.02509,inf,0.598901
7,(Age group_18-22),(Marital status_1),0.686567,0.885412,0.681512,0.992637,1.121102,0.073617,15.562189,0.344637
9,(Application order_1),(Educational special needs_0),0.683438,0.988686,0.676938,0.99049,1.001825,0.001233,1.189688,0.005753
37,(Scholarship holder_0),(Educational special needs_0),0.751324,0.988686,0.744102,0.990388,1.001722,0.001279,1.177074,0.006911
34,(Debtor_0),(Educational special needs_0),0.8883,0.988686,0.87843,0.988889,1.000206,0.000181,1.018296,0.00184
24,(Nacionality_1.0),(Educational special needs_0),0.956187,0.988686,0.945354,0.988671,0.999985,-1.4e-05,0.998684,-0.000344
39,(International_0),(Educational special needs_0),0.97376,0.988686,0.962687,0.988628,0.999942,-5.6e-05,0.994929,-0.002218
35,(Tuition fees up to date_1),(Educational special needs_0),0.880597,0.988686,0.870005,0.987972,0.999278,-0.000629,0.940638,-0.006016
32,(Displaced_1),(Educational special needs_0),0.547424,0.988686,0.540684,0.987687,0.99899,-0.000547,0.918891,-0.002229
18,(Previous qualification_1),(Educational special needs_0),0.837987,0.988686,0.827636,0.987647,0.99895,-0.00087,0.91594,-0.006448
