In [3]:
#libraries
from google.colab import drive
import os
import pandas as pd
from itertools import combinations, product
from pandas import set_option


In [4]:
# Mount Google Drive inside the Colab runtime, then switch the working
# directory to the project's data folder so that later relative paths
# (such as the CSV read in the next cell) resolve against it.
# NOTE(review): the absolute Drive path is specific to one account layout;
# anyone else running this notebook has to adjust it.
drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/Data Mining + DL/Data Mining/data')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Accessing Data:

In [5]:
# Read the data set from the current working directory and preview the
# first five rows (five is the default for `head`).
df = pd.read_csv('clean_data.csv')
df.head()

Unnamed: 0,Age,Gender,Country,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,...,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
0,37,Female,United States,No,No,Yes,Often,6-25,No,Yes,...,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No
1,44,Male,United States,No,No,No,Rarely,More than 1000,No,No,...,Don't know,Don't know,Maybe,No,No,No,No,No,Don't know,No
2,32,Male,Canada,No,No,No,Rarely,6-25,No,Yes,...,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No
3,31,Male,United Kingdom,No,Yes,Yes,Often,26-100,No,Yes,...,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes
4,31,Male,United States,No,No,No,Never,100-500,Yes,Yes,...,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No


# Creating Rules:

In [6]:
def get_count(data, tuples):
    """Count the rows of `data` that match every (column, value) pair.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to count in. It is only read, never copied or modified.
    tuples : list of (column, value)
        Conditions that must all hold for a row to be counted.

    Returns
    -------
    Number of rows where each listed column equals its paired value.

    Raises
    ------
    KeyError
        If the requested combination of values never occurs in `data`.
    """
    ordered_categories = [category for category, _ in tuples]
    # `size()` counts rows per group directly. No helper column is added, so
    # a real column that happens to be called "Count" is handled like any other.
    counts = data.groupby(ordered_categories).size()
    count = counts
    # Drill down one index level per condition; `.loc` keeps this a pure
    # label lookup even when the values are integers.
    for _, value in tuples:
        count = count.loc[value]
    return count

In [7]:
def get_support(data, tuples):
    """Return the share of rows in `data` matching all (column, value) pairs.

    This is the matching-row count from `get_count` divided by the total
    number of rows in `data`.
    """
    matching_rows = get_count(data, tuples)
    total_rows = len(data)
    return matching_rows / total_rows

In [8]:
def get_confidence(data, predictors, outcomes):
    """Return the confidence of the rule `predictors -> outcomes`.

    Both arguments are lists of (column, value) pairs. The result is the
    support of predictors and outcomes together divided by the support of
    the predictors alone.
    """
    joint_support = get_support(data, predictors + outcomes)
    predictor_support = get_support(data, predictors)
    return joint_support / predictor_support

In [9]:
def get_lift(data, tuples):
    """Return joint support divided by the product of single-item supports.

    `tuples` is a list of (column, value) pairs. Every pair is treated as a
    separate item, so with more than two pairs this is the itemset-level
    ratio rather than confidence divided by the support of one outcome.
    """
    # Support expected if all items were independent of one another.
    independent_support = 1
    for item in tuples:
        independent_support = independent_support * get_support(data, [item])
    joint_support = get_support(data, tuples)
    return joint_support / independent_support

In [10]:
def get_category_value_tuples(data, categories):
    """List every (column, value) pair observed in the given columns.

    Columns are visited in the order given; within a column, values appear
    in the order returned by `Series.unique()`.
    """
    return [
        (column, observed)
        for column in categories
        for observed in data[column].unique()
    ]

In [11]:
# Column(s) used as the rule outcome.
response = ["treatment"]
# Every remaining column is a candidate predictor.
predictor = [column for column in df.columns if column not in response]
# Number of predictor items on the left-hand side of each rule.
length = 2
# Support threshold applied when generating rules.
min_sup = 0.03

In [12]:
# Candidate outcome items: every observed (column, value) pair of the
# response column(s).
outcomes = get_category_value_tuples(df, response)

# For each predictor column keep only the items whose own support exceeds
# `min_sup`; an item that is rare by itself cannot form a frequent rule.
predictor_tuples = {}
for category in predictor:
    predictor_tuples[category] = [
        unit
        for unit in get_category_value_tuples(df, [category])
        if get_support(df, [unit]) > min_sup
    ]

rows = []
for predictor_combination in combinations(predictor, length):
    candidate_items = [predictor_tuples[category] for category in predictor_combination]

    for relationship in product(*candidate_items, outcomes):
        # Use a dedicated local name: rebinding the global `predictor` list
        # here would leave it holding tuples, so this cell could not be
        # re-run without first re-running the parameter cell.
        rule_predictors = list(relationship[0:length])
        outcome = relationship[length]

        try:
            support = get_support(df, list(relationship))
        except KeyError:
            # This combination of values never occurs in the data.
            continue

        if support < min_sup:
            continue

        row = {}
        for i in range(0, length):
            # str() keeps numeric values (e.g. ages) from raising TypeError
            # in join, which previously dropped such rules silently.
            row["Predictor_%s" % i] = ": ".join(str(part) for part in rule_predictors[i])

        row["Outcome"] = ": ".join(str(part) for part in outcome)
        row["Support"] = support
        row["Confidence"] = get_confidence(df, rule_predictors, [outcome])
        row["Lift"] = get_lift(df, rule_predictors + [outcome])
        rows.append(row)



In [13]:
# Collect the rule records into one DataFrame with a fixed column order:
# the predictor columns first, then the outcome and the three metrics.
table = pd.DataFrame(
    rows,
    columns=["Predictor_%d" % position for position in range(length)]
    + ["Outcome", "Support", "Confidence", "Lift"],
)
table

Unnamed: 0,Predictor_0,Predictor_1,Outcome,Support,Confidence,Lift
0,Gender: Female,Country: United States,treatment: Yes,0.102073,0.711111,1.721087
1,Gender: Female,Country: United States,treatment: No,0.041467,0.288889,0.712702
2,Gender: Male,Country: United States,treatment: Yes,0.218501,0.489286,0.921050
3,Gender: Male,Country: United States,treatment: No,0.228070,0.510714,0.979966
4,Gender: Male,Country: United Kingdom,treatment: Yes,0.057416,0.470588,0.983896
...,...,...,...,...,...,...
2228,mental_vs_physical: Don't know,obs_consequence: No,treatment: Yes,0.183413,0.439771,0.928565
2229,mental_vs_physical: Don't know,obs_consequence: No,treatment: No,0.233652,0.560229,1.205770
2230,mental_vs_physical: No,obs_consequence: No,treatment: Yes,0.097289,0.525862,0.836449
2231,mental_vs_physical: No,obs_consequence: No,treatment: No,0.087719,0.474138,0.768749


In [15]:
# Let pandas print up to 500 rows before truncating the display.
set_option('display.max_rows', 500)

# Rules that predict treatment, hold in more than half of the matching rows,
# and have lift above 1.
idx = (table.Outcome == 'treatment: Yes') & (table.Confidence > 0.5) & (table.Lift > 1)

# Rank on the exact confidence values and round only for display. Rounding
# first creates artificial ties that can change the order and which rows
# make the top 10.
table[idx].sort_values("Confidence", ascending=False).head(10).round(2)

Unnamed: 0,Predictor_0,Predictor_1,Outcome,Support,Confidence,Lift
685,work_interfere: Often,anonymity: Yes,treatment: Yes,0.04,0.96,2.07
641,work_interfere: Often,benefits: Yes,treatment: Yes,0.04,0.95,1.97
654,work_interfere: Often,care_options: Yes,treatment: Yes,0.05,0.95,2.32
624,work_interfere: Often,remote_work: Yes,treatment: Yes,0.04,0.92,2.13
449,family_history: Yes,work_interfere: Often,treatment: Yes,0.06,0.9,2.52
182,Country: United States,work_interfere: Often,treatment: Yes,0.06,0.9,1.7
19,Gender: Female,work_interfere: Rarely,treatment: Yes,0.03,0.89,2.27
450,family_history: Yes,work_interfere: Rarely,treatment: Yes,0.06,0.89,2.11
769,work_interfere: Often,mental_vs_physical: No,treatment: Yes,0.04,0.88,2.29
747,work_interfere: Often,mental_health_interview: No,treatment: Yes,0.08,0.87,1.7
