In [None]:
import itertools
import warnings

import numpy as np
import pandas as pd
from sklearn.metrics import r2_score

# from sklearn import LogisticRegression
warnings.filterwarnings('ignore')

# Logistic Regression model

Target variable: "Music effects"


Predictor variables:
* Hours per day
* While working
* Instrumentalist
* Composer
* Exploratory
* Foreign languages
* BPM

In [None]:
# Read in and prep data for model
# Keep only the seven predictors plus the target column.
predictors = ['Hours per day', 'While working', 'Instrumentalist', 'Composer', 'Exploratory', 'Foreign languages', 'BPM']
df = pd.read_csv('mxmh_survey_results.csv')
df = df[predictors + ['Music effects']]

# Collapse the target to binary: 'Improve' -> 1, 'No effect'/'Worsen' -> 0.
df['Music effects'] = df['Music effects'].replace({'No effect': 'Not improve', 'Worsen': 'Not improve'})
df['Music effects'] = df['Music effects'].map({'Improve': 1, 'Not improve': 0})

# Encode the Yes/No answers as 1/0; .map turns any other value into NaN.
yes_no = {'Yes': 1, 'No': 0}
for column in ['While working', 'Instrumentalist', 'Composer', 'Foreign languages', 'Exploratory']:
    df[column] = df[column].map(yes_no)

# Z-score the continuous predictors. Mean and std are computed before the
# rows containing NaN are dropped below.
for column in ['BPM', 'Hours per day']:
    df[column] = (df[column] - df[column].mean()) / df[column].std()

# Drop every row that still has a missing value in any kept column.
df = df.dropna()

In [None]:
# Preview the first rows of the prepared frame.
df.head()

Unnamed: 0,Hours per day,While working,Instrumentalist,Composer,Exploratory,Foreign languages,BPM,Music effects
2,0.141088,0.0,0.0,0.0,0,1.0,-0.039872,0.0
3,-0.354256,1.0,0.0,1.0,1,1.0,-0.039874,1.0
4,0.141088,1.0,0.0,0.0,1,0.0,-0.039873,1.0
5,0.471317,1.0,1.0,1.0,1,1.0,-0.039874,1.0
6,-0.189142,1.0,1.0,0.0,1,1.0,-0.039874,1.0


In [None]:
def logisticRegression(X, c, eta = 0.001, num = 500):
    """Fit sigmoid-model coefficients by gradient descent on the squared error.

    The model is p = 1 / (1 + exp(-(b0 + X @ b))) and the quantity being
    minimised is sum((c - p) ** 2).

    Parameters
    ----------
    X : pandas.DataFrame or array-like, shape (n_samples, n_features)
        Predictor values. When X has a ``columns`` attribute those names label
        the returned coefficients; otherwise the labels are "x0", "x1", ...
    c : pandas.Series or array-like, shape (n_samples,)
        Observed 0/1 outcomes.
    eta : float
        Initial step size. It is halved whenever a step would change the sign
        (as reported by ``np.sign``) of more than two coefficients.
    num : int
        Maximum number of gradient steps.

    Returns
    -------
    pandas.DataFrame
        One column named "betas", indexed by "Intercept" followed by the
        feature labels.
    """
    # Take the labels from X itself instead of a notebook-level frame, so the
    # function keeps working after that frame is rebound or when X has a
    # different set of columns.
    if hasattr(X, "columns"):
        feature_names = list(X.columns)
    else:
        feature_names = None
    X = np.asarray(X, dtype=float)
    c = np.asarray(c, dtype=float)
    if feature_names is None:
        feature_names = ["x%d" % j for j in range(X.shape[1])]

    beta = np.zeros(X.shape[1] + 1)
    # Leading column of ones so beta[0] acts as the intercept.
    X = np.column_stack((np.ones(X.shape[0]), X))

    for _ in range(num):
        zi = X @ beta
        p = 1 / (1 + np.exp(-zi))
        # Gradient of sum((c - p)**2) w.r.t. beta. dp/dz is written as
        # p * (1 - p), which equals p**2 * exp(-z) but cannot produce
        # 0 * inf = nan when exp(-z) overflows.
        dl = np.sum(-2 * (c - p) * p * (1 - p) * X.T, axis=1)
        # Step-size control: shrink eta if the proposed step flips the sign of
        # more than two coefficients.
        if np.sum(np.sign(beta) != np.sign(beta - eta * dl)) > 2:
            eta /= 2
        beta = beta - eta * dl
        # Stop once the squared gradient norm is negligible.
        if np.sum(dl**2) < 0.00001:
            break
    return pd.DataFrame(beta, index=["Intercept"] + feature_names, columns=["betas"])

def accuracy(y_true, y_pred):
    """Return the fraction of positions at which y_true equals y_pred."""
    matches = y_true == y_pred
    return np.mean(matches)

In [None]:
# 50/50 train/test split. The fixed seed makes the split, and therefore the
# accuracy reported below, reproducible across kernel restarts.
train = df.sample(frac=0.5, random_state=42)
test = df.drop(train.index)

X_train = train.drop('Music effects', axis=1)
y_train = train['Music effects']

X_test = test.drop('Music effects', axis=1)
XTestIndex = X_test.index
y_test = test['Music effects']

# Prepend a column of ones so X_test lines up with the intercept-first
# coefficient table returned by logisticRegression.
onesTest = np.ones((X_test.shape[0], 1))
X_test = np.concatenate((onesTest, X_test), axis=1)

betas = logisticRegression(X_train, y_train)

In [None]:
# Linear scores for the test rows, then the sigmoid to turn them into probabilities.
probabilities = X_test @ betas
probabilities = 1 / (1 + np.exp(-probabilities))
# Relative frequency of each label in the training target.
trainingvc = y_train.value_counts(normalize=True)
# Predict 1 where the probability reaches the training share stored at trainingvc[0].
# NOTE(review): presumably trainingvc[0] is a label lookup (share of label 0), not 0.5 —
# confirm this is the intended decision threshold.
predictions = np.where(probabilities >= trainingvc[0], 1, 0)

In [None]:
# Fraction of test rows whose thresholded prediction equals the true label.
accuracy(y_test, predictions.T[0])

0.7669902912621359

In [None]:
# Predictors and target taken from every row of the prepared frame (no hold-out).
X = df.drop('Music effects', axis=1)
y = df['Music effects']

In [None]:
# Refit on the full data and display the coefficient table.
betas = logisticRegression(X, y)
betas

Unnamed: 0,betas
Intercept,0.210048
Hours per day,-0.02906
While working,0.603468
Instrumentalist,0.294675
Composer,0.214645
Exploratory,0.583201
Foreign languages,-0.136969
BPM,-0.171459


# Mining Association Rules

Finding which combinations of listened-to music genres are associated with reported mental health conditions

Minimum Support: 0.15

Minimum Confidence: 0.4

Using Apriori to find frequent itemsets and generate association rules such that music genres are on the left side and mental health conditions are on the right side

In [None]:
# Reload the raw survey with all columns; this rebinds `df`, replacing the
# prepared modelling frame used in the logistic-regression section.
df = pd.read_csv('mxmh_survey_results.csv')

FileNotFoundError: ignored

In [None]:
# Data Cleaning
# Columns at positions 12..26: 0 for "Never" or "Rarely", 1 for anything else
# (missing values also become 1, since they are not in the list).
# NOTE(review): columns are selected by position — confirm both slices still
# match the CSV layout and cover every intended column.
frequency_cols = df.columns[12:27]
for col in frequency_cols:
    rarely_or_never = df[col].isin(["Never", "Rarely"])
    df[col] = np.where(rarely_or_never, 0, 1)

# Columns at positions 27..30: 1 when the value is 7 or higher, otherwise 0
# (missing values become 0 because the comparison is False for NaN).
score_cols = df.columns[27:31]
for col in score_cols:
    df[col] = np.where(df[col] >= 7, 1, 0)

df.head()

In [None]:
# All functions
def getSubsets(s, n):
    """Return every length-n combination of the items in s as a list of tuples.

    Combinations are produced by ``itertools.combinations``, so each tuple keeps
    the relative order the items have in ``s``. ``itertools`` must be imported
    in the notebook's import cell.
    """
    return list(itertools.combinations(s, n))

def freqItems(data, items, minSup):
    """Return {item: support} for every item whose support is at least minSup.

    Support is the sum of the item's column in ``data`` divided by the number
    of rows. Items appear in the result in the order they are given.
    """
    n_rows = len(data.index)
    supports = ((name, data[name].sum() / n_rows) for name in items)
    return {name: sup for name, sup in supports if sup >= minSup}

def skyline(itemset, freq_itemsets):
    """Return True when no entry of freq_itemsets strictly contains itemset.

    Parameters
    ----------
    itemset : hashable item or tuple/list/set of items
        Anything that is not a tuple, list, set or frozenset is treated as a
        single item. In particular a string such as a column name is one item,
        not a collection of characters.
    freq_itemsets : iterable
        Itemsets to compare against (iterating a dict yields its keys).

    Returns
    -------
    bool
        False if some other itemset is a proper superset of ``itemset``,
        otherwise True.
    """
    def as_item_set(value):
        # Collections become the set of their members; everything else is
        # wrapped as a one-element set.
        if isinstance(value, (tuple, list, set, frozenset)):
            return set(value)
        return {value}

    members = as_item_set(itemset)
    # Proper-subset comparison ignores element order and never counts an
    # itemset as being dominated by itself.
    return not any(members < as_item_set(other) for other in freq_itemsets)

def candidateGen(F, k):
    """Generate candidate (k+1)-itemsets from the frequent k-itemsets in F.

    Parameters
    ----------
    F : iterable
        Frequent itemsets of size k (iterating a dict yields its keys). For
        k == 1 the entries are single items; for k >= 2 they are tuples.
    k : int
        Size of the itemsets in F.

    Returns
    -------
    set of tuple
        Candidate itemsets of size k + 1. Every candidate is emitted as a
        sorted tuple so the same itemset always has the same representation,
        and is kept only if all of its k-item subsets are present in F.
    """
    def canonical(items):
        # A single, order-independent tuple form for a collection of items.
        try:
            return tuple(sorted(items))
        except TypeError:
            # Items that cannot be compared with each other: order by repr.
            return tuple(sorted(items, key=repr))

    frequent = tuple(F)
    C = set()

    if k == 1:
        # Every pair of distinct frequent items is a candidate; both members
        # are in F by construction, so no further subset check is needed.
        for i in range(len(frequent)):
            for j in range(i + 1, len(frequent)):
                if frequent[i] != frequent[j]:
                    C.add(canonical((frequent[i], frequent[j])))
        return C

    # Compare itemsets as frozensets so the element order of the keys in F
    # cannot affect joins or subset lookups.
    level = [frozenset(key) for key in frequent if len(key) == k]
    known = set(level)
    for i in range(len(level)):
        for j in range(i + 1, len(level)):
            union = level[i] | level[j]
            # Join only itemsets that differ in exactly one item.
            if len(union) != k + 1:
                continue
            # Downward closure: dropping any one item must give a known
            # frequent k-itemset.
            if all(union - {item} in known for item in union):
                C.add(canonical(union))
    return C

def Apriori(T, I, minSup):
    """Level-wise search for frequent itemsets in a 0/1 DataFrame.

    Parameters
    ----------
    T : pandas.DataFrame
        One row per transaction; an item is present in a row when its column
        equals 1.
    I : iterable
        Column labels to use as items.
    minSup : float
        Minimum support (fraction of rows) an itemset needs to be kept.

    Returns
    -------
    list of dict
        F[0] maps single items to their support and F[k-1] maps k-item tuples
        to their support. After each new level is found, entries of the
        previous level for which ``skyline`` returns False against the new
        level are removed.
    """
    n_rows = len(T)
    F = [freqItems(T, I, minSup)]
    k = 2
    while True:
        # Compute each candidate's support in a single pass over the
        # candidates, without re-materialising the candidate set per index.
        frequent_k = {}
        for candidate in candidateGen(F[k-2], k-1):
            count = (T[list(candidate)] == 1).all(axis=1).sum()
            sup = count / n_rows
            if sup >= minSup:
                frequent_k[candidate] = sup

        # No frequent itemset of size k: the search is finished.
        if not frequent_k:
            break
        F.append(frequent_k)

        # Drop previous-level itemsets that skyline reports as covered by the
        # new level. Iterate over a copy because the dict is mutated.
        for item in list(F[k-2]):
            if not skyline(item, frequent_k):
                F[k-2].pop(item)

        k += 1
    return F

def GenRules(F, minConf, T):
    """Build rule tuples from the multi-item entries of F.

    For every tuple-keyed itemset in F and every item in it, the ratio
    support(itemset) / support(item) is computed, where support(item) is the
    item's column sum in T divided by the number of rows. That ratio is the
    confidence of the rule "item -> remaining items". When it is at least
    minConf, the tuple (remaining_items, item, confidence, support(itemset))
    is appended to the result.

    Entries of F whose key is not a tuple are ignored.
    """
    n_rows = len(T)
    rules = []
    for level in F:
        for itemset, support in level.items():
            if not isinstance(itemset, tuple):
                continue
            for pivot in itemset:
                confidence = support / (T[pivot].sum() / n_rows)
                if confidence >= minConf:
                    others = tuple(member for member in itemset if member != pivot)
                    rules.append((others, pivot, confidence, support))
    return rules

In [None]:
# Obtaining frequent itemsets and association rules
# Items are the columns at positions 12..30; 0.15 is passed as the minimum
# support and 0.4 as the minimum confidence.
freq_sets = Apriori(df, df.columns[12:31], 0.15)
ars = GenRules(freq_sets, 0.4, df)

In [None]:
# Output frequent itemsets
# One [itemset, support] row per entry, flattened across every level in freq_sets.
data = [[itemset, level.get(itemset)] for level in freq_sets for itemset in level]

column_names = ['Item(s)', 'Support']
freq_df = pd.DataFrame(data, columns=column_names)
freq_df

In [None]:
# Output association rules prior to filtering
# Each rule tuple is laid out positionally as one table row:
# element 0 -> 'Left Side', 1 -> 'Right Side', 2 -> 'Confidence', 3 -> 'Support'.
data = [[ar[0], ar[1], ar[2], ar[3]] for ar in ars]

column_names = ['Left Side', 'Right Side', 'Confidence', 'Support']
final_df = pd.DataFrame(data, columns=column_names)
final_df

In [None]:
# Filtering association rules
# NOTE(review): GenRules computes confidence as support(itemset) / support(single item),
# i.e. for the rule "single item -> remaining items", and that single item is what
# ends up in 'Right Side'. The column labels therefore look swapped relative to the
# rule direction — confirm before renaming anything, since this filter depends on it.
conditions = ["Depression", "Anxiety", "Insomnia", "OCD"]

# Keep rows whose 'Right Side' item is not one of the conditions ...
right_is_condition = final_df['Right Side'].isin(conditions)
final_df = final_df[~right_is_condition]

# ... and whose 'Left Side' contains at least one of the conditions.
left_has_condition = final_df['Left Side'].apply(lambda side: any(name in side for name in conditions))
final_df = final_df[left_has_condition]
final_df