# Joe Zoll
# 30 Days of Meditation - Naive Bayes Classifier (Multinomial)
## Intention: Classify meditation instances as > or < 20 minutes based on the frequency of certain types of meditation (i.e. Mindfulness of breathing, Metta, Body scanning, etc.)

In [5]:
import pandas as pd

In [6]:
df = pd.read_csv('data/meditation-sit-log.csv')
meditations = df.copy()

In [7]:
df

Unnamed: 0,Name,Date & Time ⏰,Tags,Length (Minutes),Guided
0,Metta Return 1/7,"October 4, 2022 11:02 AM","Body / Grounding Awareness, Metta",40.0,
1,Quick in car,"October 3, 2022 12:13 PM","Body / Grounding Awareness, Mindfulness of Bre...",5.0,
2,What do I need to change for this semester?,"October 2, 2022 9:47 PM",Contemplation,30.0,
3,Cloudy,"October 2, 2022 3:44 PM",,11.0,
4,Calm cleaning,"October 1, 2022 2:17 PM",Body / Grounding Awareness,17.0,
...,...,...,...,...,...
115,First SP Metta Session | Opening up... somewhe...,"July 8, 2021 2:32 PM","MIDL 03/52, MIDL Metta Loved One",47.0,
116,Sitting With Pain,"July 7, 2021 3:58 PM","Doing Nothing, Stillness",51.0,
117,More Courageous Meditation. Not mine,"July 1, 2021 3:34 PM","MIDL 03/52, MIDL Forgiveness",45.0,
118,The Bravest Meditation Of My Life,"June 29, 2021 3:04 PM","MIDL 03/52, MIDL Forgiveness",65.0,


# Data Cleaning
- Only take meditation sits from September 6, 2022 => October 6, 2022
- remove Date & Time
- remove Name
- remove Guided

- Rename Length (Minutes) => >20 minutes and set values to boolean
- Rename Tags => Practice
- fill nan values

In [8]:
meditations = meditations.loc[:22]

In [9]:
meditations = meditations.drop(['Name', 'Date & Time ⏰', 'Guided'], axis=1)

In [10]:
meditations = meditations.rename(columns={'Length (Minutes)': '+20min'})

In [11]:
meditations = meditations.rename(columns={'Tags': 'practice'})

In [12]:
meditations['+20min'] = meditations['+20min'] >= 20

In [13]:
meditations.head(3)

Unnamed: 0,practice,+20min
0,"Body / Grounding Awareness, Metta",True
1,"Body / Grounding Awareness, Mindfulness of Bre...",False
2,Contemplation,True


practice => dict of counts for all 6 meditation practices => mapped to columns in meditations df

In [14]:
def countPracticeTypes(practiceStr, count):
    currPractices = practiceStr.split(',')
    for practice in currPractices:
        if practice in count:
            count[practice] += 1
        else:
            count[practice] = 1
    return count

In [15]:
def cleanPracticeStr(practiceStr):
    return practiceStr.split(',')[0]

In [16]:
meditations['practice'] = meditations['practice'].fillna(value='Body / Grounding Awareness')

# Frequencies => Features
- Clean practice string values (limit to 1 type of practice per instance)
- For each practice, count the frequencies of each time it appears in the dataset and make it's count into its own column

In [17]:
practices = set(meditations['practice'].apply(cleanPracticeStr).unique())
practices.add('Metta')
practices

{'Body / Grounding Awareness',
 'Contemplation',
 'Doing Nothing',
 'Metta',
 'Mindfulness of Breathing',
 'Mindfulness of Fingers Touching',
 'Stillness'}

In [18]:
# I want to add new columns to the DF, all with a value == 0
meditations[list(practices)] = 0
meditations.head(1)

Unnamed: 0,practice,+20min,Mindfulness of Breathing,Contemplation,Doing Nothing,Mindfulness of Fingers Touching,Stillness,Body / Grounding Awareness,Metta
0,"Body / Grounding Awareness, Metta",True,0,0,0,0,0,0,0


In [19]:
# for each row
#    split practiceStr
#    for each str in split:
#        increment corresponding column
for index, row in meditations.iterrows():
    practiceStr = row['practice']
    currPractices = practiceStr.split(',')
    for p in currPractices:
        p = p.strip()
        if p in practices:
            meditations[p][index] += 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meditations[p][index] += 1


In [20]:
meditations.insert(len(meditations.columns)-1, '+20min', meditations.pop('+20min'))

In [21]:
meditations.head(7)

Unnamed: 0,practice,Mindfulness of Breathing,Contemplation,Doing Nothing,Mindfulness of Fingers Touching,Stillness,Body / Grounding Awareness,Metta,+20min
0,"Body / Grounding Awareness, Metta",0,0,0,0,0,1,1,True
1,"Body / Grounding Awareness, Mindfulness of Bre...",1,0,0,0,0,1,0,False
2,Contemplation,0,1,0,0,0,0,0,True
3,Body / Grounding Awareness,0,0,0,0,0,1,0,False
4,Body / Grounding Awareness,0,0,0,0,0,1,0,False
5,Body / Grounding Awareness,0,0,0,0,0,1,0,False
6,Mindfulness of Breathing,1,0,0,0,0,0,0,True


In [22]:
meditations = meditations.drop(['practice'], axis=1)

# So now that I have the frequency columns for every type of meditation IN every sit I have done for the past 30 days, I now can proceed with the Naive Bayes Classifier, constructing it, and then inputting some instance to check and see if it worked.

# NEXT TIME, we do the math :D

### Notes
- This is not a great model for NB classifier, as there can never be a sit with a practice type that occurs 2 times
- What is the best classifier for when features are True / False?

# Naive Bayes Classification

- get conditional probabilities for T & F Class Value (2 sets for each value * 7 features => 14 probabilities)


Question
- How do we ask questions, use the answers to generate a new instance, and then put that new instance into the dataframe AND classify it as a long or short sit?

### Getting Training and Testing Sets

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
X, y = meditations.drop(['+20min'], axis=1), meditations['+20min']

In [25]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [26]:
# init
sample = {'Metta': 1,
 'Mindfulness of Fingers Touching': 0,
 'Body / Grounding Awareness': 1,
 'Stillness': 0,
 'Contemplation': 0,
 'Mindfulness of Breathing': 0,
 'Doing Nothing': 0}

numSits = len(x_train)
longSits = x_train.loc[meditations['+20min'] == True]
shortSits = x_train.loc[meditations['+20min'] != True]

num_longSits = len(longSits)
num_shortSits = numSits - len(longSits)

# Probabitlities

prior_long = num_longSits / numSits
prior_short = num_shortSits / numSits

In [27]:
sample

{'Metta': 1,
 'Mindfulness of Fingers Touching': 0,
 'Body / Grounding Awareness': 1,
 'Stillness': 0,
 'Contemplation': 0,
 'Mindfulness of Breathing': 0,
 'Doing Nothing': 0}

In [28]:
len(longSits.loc[longSits['Metta'] == 1])
num_longSits

15

Now I want to get all the likelihoods for this single instance 'sample'

In [29]:

# for each p in sample_meds:
#    compute likelihood
#    include likelihood in samplesample_meds

#for p in sample_meds:
sample_meds
    # compue likelihood
    

NameError: name 'sample_meds' is not defined

In [None]:
sample_meds = [key for (key, value) in sample.items() if value == 1]


# P(True) * P(Metta | True) * P(GA | True)
p1 = prior_long
for p in sample_meds:
    p1 *= (len(longSits.loc[longSits[p] == 1]) / num_longSits)
print(p1)
# P(False) * P(Metta | False) * P(GA | False)
p2 = prior_short
for p in sample_meds:
    p2 *= (len(shortSits.loc[shortSits[p] == 1]) / num_shortSits)
print(p2)

Now, compare the 2 probabitlities and classify!

In [30]:
sample['+20min'] = True if p1 >= p2 else False

NameError: name 'p1' is not defined

In [31]:
meditations.tail(2)

Unnamed: 0,Mindfulness of Breathing,Contemplation,Doing Nothing,Mindfulness of Fingers Touching,Stillness,Body / Grounding Awareness,Metta,+20min
21,0,1,0,0,0,0,0,True
22,0,1,0,0,0,0,0,True


In [32]:
meditations = meditations.append(sample, ignore_index=True)
meditations.tail(3)

  meditations = meditations.append(sample, ignore_index=True)


Unnamed: 0,Mindfulness of Breathing,Contemplation,Doing Nothing,Mindfulness of Fingers Touching,Stillness,Body / Grounding Awareness,Metta,+20min
21,0,1,0,0,0,0,0,True
22,0,1,0,0,0,0,0,True
23,0,0,0,0,0,1,1,


# Now that I have successfully classified a single instance using Bayes theorem, I now shall scale up to being able to classify any instance that I pass through, which is easy.

# After that, I want to test the accuracy of my new model using the training and testing data.

# <font color=green>*how do I evaluate the accuracy manually? In sklearn? What is the visionary output?*</font>

# Joe's Meditation Classifier App:
### "What type of meditation did you practice today?" ____ "You practice +/- 20 minutes during that sit!"

In [51]:
sit_dict = {'Metta': 0,
 'Mindfulness of Fingers Touching': 0,
 'Body / Grounding Awareness': 0,
 'Stillness': 0,
 'Contemplation': 0,
 'Mindfulness of Breathing': 0,
 'Doing Nothing': 0}
meditation_types = [key for (key, value) in start_dict.items()]

In [54]:
for prac in meditation_types:
    ans = input(f"Did you practice {prac}? \t[y/n]: ")
    if ans == 'y': sit_dict[prac] = 1
start_dict

Did you practice Metta? 	[y/n]: y
Did you practice Mindfulness of Fingers Touching? 	[y/n]: y
Did you practice Body / Grounding Awareness? 	[y/n]: y
Did you practice Stillness? 	[y/n]: y
Did you practice Contemplation? 	[y/n]: y
Did you practice Mindfulness of Breathing? 	[y/n]: y
Did you practice Doing Nothing? 	[y/n]: y


{'Metta': 1,
 'Mindfulness of Fingers Touching': 1,
 'Body / Grounding Awareness': 1,
 'Stillness': 1,
 'Contemplation': 1,
 'Mindfulness of Breathing': 1,
 'Doing Nothing': 1}

In [62]:
x = longSits.loc[longSits['Contemplation'] == 1]
print(len(x))
x

5


Unnamed: 0,Mindfulness of Breathing,Contemplation,Doing Nothing,Mindfulness of Fingers Touching,Stillness,Body / Grounding Awareness,Metta
21,0,1,0,0,0,0,0
13,0,1,0,0,0,1,0
2,0,1,0,0,0,0,0
18,0,1,0,0,0,0,0
22,0,1,0,0,0,0,0


In [64]:
used_techniques = [key for (key, value) in sit_dict.items() if value == 1]
alpha = 1 # for ensuring no 0 value probabilities

# P(True) * P(Metta | True) * P(GA | True)
p1 = prior_long
for p in used_techniques:
    print(p1)
    p1 *= ((len(longSits.loc[longSits[p] == 1]) + alpha) / num_longSits)
print(p1)
# P(False) * P(Metta | False) * P(GA | False)
print()
p2 = prior_short
for p in used_techniques:
    print(p2)
    p2 *= ((len(shortSits.loc[shortSits[p] == 1]) + alpha) / num_shortSits)
print(p2)

0.8333333333333334
0.11111111111111112
0.007407407407407408
0.0019753086419753087
0.0005267489711934157
0.0002106995884773663
9.832647462277094e-05
1.311019661636946e-05

0.16666666666666666
0.05555555555555555
0.037037037037037035
0.037037037037037035
0.012345679012345678
0.004115226337448559
0.001371742112482853
0.0004572473708276176


In [65]:
p1, p2

(1.311019661636946e-05, 0.0004572473708276176)

In [66]:
sit_classification = 'long' if p1 > p2 else 'short'
sit_dict['+20min'] = p1 > p2

In [67]:
sit_dict

{'Metta': 1,
 'Mindfulness of Fingers Touching': 1,
 'Body / Grounding Awareness': 1,
 'Stillness': 1,
 'Contemplation': 1,
 'Mindfulness of Breathing': 1,
 'Doing Nothing': 1,
 '+20min': False}

In [68]:
print(f'Based on Joe\'s recently recorded meditation sits, this sit is classified/guessed to be a {sit_classification} sit.')

Based on Joe's recently recorded meditation sits, this sit is classified/guessed to be a short sit.


# App complete! Now we would like to test our model to ensure a good accuracy!

### This will either be in sklearn or via a manual derivation for DIRECT learning practice

# Other

In [None]:
x = countPracticeTypes(meditations['practice'][0])
x
df_count = pd.DataFrame(x, index=['i',])
#meditations['practice'][:1].apply(countPracticeTypes)