## Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

# Spacy
import spacy

# Sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split

# NLTK
from nltk.corpus import stopwords

# Silence Future Warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Read-In Data

In [3]:
# Load the three preprocessed datasets and print each frame's (rows, columns).
# NOTE(review): only the first read passes lineterminator='\n'; the other two
# use the pandas default — confirm this difference is intentional.
health = pd.read_csv('../data/womens_health_preprocessed.csv', lineterminator='\n')
print(f'Health: {health.shape}')
obsgyn = pd.read_csv('../data/fertility_and_pregnancy_preprocessed.csv')
print(f'ObsGyn: {obsgyn.shape}')
pospar = pd.read_csv('../data/postpartum_preprocessed.csv')
print(f'Pospar: {pospar.shape}')

Health: (30616, 7)
ObsGyn: (92943, 7)
Pospar: (49094, 7)


## Stop Words

In [4]:
def add_stop_words(word_list, list_stop_words):
    """Add every word in ``word_list`` to the set ``list_stop_words``.

    The set is modified in place; nothing is returned.
    """
    list_stop_words.update(word_list)
        
def remove_stop_words(word_list, list_stop_words):
    """Remove every word in ``word_list`` from the set ``list_stop_words``.

    The set is modified in place; nothing is returned. A word that is not
    currently in the set raises ``KeyError``.
    """
    for unwanted_word in word_list:
        list_stop_words.remove(unwanted_word)

In [5]:
# spaCy: default stop words of the small English pipeline
nlp = spacy.load('en_core_web_sm')
stop_words_spacy = set(nlp.Defaults.stop_words)

# nltk: English stop-word list from the NLTK stopwords corpus
stop_words_nltk = set(stopwords.words('english'))

# full list of stop words: union of the spaCy and NLTK sets
full_stop_words = stop_words_spacy.union(stop_words_nltk)

# Extra words to exclude on top of the two library lists
words_to_add = ['like', 'know', 'want', 'feel', 'going', 'think', 'reddit', 'imgur', 'pron', 'officially', 'story', 'month', 'week', 'time', 'day', 'year', 'delete', 'saturday', 'sunday', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'subreddit']

# Mutate full_stop_words in place: add the custom words, then take 'not' out
# so it stays in the text (presumably to preserve negation — confirm intent).
add_stop_words(words_to_add, full_stop_words)
remove_stop_words(['not'], full_stop_words)

## Create Tfidf Vectorizer

In [6]:
def build_tvec(df, column, stop_words = full_stop_words):
    """Fit a TF-IDF vectorizer on one text column of a DataFrame.

    Parameters
    ----------
    df : DataFrame holding the documents.
    column : name of the column in ``df`` whose values are the documents.
    stop_words : collection of words to ignore, the string ``'english'``,
        or ``None``. Defaults to the module-level ``full_stop_words`` set.

    Returns
    -------
    (tvec, dtm) : the fitted ``TfidfVectorizer`` and the document-term
        matrix produced by ``fit_transform`` on ``df[column]``.
    """
    # Recent scikit-learn releases validate ``stop_words`` and only accept a
    # list (or 'english' / None), so a set such as ``full_stop_words`` is
    # converted here. Older releases accept a list just as well.
    if stop_words is not None and not isinstance(stop_words, str):
        stop_words = list(stop_words)

    # Unigrams and bigrams; terms in more than 98% or fewer than 0.5% of the
    # documents are dropped.
    tvec = TfidfVectorizer(max_df = 0.98, min_df = 0.005, ngram_range = (1,2), stop_words = stop_words)
    dtm = tvec.fit_transform(df[column])
    return tvec, dtm

In [7]:
# build_tvec returns (fitted vectorizer, document-term matrix) for each
# dataset, using the 'lemma_text' column as the document text.

health_vectorizer, health_dtm = build_tvec(health, 'lemma_text')
obsgyn_vectorizer, obsgyn_dtm = build_tvec(obsgyn, 'lemma_text')
pospar_vectorizer, pospar_dtm = build_tvec(pospar, 'lemma_text')

In [8]:
def _vocabulary_terms(vectorizer):
    """Return the fitted vectorizer's feature names as a list of strings.

    ``get_feature_names`` was removed from newer scikit-learn releases in
    favour of ``get_feature_names_out``; the new method is used when it
    exists, with a fallback to the old one for older installations.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        # get_feature_names_out returns an ndarray; keep the list type the
        # old method produced.
        return vectorizer.get_feature_names_out().tolist()
    return vectorizer.get_feature_names()


# Column index -> term lookup for each document-term matrix.
health_terms = _vocabulary_terms(health_vectorizer)
obsgyn_terms = _vocabulary_terms(obsgyn_vectorizer)
pospar_terms = _vocabulary_terms(pospar_vectorizer)

## K-Means Clusters

In [9]:
def grid_search_kmeans(dtm, k):
    """Cluster ``dtm`` into ``k`` groups with K-Means and score the result.

    Parameters
    ----------
    dtm : feature matrix to cluster (one row per document).
    k : number of clusters.

    Returns
    -------
    The silhouette score of the resulting cluster assignment.
    """
    # n_jobs is not passed: the argument was deprecated and later removed
    # from KMeans, so passing it fails on current scikit-learn releases.
    kmeans = KMeans(n_clusters = k, random_state = 42)

    # fit_predict fits the model and returns the labels in one pass; the
    # original also called fit() first, which clustered the data twice.
    preds = kmeans.fit_predict(dtm)

    return silhouette_score(dtm, preds)

In [10]:
def build_kmeans(dtm, k):
    """Fit and return a K-Means model with ``k`` clusters on ``dtm``.

    Parameters
    ----------
    dtm : feature matrix to cluster (one row per document).
    k : number of clusters.

    Returns
    -------
    The fitted ``KMeans`` estimator.
    """
    # n_jobs is not passed: the argument was deprecated and later removed
    # from KMeans, so passing it fails on current scikit-learn releases.
    # random_state is fixed so repeated runs give the same clusters.
    kmeans = KMeans(n_clusters = k, random_state = 42)
    kmeans.fit(dtm)

    return kmeans

In [11]:
def plot_gs_results(start, stop, step, list_of_scores, title = None, x_label = None, y_label = None):
    """Plot a list of scores against ``range(start, stop, step)``.

    Draws a single line on a new 12x8 figure, with optional title and axis
    labels. Nothing is returned.
    """
    plt.figure(figsize = (12, 8))

    # One x position per score: the values that were searched over.
    searched_values = range(start, stop, step)
    sns.lineplot(
        x = searched_values,
        y = list_of_scores,
        linewidth = 2,
        color = '#B392AC',
    )

    plt.title(title, fontdict = {'fontsize': 15}, pad = 8)
    plt.xlabel(x_label, fontdict = {'fontsize': 12}, labelpad = 8)
    plt.ylabel(y_label, fontdict = {'fontsize': 12}, labelpad = 8)
    plt.tight_layout()

In [12]:
def get_topic_terms_kmeans(model, k, terms, n_terms = 15):
    """Map each cluster number to its highest-weighted terms.

    Code modified from https://pythonprogramminglanguage.com/kmeans-text-clustering/

    Parameters
    ----------
    model : fitted clustering model exposing ``cluster_centers_``.
    k : number of clusters to report (clusters ``0`` to ``k - 1``).
    terms : sequence mapping a feature column index to its term.
    n_terms : how many top terms to keep per cluster.

    Returns
    -------
    dict of ``{cluster number: [term, ...]}``, terms ordered from the
    largest centroid weight downwards.
    """
    # Column indices of each centroid sorted by descending weight, cut to
    # the first n_terms columns.
    top_columns = model.cluster_centers_.argsort()[:, ::-1][:, :n_terms]

    return {
        cluster: [terms[column] for column in top_columns[cluster]]
        for cluster in range(k)
    }

### General Women's Health Data

In [14]:
# Fit K-Means with 17 clusters on the women's-health TF-IDF matrix.
health_kmeans = build_kmeans(health_dtm, 17)

In [15]:
# Store the fitted model's cluster label for each row as a new column.
health['kmeans_cluster'] = health_kmeans.labels_

In [16]:
# Top 15 terms per cluster, keyed by cluster number (k matches the 17 clusters fitted).
health_kmeans_topics = get_topic_terms_kmeans(health_kmeans, 17, health_terms, n_terms = 15)

### Exploration of Clusters

#### Clusters Related to Health Concerns
Cluster Number| Name| Top 5 Terms
-|-|-
0| Intrauterine Devices (IUDs)|IUD, period, copper, copper IUD, Mirena
1|Late Periods/ Pregnancy|period, late, test, pregnancy, pregnant
2|Infections (Yeast and Bacterial Vaginosis)|yeast, infection, yeast infection, discharge, BV
4|Vaginal Symptoms|smell, discharge, vagina, odor, shower
5|Periods and Bleeding|bleed, period, blood, bleeding, brown
7|Gynecological Exams|pap, smear, pap smear, hpv, abnormal
9|Ovarian Cysts|cyst, ovarian, ovarian cyst, pain, ovary
10|Birth Control Pills| bc, birth, birth control, bleed, control
12| Breast Health| breast, bra, boob, nipple, lump
13|Infections (Urinary Tract Infection)|sex, uti, vagina, pee, doctor
14| Period Pain| pain, period, cramp, doctor, bad
15|Birth Control Pills| birth, birth control, control, control pill, doctor

#### Clusters Not Related to Health Concerns
Cluster Number| Name| Top 5 Terms
-|-|-
3| Hair and Depilation| hair, shave, grow, look, dye
6| Unknown| guy, work, people, job, thing
8| Unknown| help, look, try, good, woman
11| Friendship| friend, good friend, talk, people, guy
16| Clothing and Appearance| wear, dress, look, jean, makeup


### Fertility and Pregnancy Data

In [19]:
# Fit K-Means with 15 clusters on the fertility-and-pregnancy TF-IDF matrix.
obsgyn_kmeans = build_kmeans(obsgyn_dtm, 15)

In [20]:
# Top terms per cluster (default n_terms), keyed by cluster number 0-14.
obsgyn_kmeans_topics = get_topic_terms_kmeans(obsgyn_kmeans, 15, obsgyn_terms)

<h4 align = center> Clusters Related to Health Concerns </h4>

Cluster Number| Name| Top 5 Terms
-|-|-
0| Sleep| pain, sleep, night, bad, wake
2| Ovulation| cycle, CD (cycle day), OPK (ovulation predictor kit), ovulation, ovulate
3| Pregnancy Tests| test, positive, pregnancy test, negative, pregnancy
4| Weight During Pregnancy| eat, weight, food, gain, pregnancy
5| Obstetrics Appointments| ultrasound, doctor, appointment, scan, blood
6| Period and Pregnancy| period, test, start, pregnant, late
9| Fetal Movement| kick, movement, baby, baby kick, little
10| Labor| labor, contraction, hour, epidural, CM (centimeters)
14| Pregnancy Status| pregnant, pregnancy, find, try, find pregnant

<h4 align = center> Clusters Not Related to Health Concerns </h4>

Cluster Number| Name| Top 5 Terms
-|-|-
1| Unknown| pregnancy, help, try, look, start
7| Baby Shower| shower, baby shower, baby, gift, friend
8| Work and Pregnancy| work, job, leave, home, maternity
11| Purchases for Baby| baby, buy, need, thing, seat
12| Gender Reveal| boy, girl, gender, baby, find
13| Pregnancy Announcement| tell, family, baby, husband, friend

### Postpartum Data

In [13]:
# Fit K-Means with 10 clusters on the postpartum TF-IDF matrix.
# NOTE(review): the cluster tables for this dataset list clusters 0-13, i.e.
# 14 clusters — confirm whether k should be 14 or the tables are stale.
pospar_kmeans_10 = build_kmeans(pospar_dtm, 10)

In [14]:
# Top terms per cluster for the 10-cluster postpartum model.
# NOTE(review): the variable name says 14 but the model and k used here are
# 10 — confirm which is intended.
pospar_kmeans_topics_14 = get_topic_terms_kmeans(pospar_kmeans_10, 10, pospar_terms)

#### Clusters Related to Health Concerns
Cluster Number| Name| Top 5 Terms
-|-|-
0| Breastfeeding (Latching)| nipple, latch, shield, nipple shield, feed
2| Obstetrics Appointments| ultrasound, doctor, baby, scan, appointment
3| Pain During Pregnancy| pain, bad, hurt, experience, help
4| Baby's Sleep Schedule| sleep, night, wake, hour, feed
5| Pregnancy Experience| pregnancy, pregnant, eat, weight, gain
6| Breastfeeding (Pumping)| pump, supply, work, oz (ounces), breast
7| Pregnancy Tests| test, period, positive, pregnancy, pregnancy test
8| Breastfeeding (Random)| milk, feed, bottle, breast, pump
9| Breastfeeding (Random)| breastfeed, nurse, wean, old, nursing
10| Labor| contraction, labor, CM (centimeters), epidural, push



#### Clusters Not Related to Health Concerns
Cluster Number| Name| Top 5 Terms
-|-|-
1| Unknown| baby, birth, help, look, try
11| Maternity Wear| maternity, bra, wear, clothe, nursing
12| Pregnancy Announcement| baby, tell, husband, family, work
13| Baby Shower| baby, shower, baby shower, buy, registry

## Hierarchical Clusters

#### Reduce Size of Data

In [9]:
# Split each TF-IDF matrix 60/40 to get a smaller sample for hierarchical
# clustering. random_state is fixed (matching the K-Means cells) so the same
# rows are drawn on every run; without it each run samples different rows.
# Only the matrices are split, so the sampled rows are no longer aligned
# with the rows of the source DataFrames.
health_sample_1, health_sample_2 = train_test_split(health_dtm, test_size = 0.4, random_state = 42)
obsgyn_sample_1, obsgyn_sample_2 = train_test_split(obsgyn_dtm, test_size = 0.4, random_state = 42)
pospar_sample_1, pospar_sample_2 = train_test_split(pospar_dtm, test_size = 0.4, random_state = 42)

In [10]:
# Inspect the (rows, features) size of the 60% fertility-and-pregnancy sample.
obsgyn_sample_1.shape

(55765, 1565)

In [12]:
# for clustering
from scipy.cluster.hierarchy import ward, dendrogram, fcluster, single, complete
from scipy.spatial.distance import squareform
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_score

In [15]:
# Pairwise cosine distance (1 - cosine similarity) between all sampled rows.
# The result is a square rows-by-rows array, so memory grows quadratically.
health_dist = 1 - cosine_similarity(health_sample_1)

In [14]:
# Pairwise cosine distance (1 - cosine similarity) between all sampled rows.
# The result is a square rows-by-rows array, so memory grows quadratically.
pospar_dist = 1 - cosine_similarity(pospar_sample_1)

In [13]:
# Pairwise cosine distance (1 - cosine similarity) between all sampled rows.
# NOTE(review): obsgyn_sample_1 has 55765 rows, so this square array is about
# 25 GB if stored as float64 — confirm it fits in memory.
obsgyn_dist = 1 - cosine_similarity(obsgyn_sample_1)

In [16]:
# Shift each distance matrix in place so its smallest entry becomes 0.
# Presumably this removes tiny negative values left by floating-point error
# in 1 - cosine_similarity — confirm intent.
health_dist -= health_dist.min()
pospar_dist -= pospar_dist.min()
obsgyn_dist -= obsgyn_dist.min()

In [17]:
# ward() reads a 2-D array as observation vectors, not as pairwise distances,
# so the square distance matrix is first converted to the condensed 1-D form.
# checks = False skips the symmetry / zero-diagonal validation, which the
# float rounding and min-shift applied to the matrix may not satisfy exactly.
health_link = ward(squareform(health_dist, checks = False))

In [None]:
# ward() reads a 2-D array as observation vectors, not as pairwise distances,
# so the square distance matrix is first converted to the condensed 1-D form.
# checks = False skips the symmetry / zero-diagonal validation, which the
# float rounding and min-shift applied to the matrix may not satisfy exactly.
# NOTE(review): the linkage overwrites pospar_dist (health uses health_link);
# the name is kept so later cells are unaffected — consider pospar_link.
pospar_dist = ward(squareform(pospar_dist, checks = False))

In [None]:
# ward() reads a 2-D array as observation vectors, not as pairwise distances,
# so the square distance matrix is first converted to the condensed 1-D form.
# checks = False skips the symmetry / zero-diagonal validation, which the
# float rounding and min-shift applied to the matrix may not satisfy exactly.
# NOTE(review): the linkage overwrites obsgyn_dist (health uses health_link);
# the name is kept so later cells are unaffected — consider obsgyn_link.
obsgyn_dist = ward(squareform(obsgyn_dist, checks = False))

### General Women's Health