In [1]:
import pandas as pd
import numpy as np
import string
import spacy
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cluster import KMeans


import wordcloud

from pathlib import Path

In [2]:
# Folder that holds the survey export (a relative path).
data_folder = Path("data_tech/")
# Location of the OSMI 2016 CSV file inside that folder.
file_to_open = data_folder.joinpath("OSMI_2016_kurz.csv")

In [3]:
# Read the raw CSV and normalise every column header to lower case so the
# headers can be matched against lower-case keys later on.
df_2016 = pd.read_csv(file_to_open)
df_2016.columns = [header.lower() for header in df_2016.columns]

In [4]:
# Maps each lower-cased question header to a short column label.
# Built from (question, label) pairs; insertion order is preserved.
# NOTE(review): "why or why not?.1" is presumably the name pandas gives the
# second, duplicated "why or why not?" header -- confirm against the CSV.
rename_dict = dict([
    # --- employment and employer support ---
    ("are you self-employed?", "Employment"),
    ("does your employer provide mental health benefits as part of healthcare coverage?", "Ment_Benefit"),
    ("has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?", "Ment_Discuss"),
    ("do you think that discussing a mental health disorder with your employer would have negative consequences?", "Ment_Consequence"),
    ("do you think that discussing a physical health issue with your employer would have negative consequences?", "Phys_Consequence"),
    ("would you feel comfortable discussing a mental health disorder with your coworkers?", "Discuss_Coworkers"),
    ("would you feel comfortable discussing a mental health disorder with your direct supervisor(s)?", "Discuss_Supervisor"),
    ("do you feel that your employer takes mental health as seriously as physical health?", "Ment_vs_Phys"),
    ("have you heard of or observed negative consequences for co-workers who have been open about mental health issues in your workplace?", "Obs_Consequence"),
    ("if you have been diagnosed or treated for a mental health disorder, do you ever reveal this to coworkers or employees?", "Reveal_Treatment"),
    ("do you believe your productivity is ever affected by a mental health issue?", "Productivity"),
    # --- interview questions and their free-text explanations ---
    ("would you be willing to bring up a physical health issue with a potential employer in an interview?", "Interview_phys1"),
    ("why or why not?", "Interview_phys2"),
    ("would you bring up a mental health issue with a potential employer in an interview?", "Interview_psych1"),
    ("why or why not?.1", "Interview_psych2"),
    # --- perceived consequences and observations ---
    ("do you feel that being identified as a person with a mental health issue would hurt your career?", "Career_Consequence"),
    ("do you think that team members/co-workers would view you more negatively if they knew you suffered from a mental health issue?", "Coworkers_view"),
    ("how willing would you be to share with friends and family that you have a mental illness?", "Share_friends"),
    ("have you observed or experienced an unsupportive or badly handled response to a mental health issue in your current or previous workplace?", "Obs_Response1"),
    ("have your observations of how another individual who discussed a mental health disorder made you less likely to reveal a mental health issue yourself in your current workplace?", "Obs_Response2"),
    # --- personal history ---
    ("have you had a mental health disorder in the past?", "Disorder"),
    ("have you ever sought treatment for a mental health issue from a mental health professional?", "Treatment"),
    ("if you have a mental health issue, do you feel that it interferes with your work when being treated effectively?", "Interferes1"),
    ("if you have a mental health issue, do you feel that it interferes with your work when not being treated effectively?", "Interferes2"),
    # --- demographics ---
    ("what is your age?", "Age"),
    ("what is your gender?", "Gender"),
    ("what country do you live in?", "Country"),
])

In [5]:
# Replace the long question headers with the short labels from rename_dict.
# errors="raise" turns any key that is missing from the frame into a KeyError
# instead of silently skipping it.
df = df_2016.rename(mapper=rename_dict, axis="columns", errors="raise")

In [6]:
# Keep only the two interview questions and their free-text explanations.
df = df.loc[
    :,
    [
        "Interview_phys1",
        "Interview_phys2",
        "Interview_psych1",
        "Interview_psych2",
    ],
]

In [7]:
# Dropping rows is normally not recommended, but here we already know how
# many values are missing. Any row with at least one missing value is removed.
df = df.dropna(axis="index", how="any")

In [8]:
# Lower-case every string cell; non-string values pass through unchanged.
# Column-wise Series.map is used instead of DataFrame.applymap, which is
# deprecated since pandas 2.1 (renamed to DataFrame.map) and emits a
# FutureWarning there. This form runs on both older and newer pandas.
df = df.apply(
    lambda column: column.map(
        lambda value: value.lower() if isinstance(value, str) else value
    )
)

In [9]:
# The English spaCy language model has to be downloaded beforehand,
# e.g. with `python -m spacy download en_core_web_md`; see
# https://spacy.io/usage/models
nlp = spacy.load("en_core_web_md")

In [10]:
# Tokenize the free-text explanations of the physical-health question.
# nlp.pipe processes the texts as a batched stream instead of calling nlp()
# once per text; it yields one Doc per input, in input order.
df["Interview_phys_NLP"] = list(nlp.pipe(df["Interview_phys2"]))

In [11]:
# Tokenize the free-text explanations of the mental-health question.
# nlp.pipe processes the texts as a batched stream instead of calling nlp()
# once per text; it yields one Doc per input, in input order.
df["Interview_psych_NLP"] = list(nlp.pipe(df["Interview_psych2"]))

In [12]:
# Preview the first five rows, including the two parsed spaCy columns.
df.head(n=5)

Unnamed: 0,Interview_phys1,Interview_phys2,Interview_psych1,Interview_psych2,Interview_phys_NLP,Interview_psych_NLP
1,maybe,it would depend on the health issue. if there ...,no,while mental health has become a more prominen...,"(it, would, depend, on, the, health, issue, .,...","(while, mental, health, has, become, a, more, ..."
2,yes,"they would provable need to know, to judge if ...",yes,"stigma, mainly.","(they, would, provable, need, to, know, ,, to,...","(stigma, ,, mainly, .)"
3,yes,"old back injury, doesn't cause me many issues ...",maybe,would not if i was not 100% sure that the disc...,"(old, back, injury, ,, does, n't, cause, me, m...","(would, not, if, i, was, not, 100, %, sure, th..."
4,maybe,depending on the interview stage and whether i...,no,i don't know,"(depending, on, the, interview, stage, and, wh...","(i, do, n't, know)"
5,yes,if it would potentially affect my ability to d...,maybe,it would depend on the field & what i knew of ...,"(if, it, would, potentially, affect, my, abili...","(it, would, depend, on, the, field, &, what, i..."


In [24]:
# Pick the parsed mental-health explanation stored under index label 1.
example = df.loc[1, "Interview_psych_NLP"]

In [25]:
# Display the parsed answer selected in the previous cell; the spaCy Doc is
# shown as its (lower-cased) text.
example

while mental health has become a more prominent issue recently, i feel like there is still a lot of stigma surrounding it. at this point, with our culture, i would not bring it up. i hope that within the next 20-30 years that stigma will be gone and it can be brought up safely in an interview.

In [13]:
def spacify_my_text(text, custom_stop=None, lemmatize=True):
    '''Clean a single text with spaCy and return it as a one-element list.

    The text is lower-cased and parsed with the module-level ``nlp``
    pipeline. Stop words, punctuation and whitespace tokens are dropped and
    the remaining tokens (or their lemmas) are joined with single spaces.

    Parameters
    ----------
    text : str
        The text to clean.
    custom_stop : iterable of str, optional
        Extra stop words to remove in addition to spaCy's defaults. They are
        matched case-insensitively, because the text itself is lower-cased
        before the comparison.
    lemmatize : bool, default True
        If True, keep each token's base form (``token.lemma_``); otherwise
        keep the token text as written.

    Returns
    -------
    list of str
        A list holding exactly one string: the cleaned text.
    '''
    # A set gives constant-time membership tests for every token.
    blocked = set(nlp.Defaults.stop_words)
    blocked.update(string.punctuation)
    if custom_stop:
        # Lower-case so that e.g. "Stigma" still matches the lower-cased text.
        blocked.update(str(word).lower() for word in custom_stop)

    kept = []
    for token in nlp(text.lower()):
        word = str(token)
        if word in blocked:
            continue
        # Whitespace runs of any length (not just a single space) carry no
        # content and are skipped.
        if not word.strip():
            continue
        # Multi-character punctuation such as "..." or "--" can come out of
        # the tokenizer as one token, which the per-character punctuation
        # entries above do not catch.
        if all(char in string.punctuation for char in word):
            continue
        # With spaCy, a token's base form is available as token.lemma_.
        kept.append(str(token.lemma_) if lemmatize else word)

    return [' '.join(kept)]