In [2]:
import pandas as pd 
import numpy as np
from helpers import *
#import coach_dicts
import os

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Validation libraries
from sklearn import metrics
from sklearn.metrics import accuracy_score, recall_score, balanced_accuracy_score, precision_score, precision_recall_curve
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score

# Display plots inside the notebook
%matplotlib inline

# Ignore warning related to pandas_profiling
import warnings
warnings.filterwarnings('ignore') 

from pathlib import Path

# Display all dataframe columns in outputs (it has 63 columns, which is wider than the notebook)
# This sets it up to display with a horizontal scroll instead of hiding the middle columns
pd.set_option('display.max_columns', 800) 
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 1800)

In [None]:
path = os.environ.get("DATA_PATH")

In [None]:
data = "tkdc_leona_2021-03-01.csv"

In [None]:
df = pd.read_csv(path + data, sep= ",", low_memory = False)

In [None]:
df_letter1 = df[["id", "letter_M1"]]

In [None]:
df_letter2 = df[["id", "letter_M2"]]

In [None]:
df_letter1.set_index("id", inplace=True)

In [None]:
df_letter2.set_index("id", inplace=True)

### Import dictionaries

In [None]:
wortdict_1 = pd.read_csv("Words_SentiArt.csv", sep=";")

In [3]:
wortdict_2 = pd.read_csv("LIWC.csv", sep=";")
wortdict_2

Unnamed: 0,(:,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12
0,(;,,,,,,,,,,,,
1,):,,,,,,,,,,,,
2,/:,,,,,,,,,,,,
3,4ev*,,,,,,,,,,,,
4,:(,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18705,übriges,,,,,,,,,,,,
18706,übst,,,,,,,,,,,,
18707,übte,,,,,,,,,,,,
18708,übung*,,,,,,,,,,,,


In [None]:
wortdict_2 = wortdict_2.iloc[:,0]

In [None]:
import re
wordlist_2 = []
for word in wortdict_2:
    word = str(word)
    if "*" in word:
        wordlist_2.append(word[:-1])
    else:
        wordlist_2.append(word)

In [None]:
import re
from happierfuntokenizing import emoticon_string, regex_strings
from unicode_codes import EMOTICON_REGEXES, EMOJI_UNICODE, EMOJI_ALIAS_UNICODE, UNICODE_EMOJI

In [None]:
emoticon_regexes = {k.lower(): re.compile(v) for k, v in EMOTICON_REGEXES.items()}


def removeNumeric(token):
    # remove phone numbers
    token = re.sub(r'((1-\d{3}-\d{3}-\d{4})|(1 \d{3} \d{3} \d{4})|(\d{3} \d{3} \d{4})|(\(\d{3}\) \d{3}-\d{4})|(\d{3}-\d{3}-\d{4}))$', '', token)
    # remove all numeric words
    token = '' if token.isdigit() else token
    # remove words with 2 successive digits
    token = '' if re.search(r'\d{2}', token) else token
    return token

def isEmojiOrEmoticon(token):
    if any([v.match(token) for k,v in emoticon_regexes.items()]) or token in UNICODE_EMOJI:
        return True
    else:
        return False

In [None]:
word_re = re.compile(r"""(%s)""" % "|".join(regex_strings), re.VERBOSE | re.I | re.UNICODE)

In [None]:
emoticon_re = re.compile(regex_strings[1], re.VERBOSE | re.I | re.UNICODE)

In [None]:
class Tokenizer:
    def __init__(self, preserve_case=False, use_unicode=True):
        self.preserve_case = preserve_case
        self.use_unicode = use_unicode

    def tokenize(self, s):
        """
        Argument: s -- any string or unicode object
        Value: a tokenize list of strings; conatenating this list returns the original string if preserve_case=False
        """        
        # Try to ensure unicode:
        if self.use_unicode:
            try:
                s = str(s)
            except UnicodeDecodeError:
                s = str(s).encode('string_escape')
                s = str(s)
        # Fix HTML character entitites:
        #s = self.__html2unicode(s)
        #s = self.__removeHex(s)
        # Tokenize:
        words = word_re.findall(s)
        #print words #debug
        # Possible alter the case, but avoid changing emoticons like :D into :d:
        if not self.preserve_case:            
            words = list(map((lambda x : x if emoticon_re.search(x) else x.lower()), words))
        
        return words

In [None]:
tokenizer = Tokenizer(preserve_case=False)

In [None]:
multSpace = re.compile(r'\s\s+')
startSpace = re.compile(r'^\s+')
endSpace = re.compile(r'\s+$')
multDots = re.compile(r'\.\.\.\.\.+') #more than four periods
newlines = re.compile(r'\s*\n\s*')

def shrinkSpace(s):
    """turns multiple spaces into 1"""
    s = multSpace.sub(' ',s)
    s = multDots.sub('....',s)
    s = endSpace.sub('',s)
    s = startSpace.sub('',s)
    s = newlines.sub(' <NEWLINE> ',s)
    return s

In [None]:
for letter in df_letter1.columns[0]:
    letter = re.split(" ", letter)

In [None]:
df_letter1 = df_letter1.applymap(lambda s:s.lower() if type(s) == str else s)

In [None]:
cleaned_list_1 = []
for letter in df_letter1["letter_M1"]:
    cleaned_words = []
    if isinstance(letter, str):
        cleaned_token = shrinkSpace(letter.strip())
        words = tokenizer.tokenize(cleaned_token)

        for w in words:
            if removeNumeric(w) and (isEmojiOrEmoticon(w)or w.lower() in wordlist_2):
                cleaned_words.append(w)

    cleaned_list_1.append(cleaned_words)        

In [None]:
df_letter1["letter_M1"] = cleaned_list_1

In [None]:
df_letter2 = df_letter2.applymap(lambda s:s.lower() if type(s) == str else s)

In [None]:
for letter in df_letter2.columns[0]:
    letter = re.split(" ", letter)

In [None]:
cleaned_list_2 = []
for letter in df_letter2["letter_M2"]:
    cleaned_words = []
    if isinstance(letter, str):
        cleaned_token = shrinkSpace(letter.strip())
        words = tokenizer.tokenize(cleaned_token)

        for w in words:
            if removeNumeric(w) and (isEmojiOrEmoticon(w)or w.lower() in wordlist_2):
                cleaned_words.append(w)

    cleaned_list_2.append(cleaned_words)  

In [None]:
df_letter2["letter_M2"] = cleaned_list_2

In [None]:
#filename = path + "letter1_cleaned.csv"
#df_letter1.to_csv(filename, encoding="utf-8", index=True)

In [None]:
#filename = path + "letter2_cleaned.csv"
#df_letter2.to_csv(filename, encoding="utf-8", index=True)

In [None]:
df["letter_M1"] = cleaned_list_1

In [None]:
df["letter_M2"] = cleaned_list_2

In [None]:
df.head()

In [None]:
column_zwei = ['registration','studyVariant','coach','PRE_bdi1','PRE_bdi2',
               'PRE_bdi3','PRE_bdi4','PRE_bdi5','PRE_bdi6','PRE_bdi7','PRE_bdi8','PRE_bdi9','PRE_bdi10',
               'PRE_bdi11','PRE_bdi12','PRE_bdi13','PRE_bdi14','PRE_bdi15','PRE_bdi16','PRE_bdi17','PRE_bdi18',
               'PRE_bdi19','PRE_bdi20','PRE_bdi21','POST_phqD1','POST_phqD2','POST_phqD3','POST_phqD4','POST_phqD5',
               'POST_phqD6','POST_phqD7','POST_phqD8','POST_phqD9',
               'PRE_phqS1','PRE_phqS2','PRE_phqS3','PRE_phqS4','PRE_phqS5','PRE_phqS6','PRE_phqS7',
               'PRE_phqS8','PRE_phqS9','PRE_phqS10','PRE_phqD1','PRE_phqD2','PRE_phqD3','PRE_phqD4','PRE_phqD5',
               'PRE_phqD6','PRE_phqD7','PRE_phqD8','PRE_phqD9', 'PRE_birth','PRE_sex','PRE_education',
               'PRE_work','PRE_household','PRE_relation','PRE_residence','PRE_internet','PRE_height','PRE_weight',
               'PRE_treatment','PRE_support','PRE_kPT','PRE_ill','PRE_sickleave','PRE_doc',
               'PRE_neurol','PRE_selfhelp','PRE_counsel','PRE_therapy','PRE_med','PRE_hospital',
            'PRE_eurohis1','PRE_eurohis2','PRE_eurohis3','PRE_eurohis4','PRE_eurohis5',
               'PRE_eurohis6','PRE_eurohis7','PRE_eurohis8', 'TI_score','TI_bip1',
               'TI_bip2','TI_MDE','TI_dyst','TI_F25','TI_F22','TI_F23',
               'TI_F29','TI_MDE_vr','TI_MDE_tr','TI_HYP_vr','TI_MAN_vr','TI_medik','TI_rekrut','PRE_gad1',
               'PRE_gad2','PRE_gad3','PRE_gad4','PRE_gad5','PRE_gad6','PRE_gad7', 'PRE_costa1', 'PRE_costa2', 'PRE_costa3',
              'PRE_costa4', 'PRE_costa5', 'PRE_costa6', 'PRE_costa7', 'PRE_costa8', 'PRE_costa9', 'PRE_costa10', 'PRE_costa11',
              'PRE_costa12', 'PRE_costa13', 'PRE_costa14', 'PRE_costa15', 'PRE_costa16', 'PRE_costa17', 'PRE_costa18',
              'PRE_costa19', 'PRE_costa20', 'PRE_costa21', 'PRE_pathev1', 'PRE_pathev2', 'PRE_pathev3', 'PRE_pathev4',
              'PRE_pathev5', 'PRE_pathev6', 'PRE_pathev7', 'PRE_pathev8', 'PRE_pathev9', 'PRE_pathev10', 
              'PRE_euheals1','PRE_euheals2','PRE_euheals3','PRE_ipqr1','PRE_ipqr2','PRE_ipqr3','PRE_ipqr4','PRE_ipqr5',
               'PRE_ipqr6','PRE_ipqr7','PRE_ipqr8','PRE_ipqr9','PRE_ipqr10','PRE_ipqr11','PRE_ipqr12','PRE_ipqr13',
               'PRE_ipqr14','PRE_ipqr15','PRE_ipqr16','PRE_ipqr17','PRE_ipqr18', 'PRE_bsss1','PRE_bsss2','PRE_bsss3',
               'PRE_bsss4','PRE_bsss5','PRE_bsss6','PRE_bsss7','PRE_bsss8','PRE_bsss9','PRE_bsss10','PRE_bsss11',
               'PRE_bsss12','PRE_bsss13','PRE_gpse1','PRE_gpse2','PRE_gpse3','PRE_gpse4','PRE_gpse5','PRE_gpse6',
               'PRE_gpse7','PRE_gpse8','PRE_gpse9','PRE_gpse10','PRE_pvq1','PRE_pvq2','PRE_pvq3','PRE_pvq4','PRE_pvq5',
               'PRE_pvq6','PRE_pvq7','PRE_pvq8','PRE_pvq9','PRE_pvq10','PRE_pvq11','PRE_pvq12','PRE_pvq13','PRE_pvq14',
               'PRE_pvq15','PRE_pvq16','PRE_pvq17','PRE_pvq18','PRE_pvq19','PRE_pvq20','PRE_pvq21', 'PRE_imet1','PRE_imet2','PRE_imet3','PRE_imet4','PRE_imet5','PRE_imet6','PRE_imet7',
                'PRE_imet8','PRE_imet9','PRE_imet10', 'M1_phqD1','M1_phqD2','M1_phqD3','M1_phqD4','M1_phqD5','M1_phqD6',
               'M1_phqD7','M1_phqD8','M1_phqD9', 'letter_M1', 'letter_M2']

In [None]:
df_short = df[column_zwei]

In [None]:
df_short.head()

In [None]:
df_short["letter_M1"] = df_short["letter_M1"].apply(lambda x:np.nan if len(x) ==0 else x)

In [None]:
exclusion_cols = ["TI_F29", "TI_F23", "TI_F22", "TI_F25"]
df_short.drop(exclusion_cols, axis=1, inplace=True)

In [None]:
df_short["letter_M2"] = df_short["letter_M2"].apply(lambda x:np.nan if len(x) ==0 else x)

In [None]:
df_short = df_short[df_short[["letter_M1", "letter_M2"]].notnull().all(axis=1)]

In [None]:
df_short.shape

In [None]:
df.shape