In [45]:
import re
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import torch
from langdetect import DetectorFactory
from langdetect import detect
from transformers import AutoTokenizer, AutoModelForSequenceClassification

Setup

In [46]:
# Plot style used for every figure in this notebook.
plt.style.use('bmh')

# NLTK resources needed further down: 'stopwords' for the stop-word list and
# 'punkt' for the tokenizer models that nltk.word_tokenize relies on.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' - confirm against the installed NLTK version.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# Show full free-text answers and up to 100 rows when displaying frames.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/janramdohr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Survey data processing

In [47]:
# Only these survey columns are used by the rest of the notebook.
survey_columns = [
    'Start Date',
    'introduction',
    'floor',
    'ground-floor',
    '1-floor',
    '2-floor',
    'emotion',
    'comfort',
    'space usage',
    'occupation',
    'usage',
]

# 'Start Date' is parsed as a datetime so it can be timezone-converted later.
data = pd.read_csv(
    '../data/original/survey/survey.csv',
    usecols=survey_columns,
    parse_dates=['Start Date'],
)

Keep only consenting responses, convert the survey timestamps to the correct timezone, and tidy up the columns

In [48]:
# Keep only respondents who agreed to participate. The explicit .copy() makes
# the filtered frame independent of the original, so the column assignments
# below do not raise SettingWithCopyWarning.
consented = data['introduction'] == 'I understand, and I agree to participate in the survey'
data = data.loc[consented].copy()

# Attach the America/Denver timezone to the parsed timestamps, then express
# them in Europe/Amsterdam time.
data['Start Date'] = (
    data['Start Date']
    .dt.tz_localize('America/Denver')
    .dt.tz_convert('Europe/Amsterdam')
)

def assign_place_id(row):
    """Map a survey row's floor and location answers to a numeric place id.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'floor' and, depending on the floor, 'ground-floor' or
        '1-floor'.

    Returns
    -------
    int or float
        The place id, or ``np.nan`` when the floor/location combination is
        not one of the known places.
    """
    # Column holding the location answer for each floor that has known places.
    location_column_by_floor = {
        'Ground floor': 'ground-floor',
        '1st Floor': '1-floor',
    }

    # NOTE(review): two different 1st-floor locations map to id 3 - confirm
    # this is intentional and not a numbering slip.
    place_id_by_location = {
        ('Ground floor', 'Round tables by the three plants (across wooden staircase)'): 1,
        ('Ground floor', 'Study corner next to the plant wall'): 2,
        ('1st Floor', 'Green group study tables (between a row of plants and railing)'): 3,
        ('1st Floor', 'Tables on the landing (with wooden floor) accessible by the black staircase'): 3,
        ('1st Floor', 'Yellow/white chairs & tables (besides the wooden staircase)'): 4,
    }

    floor = row['floor']
    location_column = location_column_by_floor.get(floor)
    if location_column is None:
        return np.nan
    return place_id_by_location.get((floor, row[location_column]), np.nan)

# Derive the numeric place id from each row's floor / location answers.
data['place_id'] = data.apply(assign_place_id, axis=1)

# Drop the raw location and consent columns, then give the remaining
# questions shorter names.
data = (
    data
    .drop(columns=['floor', 'ground-floor', '1-floor', '2-floor', 'introduction'])
    .rename(columns={'usage': 'frequent_use', 'occupation': 'isStudent', 'space usage': 'activity'})
)

# Encode the Yes/No questions as 1/0 (any other answer becomes NaN).
yes_no_codes = {'Yes': 1, 'No': 0}
for flag_column in ('frequent_use', 'isStudent'):
    data[flag_column] = data[flag_column].map(yes_no_codes)

# Force the free-text answers to strings; missing answers become 'nan',
# which later cells treat as the "no answer" sentinel.
for text_column in ('emotion', 'comfort'):
    data[text_column] = data[text_column].astype(str)

Detect answers not written in English

In [49]:
def detect_language(text):
    """Return the language code detected for ``text``, or 'unknown'.

    Parameters
    ----------
    text : str
        Free-text answer to classify.

    Returns
    -------
    str
        Whatever ``detect`` returns for ``text``; 'unknown' when detection
        raises an error.
    """
    try:
        return detect(text)
    except Exception:
        # Best effort: a failed detection should not abort the whole column.
        # Catching Exception (not a bare except) keeps KeyboardInterrupt and
        # SystemExit working while the column is being processed.
        return 'unknown'

# langdetect is non-deterministic by default; fixing the seed makes the
# detected languages (and the table below) reproducible between runs.
DetectorFactory.seed = 0

data['language_emotion'] = data['emotion'].apply(detect_language)
data['language_comfort'] = data['comfort'].apply(detect_language)

# Rows where neither answer was recognised as English; these are reviewed
# (and translated where needed) by hand.
non_english = data[(data['language_emotion'] != 'en') & (data['language_comfort'] != 'en')]
non_english

Unnamed: 0,Start Date,emotion,comfort,activity,isStudent,frequent_use,place_id,language_emotion,language_comfort
2,2023-05-03 13:22:43+02:00,Rustig,8/10,Learning / Working in a group,0.0,0.0,1.0,af,unknown
18,2023-05-09 11:21:03+02:00,Relaxed,Quite comfortable,Taking a break,1.0,1.0,4.0,so,fr
26,2023-05-09 14:21:39+02:00,Rustgevend mooi uitzicht,Wel redelijk veel geluid,Learning / Working by yourself,1.0,1.0,3.0,nl,nl
32,2023-05-03 13:12:22+02:00,,,,,,3.0,tl,tl
42,2023-05-11 11:32:21+02:00,"Kalm, rustig","Wel goed, lekker rustig",Learning / Working in a group,1.0,1.0,4.0,et,af
56,2023-05-15 14:54:11+02:00,Calm,,Learning / Working in a group,1.0,1.0,4.0,ca,tl
65,2023-05-09 14:51:01+02:00,,,,,,1.0,tl,tl
71,2023-05-10 10:17:34+02:00,,,,,,3.0,tl,tl
88,2023-05-16 10:29:32+02:00,,,,,,,tl,tl
95,2023-05-17 12:03:22+02:00,,,,,,4.0,tl,tl


Translations based on deepl.com

In [50]:
# English translations (from deepl.com) of the non-English answers,
# keyed by (row label, column).
translations = {
    (2, 'emotion'): 'Quiet',
    (26, 'emotion'): 'Soothingly beautiful view',
    (26, 'comfort'): 'Quite a lot of noise though',
    (42, 'emotion'): 'Calm, quiet',
    (42, 'comfort'): 'All right, nice and quiet',
}

for (row_label, column_name), english_text in translations.items():
    data.loc[row_label, column_name] = english_text

Tokenize the answers into words and stem the words for further analysis

In [51]:
def stem_words(words):
    """Return the Porter stem of every word in ``words``, in order.

    A new ``PorterStemmer`` is created on each call.
    """
    porter = nltk.stem.PorterStemmer()
    return list(map(porter.stem, words))

answer_columns = ('emotion', 'comfort')

# Word-level tokens for both free-text answers ...
for answer_column in answer_columns:
    data[answer_column + '_tokens'] = data[answer_column].apply(nltk.word_tokenize)

# ... followed by their stemmed counterparts (kept as a second pass so the
# new columns are appended in the same order as before).
for answer_column in answer_columns:
    data[answer_column + '_tokens_stem'] = data[answer_column + '_tokens'].apply(stem_words)


In [52]:
def _is_punctuation(token, punctuation_chars):
    """Return True when ``token`` is empty or made up only of punctuation characters."""
    return all(char in punctuation_chars for char in token)


def _content_words(token_lists, stop_words, punctuation_chars):
    """Flatten per-answer token lists, dropping stop words and punctuation tokens."""
    return [
        word
        for tokens in token_lists
        for word in tokens
        if word not in stop_words and not _is_punctuation(word, punctuation_chars)
    ]


def _frequency_table(fdist):
    """Turn a frequency distribution into a one-column frame sorted by descending count."""
    table = pd.DataFrame.from_dict(dict(fdist), orient='index', columns=['Frequency'])
    return table.sort_values(by='Frequency', ascending=False)


# NOTE(review): stop words are compared against *stemmed* tokens, so stop
# words whose stem differs from the word itself (e.g. 'this' -> 'thi') are
# presumably not removed - confirm whether that is acceptable.
stop_words = set(nltk.corpus.stopwords.words('english'))

# Punctuation is checked character by character. The previous substring test
# against this string let multi-character tokens such as '...' or "''" through.
punctuation = string.punctuation + '’'

stemmed_words_comfort = _content_words(data['comfort_tokens_stem'], stop_words, punctuation)
stemmed_words_emotion = _content_words(data['emotion_tokens_stem'], stop_words, punctuation)

fdist_comfort = nltk.probability.FreqDist(stemmed_words_comfort)
fdist_emotion = nltk.probability.FreqDist(stemmed_words_emotion)

df_com = _frequency_table(fdist_comfort)
df_em = _frequency_table(fdist_emotion)

# The index (the stemmed word) is written as the 'Word' column.
df_com.to_csv('../data/processed/survey/freq_dist_com.csv', index_label='Word')
df_em.to_csv('../data/processed/survey/freq_dist_em.csv', index_label='Word')

## Bert

In [53]:
# Draw 15 survey rows at random; the fixed random_state makes the draw
# repeatable. Presumably these rows are labelled by hand to validate the
# model's sentiment predictions - confirm.
validation_rows = data.sample(n=15, random_state=26)

In [54]:
# Export only the two free-text answers of the sampled rows, without the index.
validation_rows[['emotion','comfort']].to_csv('../data/processed/survey/validation.csv', index=False)

In [55]:
# The tokenizer and the classifier are loaded from the same pretrained checkpoint.
sentiment_checkpoint = 'cardiffnlp/twitter-roberta-base-sentiment'

tokenizer = AutoTokenizer.from_pretrained(sentiment_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(sentiment_checkpoint)

In [56]:
def score_sentiment(sentences):
    """Label each sentence as 'negative', 'neutral' or 'positive'.

    Parameters
    ----------
    sentences : iterable of str
        Texts to classify. The string 'nan' marks a missing answer.

    Returns
    -------
    list of str
        One entry per input, in order: the predicted label, or 'nan' for
        inputs equal to 'nan' (the model is not called for those).
    """
    # Index i of the model's logits corresponds to sentiment_labels[i].
    sentiment_labels = ['negative', 'neutral', 'positive']
    sentiment_scores = []

    # Pure inference: disabling autograd avoids building a gradient graph
    # for every sentence.
    with torch.no_grad():
        for sentence in sentences:
            if sentence == 'nan':
                sentiment_scores.append('nan')
                continue

            # Tokenize the sentence into PyTorch tensors and run the model.
            inputs = tokenizer(sentence, return_tensors='pt')
            outputs = model(**inputs)

            # Softmax is monotonic, so the arg-max of the raw logits is also
            # the most probable class.
            predicted_index = outputs.logits.argmax(dim=1).item()
            sentiment_scores.append(sentiment_labels[predicted_index])

    return sentiment_scores

# Score the complete free-text answers and store the labels next to them
# (comfort first, then emotion, so the new columns keep their order).
for answer_column in ('comfort', 'emotion'):
    data[answer_column + '_sentiment'] = score_sentiment(data[answer_column])

In [57]:
# NOTE(review): this relabels the sampled *text* columns with the "manual
# sentiment" names, and the renamed frame is not used by any later cell
# visible in this notebook - confirm whether this cell is still needed.
manual_column_names = {
    'comfort': 'comfort_sentiment_manual',
    'emotion': 'emotion_sentiment_manual',
}
validation_rows = validation_rows.rename(columns=manual_column_names)

In [61]:
# Merge the dataframes based on 'emotion' column
# NOTE(review): `validation_data` is not defined anywhere in the visible
# notebook (the execution counter jumps from 57 to 61), so this cell fails
# after Restart & Run All. Presumably it is the manually labelled version of
# validation.csv, loaded in a cell that has since been removed - restore that cell.
# NOTE(review): joining on the answer text means identical 'emotion' answers
# match every row carrying that text - confirm the texts are unique.
merged_df = data.merge(validation_data, how='inner', on='emotion')

# Check where emotion_sentiment != emotion_sentiment_manual or comfort_sentiment != comfort_sentiment_manual
# NOTE(review): missing values presumably compare as unequal here, so rows
# without a comfort label can show up as mismatches - verify against the output.
mismatch_df = merged_df.loc[(merged_df['emotion_sentiment'] != merged_df['emotion_sentiment_manual']) | (merged_df['comfort_sentiment'] != merged_df['comfort_sentiment_manual'])]

# Display the rows where values do not match
mismatch_df[['emotion','emotion_sentiment','emotion_sentiment_manual','comfort_x','comfort_sentiment','comfort_sentiment_manual']]

Unnamed: 0,emotion,emotion_sentiment,emotion_sentiment_manual,comfort_x,comfort_sentiment,comfort_sentiment_manual
8,"It is a nice place, with nice people, but I am very stressed",positive,positive,,,
10,Neutral; pretty okay but nothing special,neutral,neutral,"Fairly comfortable, nothing's explicitly wrong and the vibe is nice, but nothing stands out either",positive,neutral


# Aspect based sentiment analysis

Create, for each environmental parameter, the list of (stemmed) words related to it

In [24]:
# Labelled versions of the word-frequency tables; the code below reads their
# 'Word' and 'Label' columns.
com_labels = pd.read_csv('../data/processed/survey/freq_dist_com_labeled.csv')
em_labels = pd.read_csv('../data/processed/survey/freq_dist_em_labeled.csv')
labels = ['light', 'temperature', 'sound', 'air', 'furniture', 'plants']

# For every label, collect the words carrying that label as a plain list.
label_word_arrays_em = {
    label: em_labels.loc[em_labels['Label'] == label, 'Word'].to_list()
    for label in labels
}
label_word_arrays_com = {
    label: com_labels.loc[com_labels['Label'] == label, 'Word'].to_list()
    for label in labels
}

# Print the emotion word lists first, then the comfort word lists.
for word_arrays in (label_word_arrays_em, label_word_arrays_com):
    for label, words in word_arrays.items():
        print(f'{label}: {words}')

light: ['light', 'illumin']
temperature: ['cold']
sound: ['quiet', 'nois', 'noisi', 'silenc']
air: ['air']
furniture: ['chair', 'screen', 'tabl', 'chairs/tabl']
plants: ['plant', 'green', 'natur', 'greeneri']
light: ['light', 'bright', 'dark', 'well-lit', 'illumin', 'sunlight', 'dim']
temperature: ['temperatur', 'cold', 'warm', 'drafti', 'chilli', 'warmer', 'breez']
sound: ['nois', 'noisi', 'loud', 'talk', 'sound', 'quiet', 'noic', 'echo-i', 'silenc', 'lout', 'noisey', 'silent', 'rumbl', 'voic']
air: ['air', 'smell', 'fresh', 'breathabl']
furniture: ['chair', 'tabl', 'coffe', 'seat', 'toilet', 'machin', 'stool', 'sofa', 'screen', 'furnitur']
plants: ['plant', 'green', 'greeneri', 'natur', 'tree']


In [25]:
# Shared Porter stemmer instance, used by stem_sentence below.
ps = nltk.stem.PorterStemmer()

def custom_split(text):
    """Split an answer into sub-sentences.

    The text is first split on '. ', ', ', '! ' and ' but '. Each resulting
    piece is then split on ' and ', but only when every part has more than
    two words; otherwise the piece is kept whole.

    Parameters
    ----------
    text : str
        Free-text answer.

    Returns
    -------
    list of str
        The sub-sentences, in their original order.
    """
    # Raw string: '\.' and '\!' are regex escapes, not Python string escapes
    # (a non-raw literal triggers an invalid-escape-sequence warning).
    clauses = re.split(r'\. |, |\! | but ', text)

    final_split = []
    for clause in clauses:
        parts = clause.split(' and ')
        # len(parts) > 1 means ' and ' occurs in the clause. Short parts
        # (two words or fewer) keep the whole clause together.
        if len(parts) > 1 and all(len(part.split()) > 2 for part in parts):
            final_split.extend(parts)
        else:
            final_split.append(clause)

    return final_split

def stem_sentence(sentence):
    """Tokenize ``sentence`` into words and return the list of their stems.

    Uses the module-level stemmer ``ps``.
    """
    return [ps.stem(token) for token in nltk.tokenize.word_tokenize(sentence)]

def split_and_group(text, params):
    """Group the sub-sentences of ``text`` by the parameter they mention.

    Parameters
    ----------
    text : str
        Free-text answer; it is split with ``custom_split``.
    params : dict
        Maps each parameter name to the list of stems that indicate it.

    Returns
    -------
    dict
        For every parameter in ``params``, the list of (stripped)
        sub-sentences whose stemmed words contain at least one of that
        parameter's stems. A sub-sentence can appear under several parameters.
    """
    grouped = {param: [] for param in params}

    for clause in custom_split(text):
        clause_stems = stem_sentence(clause)
        for param, stems in params.items():
            if any(stem in clause_stems for stem in stems):
                grouped[param].append(clause.strip())

    return grouped

def _group_with(word_arrays):
    """Return a function that groups one answer's sub-sentences using ``word_arrays``."""
    def group(text):
        return split_and_group(text, word_arrays)
    return group


# One dict per answer: parameter -> sub-sentences mentioning it.
grouped_sentences_comfort = data['comfort'].apply(_group_with(label_word_arrays_com))
grouped_sentences_emotion = data['emotion'].apply(_group_with(label_word_arrays_em))

# Spread each dict over one column per parameter (comfort columns first,
# then emotion columns).
for column_prefix, grouped_sentences, word_arrays in (
    ('comfort_', grouped_sentences_comfort, label_word_arrays_com),
    ('emotion_', grouped_sentences_emotion, label_word_arrays_em),
):
    for param in word_arrays:
        data[column_prefix + param] = grouped_sentences.apply(lambda groups: groups[param])


Run the sentiment analysis on all newly created parameter columns

In [26]:
# Parameter columns to score, in the order their '_sentiment' columns should
# be appended: all comfort columns first, then all emotion columns.
aspects = ['sound', 'light', 'furniture', 'plants', 'air', 'temperature']
columns = [f'{prefix}_{aspect}' for prefix in ('comfort', 'emotion') for aspect in aspects]

for column in columns:
    # Join the sub-sentences of each answer into one text. An empty list
    # becomes the 'nan' sentinel, for which score_sentiment returns 'nan'
    # without calling the model.
    joined = data[column].apply(lambda parts: ', '.join(parts) if parts else 'nan')
    data[column + '_sentiment'] = score_sentiment(joined)


Clean up helper columns and export data

In [27]:
# Helper columns that were only needed to derive the sentiment labels.
helper_columns = [
    'language_emotion',
    'language_comfort',
    'comfort_temperature',
    'comfort_sound',
    'comfort_air',
    'comfort_furniture',
    'comfort_plants',
    'comfort_light',
    'emotion_temperature',
    'emotion_sound',
    'emotion_air',
    'emotion_furniture',
    'emotion_plants',
    'emotion_light',
]
data = data.drop(columns=helper_columns)

# Turn the 'nan' sentinel strings into real missing values. np.nan is used
# instead of None because older pandas versions ignored an explicit None
# here and forward-filled the matching cells instead.
data = data.replace('nan', np.nan)

# Missing values are written as empty fields; the index is not exported.
data.to_csv('../data/processed/survey/survey_sentiment.csv', na_rep='', index=False)