## Preprocessing

In [None]:
import pandas as pd

In [None]:
# Read the survey responses from the archived CSV export.
survey = pd.read_csv(filepath_or_buffer="./Archive/Survey.csv")

In [None]:
# Keep only the rows that carry a patient ID.
survey = survey[survey["Patient ID#"].notna()]

In [None]:
# Columns carried forward: the join key followed by the 14 value scores.
id_column = ["Patient ID#"]
value_columns = [
    "Conservation",
    "Conformity",
    "Tradition",
    "Security",
    "Self-Transcendance",
    "Benevolence",
    "Universalism",
    "Self-Enhancement",
    "Power",
    "Achievement",
    "Stimulation",
    "Openness to Change",
    "Hedonism",
    "Self-Direction",
]
survey = survey[id_column + value_columns]

In [None]:
import glob

In [None]:
# Read every per-conversation CSV and stack them in a single concat.
# - Forward slashes keep the pattern portable; the backslash form only acts as
#   a path separator on Windows.
# - sorted() makes the row order (and therefore the order in which a patient's
#   messages are later joined) independent of the filesystem listing order.
# - One concat over a list avoids re-copying the accumulated frame per file.
transcript_paths = sorted(glob.glob("Archive/two_speakers_ID/*.csv"))
if not transcript_paths:
    # Fail here with a clear message instead of a KeyError in a later cell.
    raise FileNotFoundError("No CSV files found under Archive/two_speakers_ID/")
dataframe = pd.concat([pd.read_csv(path) for path in transcript_paths])

In [None]:
# Keep only the utterances whose Role is "Patient".
is_patient = dataframe["Role"].eq("Patient")
dataframe = dataframe.loc[is_patient]

In [None]:
# Cast the ID column to int64 so it can be matched against the survey's patient IDs.
# `dataframe` is the result of a boolean-mask selection, so assigning a column
# in place can raise SettingWithCopyWarning; assign() returns a new frame instead.
dataframe = dataframe.assign(ID=dataframe["ID"].astype("int64"))

In [None]:
# Collapse each ID's messages into one space-separated string, one row per ID.
messages_by_id = dataframe.groupby("ID")["Message"]
dataframe = messages_by_id.apply(" ".join).reset_index()

In [None]:
# Cast the survey's patient ID to int64 so it matches the transcript "ID" dtype for the merge.
# `survey` comes from row filtering and column selection, so in-place column
# assignment can raise SettingWithCopyWarning; assign() builds a new frame instead.
survey = survey.assign(**{"Patient ID#": survey["Patient ID#"].astype("int64")})

In [None]:
# Inner join: keep only patients present in both the survey and the transcripts.
merged = survey.merge(dataframe, how="inner", left_on="Patient ID#", right_on="ID")

In [None]:
# "ID" already carries the key after the merge, so drop the survey-side copy.
merged = merged.drop(columns=["Patient ID#"])

In [None]:
# Persist the merged table (without the index) for the modelling section.
merged.to_excel(excel_writer="combined.xlsx", index=False)

## Embeddings

In [None]:
!pip install pandas openpyxl nltk spacy
!python -m spacy download en_core_web_sm

In [1]:
import pandas as pd

# Load the merged survey/transcript table written by the preprocessing section.
dataset = pd.read_excel(io="combined.xlsx")

In [2]:
# Discard rows that have no "Conformity" score.
dataset = dataset[dataset["Conformity"].notna()]

In [3]:
import nltk
import spacy
from nltk.corpus import stopwords, words
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources in the same order as before.
for resource in ("punkt", "stopwords", "words"):
    nltk.download(resource)

# spaCy English pipeline; process_text uses it for lemmatization.
nlp = spacy.load("en_core_web_sm")

# Kept as the final expression so the cell still displays the download status.
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Waqar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Waqar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Waqar\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Waqar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
# Built once at cell execution: the original rebuilt this set on every call.
ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))


def process_text(text):
    """Return the unique lemmas of the alphabetic, non-stop-word tokens in `text`.

    The text is lower-cased and tokenized with NLTK; tokens that are not purely
    alphabetic or that are English stop words are dropped; the survivors are
    re-joined and lemmatized with the spaCy pipeline `nlp`.

    Args:
        text: The message to process. A non-string value (for example NaN from
            an empty spreadsheet cell) yields an empty list.

    Returns:
        list[str]: The distinct lemmas, sorted so the result does not depend on
        set iteration order (which varies between interpreter sessions).
    """
    if not isinstance(text, str):
        return []
    tokens = [
        word
        for word in word_tokenize(text.lower())
        if word.isalpha() and word not in ENGLISH_STOP_WORDS
    ]
    doc = nlp(" ".join(tokens))
    return sorted({token.lemma_ for token in doc})

In [5]:
# Gather the lemmas of every message into one flat list (duplicates included).
bag_of_words = [
    lemma
    for message in dataset["Message"]
    for lemma in process_text(message)
]

In [6]:
# Deduplicate the vocabulary. sorted() fixes the token order so that feature
# column i refers to the same token on every run; list(set(...)) yields an
# order that can change between kernel sessions.
bag_of_words = sorted(set(bag_of_words))

In [7]:
# Vocabulary size: the number of distinct lemmas collected above.
len(bag_of_words)

6801

In [8]:
# Target columns, in the order they appear in each row of y.
TARGET_COLUMNS = [
    "Conservation",
    "Conformity",
    "Tradition",
    "Security",
    "Self-Transcendance",
    "Benevolence",
    "Universalism",
    "Self-Enhancement",
    "Power",
    "Achievement",
    "Stimulation",
    "Openness to Change",
    "Hedonism",
    "Self-Direction",
]

# Map each vocabulary token to its column once, replacing two linear list scans
# per token (`in` plus `.index`). setdefault keeps the first position of a
# repeated token, which is what list.index() returned.
token_to_column = {}
for column, token in enumerate(bag_of_words):
    token_to_column.setdefault(token, column)

X = []
y = []

# The row label is unused; `_` also avoids the original's reuse of `index` for
# both the row label and the vocabulary position.
for _, row in dataset.iterrows():
    encoding_vector = [0] * len(bag_of_words)

    for token in process_text(row["Message"]):
        column = token_to_column.get(token)
        if column is not None:
            encoding_vector[column] += 1

    X.append(encoding_vector)
    y.append(list(row[TARGET_COLUMNS]))

In [9]:
import numpy as np

In [10]:
# Convert the list of per-row encoding vectors into a 2-D array.
X = np.asarray(X)

In [11]:
# Convert the list of per-row target lists into a 2-D array.
y = np.asarray(y)

In [12]:
# (number of dataset rows, vocabulary size)
X.shape

(112, 6801)

In [13]:
# (number of dataset rows, number of target columns)
y.shape

(112, 14)

In [17]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

# Define the model.
# The input and output widths are read from X and y instead of being hard-coded
# (6801 / 14), so the network stays in step with the vocabulary and target list.
# The input is declared with an Input layer rather than `input_dim=` on the
# first Dense layer, which is the form Keras recommends for Sequential models.
model = Sequential([
    Input(shape=(X.shape[1],)),  # One feature per vocabulary token
    Dense(1024, activation='relu'),  # First dense layer
    Dropout(0.5),  # Dropout for regularization
    # Dense(256, activation='relu'),  # Hidden layer
    # Dense(128, activation='relu'),  # Hidden layer
    # Dense(64, activation='relu'),  # Hidden layer
    # Dense(32, activation='relu'),  # Hidden layer
    Dense(y.shape[1], activation='linear')  # Output layer (one unit per target)
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mse'])
# validation_split=0.2 holds out the last 20% of the rows of X/y for validation.
# NOTE(review): EarlyStopping is imported but not passed to fit(); all 500 epochs run.
history = model.fit(X, y, epochs=500, batch_size=64, validation_split=0.2)

Epoch 1/500


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 253ms/step - loss: 32.7755 - mse: 32.7755 - val_loss: 18.5242 - val_mse: 18.5242
Epoch 2/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step - loss: 11.4085 - mse: 11.4085 - val_loss: 12.4237 - val_mse: 12.4237
Epoch 3/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step - loss: 12.3858 - mse: 12.3858 - val_loss: 11.8756 - val_mse: 11.8756
Epoch 4/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step - loss: 9.1006 - mse: 9.1006 - val_loss: 12.9525 - val_mse: 12.9525
Epoch 5/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 122ms/step - loss: 6.0449 - mse: 6.0449 - val_loss: 14.6629 - val_mse: 14.6629
Epoch 6/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step - loss: 6.1042 - mse: 6.1042 - val_loss: 14.4513 - val_mse: 14.4513
Epoch 7/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 115ms/step - l

In [40]:
# Encode a single transcript the same way the training rows were encoded.
# .iloc[3] selects by position. `dataset` was filtered without resetting its
# index, so the label-based dataset["Message"][3] can pick a different row than
# the positional `.values[3]` used for the ground truth, or raise KeyError.
sample_message = dataset["Message"].iloc[3]

input_vector = [0] * len(bag_of_words)
for token in process_text(sample_message):
    if token in bag_of_words:
        input_vector[bag_of_words.index(token)] += 1

In [41]:
# Wrap the encoded vector in a batch of one, then unwrap the single prediction.
# NOTE(review): the execution counts (In [41] here, In [39] on the later model
# cell) suggest `model` was the regularized network when this output was
# produced; confirm which model is in scope before comparing results.
single_row_batch = np.array([input_vector])
model.predict(single_row_batch)[0]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step


array([7.8551393, 7.8407826, 7.5132203, 7.9932747, 7.6798844, 7.9065366,
       7.619238 , 7.607812 , 7.4186025, 7.8356323, 8.22707  , 6.518433 ,
       5.096437 , 6.571734 ], dtype=float32)

### Ground-truth scores for row 3: [8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 6.333333333, 5.0, 6.0]

In [20]:
# Survey scores of the row at position 3, in target-column order.
score_columns = [
    "Conservation",
    "Conformity",
    "Tradition",
    "Security",
    "Self-Transcendance",
    "Benevolence",
    "Universalism",
    "Self-Enhancement",
    "Power",
    "Achievement",
    "Stimulation",
    "Openness to Change",
    "Hedonism",
    "Self-Direction",
]
list(dataset[score_columns].to_numpy()[3])

[8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 6.333333333, 5.0, 6.0]

In [39]:
# Changes by Waqar
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

# Define the model.
# The input and output widths are read from X and y instead of being hard-coded
# (6801 / 14), and the input is declared with an Input layer rather than
# `input_dim=` on the first Dense layer.
model = Sequential([
    Input(shape=(X.shape[1],)),  # One feature per vocabulary token
    Dense(1024, activation='relu', kernel_regularizer=l2(0.01)),  # First dense layer, L2-penalized
    Dropout(0.5),  # Dropout for regularization
    Dense(512, activation='relu'),  # Hidden layer
    # Dense(128, activation='relu'),  # Hidden layer
    # Dense(64, activation='relu'),  # Hidden layer
    # Dense(32, activation='relu'),  # Hidden layer
    Dense(y.shape[1], activation='linear')  # Output layer (one unit per target)
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mse'])
# The reported loss includes the L2 penalty, so it is larger than the mse metric.
# NOTE(review): EarlyStopping is imported but not passed to fit(); all 500 epochs run.
history = model.fit(X, y, epochs=500, batch_size=32, validation_split=0.2)

Epoch 1/500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 261ms/step - loss: 45.2495 - mse: 28.2355 - val_loss: 26.6775 - val_mse: 12.3984
Epoch 2/500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 155ms/step - loss: 26.3264 - mse: 12.5377 - val_loss: 24.7014 - val_mse: 12.7207
Epoch 3/500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 123ms/step - loss: 18.2331 - mse: 6.6028 - val_loss: 26.5709 - val_mse: 16.2542
Epoch 4/500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 168ms/step - loss: 16.7376 - mse: 6.6784 - val_loss: 21.4543 - val_mse: 12.3599
Epoch 5/500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 209ms/step - loss: 13.5797 - mse: 4.6752 - val_loss: 16.9367 - val_mse: 8.7488
Epoch 6/500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step - loss: 12.3757 - mse: 4.3309 - val_loss: 17.3374 - val_mse: 9.8356
Epoch 7/500
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13