## Personality Trait Prediction

In [10]:
# !pip install nltk
# !pip install spacy
# !pip install pandas
# !pip install tensorflow
# !pip install scikit-learn
# !pip install openpyxl

# !python -m spacy download en_core_web_sm



Collecting openpyxl
  Using cached openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Using cached et_xmlfile-1.1.0-py3-none-any.whl.metadata (1.8 kB)
Using cached openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Using cached et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.5


In [4]:
import glob
import nltk
import spacy
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from nltk.corpus import stopwords, words
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

nltk.download("punkt")
nltk.download("stopwords")
nltk.download("words")
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Waqar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Waqar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Waqar\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


## Preprocessing

In [11]:
survey = pd.read_csv("../Archive/Survey (45).csv")
survey = survey[~survey["Patient ID#"].isnull()]
survey = survey[
    [
        "Patient ID#",
        "Conservation",
        "Conformity",
        "Tradition",
        "Security",
        "Self-Transcendance",
        "Benevolence",
        "Universalism",
        "Self-Enhancement",
        "Power",
        "Achievement",
        "Stimulation",
        "Openness to Change",
        "Hedonism",
        "Self-Direction",
    ]
]

dataframe = pd.DataFrame()
for file in glob.glob(r"..\Archive\two_speakers_ID\*.csv"):
    dataframe = pd.concat([dataframe, pd.read_csv(file)])

dataframe = dataframe[dataframe["Role"] == "Patient"]
dataframe["ID"] = dataframe["ID"].astype("int64")
dataframe = (
    dataframe.groupby("ID")["Message"].apply(lambda msgs: " ".join(msgs)).reset_index()
    
)
survey["Patient ID#"] = survey["Patient ID#"].str.replace(',', '').astype("int64")
merged = pd.merge(survey, dataframe, left_on="Patient ID#", right_on="ID")
merged = merged.drop(["Patient ID#"], axis=1)
merged.to_excel("../combined.xlsx", index=False)

## Loading Embeddings

In [13]:
dataset = pd.read_excel("../combined.xlsx")

In [14]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 658 entries, 0 to 657
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Conservation        658 non-null    float64
 1   Conformity          112 non-null    float64
 2   Tradition           112 non-null    float64
 3   Security            112 non-null    float64
 4   Self-Transcendance  658 non-null    float64
 5   Benevolence         112 non-null    float64
 6   Universalism        112 non-null    float64
 7   Self-Enhancement    658 non-null    float64
 8   Power               112 non-null    float64
 9   Achievement         112 non-null    float64
 10  Stimulation         112 non-null    float64
 11  Openness to Change  658 non-null    float64
 12  Hedonism            112 non-null    float64
 13  Self-Direction      112 non-null    float64
 14  ID                  658 non-null    int64  
 15  Message             658 non-null    object 
dtypes: float

In [15]:
dataset = dataset[~dataset["Conformity"].isnull()]

In [16]:
dataset.columns

Index(['Conservation', 'Conformity', 'Tradition', 'Security',
       'Self-Transcendance', 'Benevolence', 'Universalism', 'Self-Enhancement',
       'Power', 'Achievement', 'Stimulation', 'Openness to Change', 'Hedonism',
       'Self-Direction', 'ID', 'Message'],
      dtype='object')

In [7]:
# pip install nltk spacy
# python -m spacy download en_core_web_sm

In [17]:
def process_text(text):
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    doc = nlp(" ".join(tokens))
    lemmatized_tokens = [token.lemma_ for token in doc]
    return list(set(lemmatized_tokens))

In [18]:
bag_of_words = []
for message in dataset["Message"]:
    bag_of_words += process_text(message)

In [19]:
bag_of_words = list(set(bag_of_words))

In [20]:
len(bag_of_words)

6801

In [21]:
X = []
y = []

for index, row in dataset.iterrows():

    encoding_vector = [0] * len(bag_of_words)

    for token in process_text(row["Message"]):

        if token in bag_of_words:

            index = bag_of_words.index(token)

            encoding_vector[index] += 1

    X.append(encoding_vector)

    y.append(
        list(
            row[
                [
                    "Conservation",
                    "Conformity",
                    "Tradition",
                    "Security",
                    "Self-Transcendance",
                    "Benevolence",
                    "Universalism",
                    "Self-Enhancement",
                    "Power",
                    "Achievement",
                    "Stimulation",
                    "Openness to Change",
                    "Hedonism",
                    "Self-Direction",
                ]
            ]
        )
    )

In [22]:
X = np.array(X)

In [23]:
y = np.array(y)

In [15]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# X_train.shape

In [17]:
# y_train.shape

In [24]:
model = Sequential(
    [   Dense(1024, input_dim=X.shape[1], activation="relu", kernel_regularizer=l2(0.001)),  # Input layer
        Dropout(0.5),  # Dropout for regularization
        Dense(512, activation="relu"),  # Hidden layer
        Dense(y.shape[1], activation="linear"),  # Output layer (14 targets)
    ]
)
model.compile(optimizer=Adam(learning_rate=0.001), loss="mse", metrics=["mse"])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [25]:
history = model.fit(X, y, epochs=500, batch_size=32, verbose=1)

Epoch 1/500
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 136ms/step - loss: 29.7245 - mse: 28.0438
Epoch 2/500
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 192ms/step - loss: 13.0597 - mse: 11.7294
Epoch 3/500
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 118ms/step - loss: 10.9017 - mse: 9.7600
Epoch 4/500
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 121ms/step - loss: 7.3606 - mse: 6.3231
Epoch 5/500
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 116ms/step - loss: 6.0649 - mse: 5.0825
Epoch 6/500
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 118ms/step - loss: 4.9677 - mse: 4.0126
Epoch 7/500
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 121ms/step - loss: 3.7421 - mse: 2.7958
Epoch 8/500
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 118ms/step - loss: 3.3493 - mse: 2.4039
Epoch 9/500
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 118ms/ste

### [8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 6.333333333, 5.0, 6.0]

In [26]:
list(
    dataset[
        [
            "Conservation",
            "Conformity",
            "Tradition",
            "Security",
            "Self-Transcendance",
            "Benevolence",
            "Universalism",
            "Self-Enhancement",
            "Power",
            "Achievement",
            "Stimulation",
            "Openness to Change",
            "Hedonism",
            "Self-Direction",
        ]
    ].values[3]
)

[8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 6.333333333, 5.0, 6.0]

In [24]:
import math

In [27]:
model.save("../svs.keras")

### Load and Use

In [28]:
model = tf.keras.models.load_model('../svs.keras')

In [30]:
testdata = pd.read_excel("../combined.xlsx")

In [28]:
testdata = testdata[~testdata["Conformity"].isnull()]

In [31]:
final_rows = []
columns = list(testdata.columns) + [
    f"predicted_{col}"
    for col in [
        "Conservation",
        "Conformity",
        "Tradition",
        "Security",
        "Self-Transcendance",
        "Benevolence",
        "Universalism",
        "Self-Enhancement",
        "Power",
        "Achievement",
        "Stimulation",
        "Openness to Change",
        "Hedonism",
        "Self-Direction",
    ]
    if col not in ["Messages", "ID"]
]

In [32]:
for index, row in testdata.iterrows():
    input_vector = [0] * len(bag_of_words)
    for token in process_text(row["Message"]):
        if token in bag_of_words:
            index = bag_of_words.index(token)
            input_vector[index] += 1

    output_vector = model.predict(np.array([input_vector]))[0]
    final_rows.append(list(row) + list(output_vector))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 112ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 179ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 

In [33]:
pd.DataFrame(final_rows, columns=columns).to_excel("../review.xlsx", index=False)

### Implementation on unseen data

In [34]:
unseen = pd.DataFrame()
for folder in glob.glob('../CEO_sample_transcripts/*'):
    for file in glob.glob(f"{folder}/*.csv"):
        unseen = pd.concat([unseen,pd.read_csv(file)])

In [35]:
unseen = unseen[~unseen["Detected Speaker"].str.contains("Reporter")]
unseen = unseen[~unseen["Detected Speaker"].str.contains("Speaker")]

In [36]:
unseen["Detected Speaker"].unique()

array(['Charles Janac', 'Chirag Patel', 'Christopher Richard Anzalone',
       'Chris Urmson', 'Clive T. Johnson', 'Dale Schwartz',
       'Damian Scokin', 'Daniel Houston', 'Daniel Mamadou',
       'Daniel P Mcgahn', 'Daniel Shachar', 'Darren Rebelez',
       'David Bruton Smith', 'David_Jin', 'David Jin'], dtype=object)

In [37]:
unseen = unseen.drop(columns=["Speaker"])

In [38]:
unseen_joined = unseen.groupby('Detected Speaker')['Text'].apply(' '.join).reset_index()

In [39]:
unseen_joined.to_excel("dataset_unseen.xlsx",index=False)

In [41]:
model = tf.keras.models.load_model('../svs.keras')

In [43]:
output = []
columns = list(unseen_joined.columns) + [
    f"predicted_{col}"
    for col in [
        "Conservation",
        "Conformity",
        "Tradition",
        "Security",
        "Self-Transcendance",
        "Benevolence",
        "Universalism",
        "Self-Enhancement",
        "Power",
        "Achievement",
        "Stimulation",
        "Openness to Change",
        "Hedonism",
        "Self-Direction",
    ]
]

In [44]:
for index, row in unseen_joined.iterrows():
    input_vector = [0] * len(bag_of_words)
    for token in process_text(row["Text"]):
        if token in bag_of_words:
            index = bag_of_words.index(token)
            input_vector[index] += 1

    output_vector = model.predict(np.array([input_vector]))[0]
    output.append(list(row) + list(output_vector))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65

In [45]:
pd.DataFrame(output,columns=columns).to_excel("../ceo_transcripts_output.xlsx",index=False)