In [1]:
import pandas as pd
import numpy as np
import gensim
from gensim import corpora
from gensim.parsing.preprocessing import remove_stopwords
import nltk
from nltk.corpus import stopwords
from nltk import pos_tag

In [2]:
# Episode sheets to load from the workbook, in the order they are stacked.
sheets = ['episode1', 'episode2', 'episode3', 'episode4', 'episode5', 'episode6']

# Passing the whole list to read_excel parses data.xlsx once and returns a
# {sheet_name: DataFrame} dict, instead of re-opening the workbook per sheet.
frames_by_sheet = pd.read_excel('data.xlsx', sheet_name=sheets)

# Stack the episodes in the order listed above. Each sheet keeps its own row
# labels (no ignore_index), so labels repeat across episodes.
data = pd.concat([frames_by_sheet[sheet] for sheet in sheets])

In [27]:
# Collapse the per-line dialogue table into one row per unordered speaker pair.
# The result is bound to `df` because the cells below read `df`; without the
# assignment they raise NameError on a fresh kernel.
df = (
    data
    # Squash whitespace runs to a single space and trim the ends. The pattern is
    # a raw string so `\s` is not treated as an (invalid) string escape. A second
    # per-row .strip() that used to follow was a no-op and has been dropped.
    .assign(dialogue=lambda df_: df_.dialogue.str.replace(r'\s+', ' ', regex=True).str.strip())
    # NOTE(review): this counts words per original row, *before* the dialogue is
    # merged per pair below, so the surviving row carries the count of its own
    # line only, not of the merged text. Confirm that this is intended.
    .assign(word_count=lambda df_: df_.dialogue.str.split().str.len())
    # Order-independent pair key: "A-B" regardless of who initiated.
    .assign(sorted_pairs=lambda df_: df_.apply(
        lambda row: '-'.join(sorted([row['initiator'], row['responder']])), axis=1))
    # weight = number of rows exchanged between the pair.
    .assign(weight=lambda df_: df_.groupby(['sorted_pairs'])['sorted_pairs'].transform('count'))
    # Replace every row's dialogue with the pair's distinct lines joined by a space.
    .assign(dialogue=lambda df_: df_.groupby(['sorted_pairs'])['dialogue']
            .transform(lambda lines: ' '.join(lines.unique())))
    # Keep the first row of each pair, then drop the helper key.
    .drop_duplicates('sorted_pairs')
    .drop(columns='sorted_pairs')
)
df

Unnamed: 0,initiator,responder,dialogue,word_count,weight
0,PASSENGER 1,SHANE,You headed home? Yeah We were at the Amanari. ...,105,1
1,OLIVIA,PAULA,"Oh my God, who are these people? So, these two...",119,31
2,NICOLE,OLIVIA,"Hey, girls. What, Mom? Liv, come up front. I t...",20,57
3,ARMOND,LANI,"Here they come. Wave, Lani. There we are. Wave...",199,6
4,BELINDA,NICOLE,"Welcome. Thank you. Of course, yeah.",3,2
...,...,...,...,...,...
55,HOTEL GUEST,TANYA,How’s it going? It’s hot today. What’s good? M...,21,1
64,NICOLE,FAMILY,We are so lucky to be here all together as a f...,12,1
68,JOHN,ARMOND,"John, how are you? Yes. Okay, well– All right,...",45,1
98,SHANE,BARTENDER,"Double tequila. Sure, sounds good. Coming righ...",8,1


In [None]:
import pandas as pd
import gensim
from gensim import corpora
import nltk
# nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords

# Topic-model the merged dialogue of each speaker pair and label every row
# with the top words of its dominant topic.

NUM_TOPICS = 5      # number of LDA topics to fit
TOP_N_WORDS = 15    # words kept per topic label

# create a list of documents from the 'dialogue' column
documents = df['dialogue'].tolist()

# define stop words, tags to remove, and words to remove
stop_words = stopwords.words('english')
# nltk.pos_tag returns Penn Treebank tags ('RB', 'PRP', 'DT', ...) by default,
# which never match the coarse names below. The tagger is therefore called with
# tagset='universal'. NLTK's universal tagset spells three of these differently
# ('CONJ', 'PRT', '.'), so both spellings are listed.
# Requires: nltk.download('universal_tagset')
removal = ['ADV', 'PRON', 'CCONJ', 'CONJ', 'PUNCT', '.', 'PART', 'PRT',
           'DET', 'ADP', 'SPACE', 'NUM', 'SYM']
remove_words = ['like', 'gone', 'know', 'right', 'na', 'gon', 'yeah', 'really', 'okay',
                'get', 'gonna', 'well', 'thank', 'oh', 'uh', 'hi', 'got', 'um', 'go',
                'would', 'great', 'come', 'hey', 'wanna', 'hmm', 'mr', 'yes', 'good']

# One set for constant-time membership tests while filtering tokens.
excluded_tokens = set(stop_words) | set(remove_words)

# remove stop words and unwanted words
tokenized_docs = [
    [token for token in gensim.utils.simple_preprocess(doc) if token not in excluded_tokens]
    for doc in documents
]

# remove tags
tagged_docs = [nltk.pos_tag(doc, tagset='universal') for doc in tokenized_docs]
tokenized_docs = [[token for token, pos in doc if pos not in removal] for doc in tagged_docs]

# create a dictionary from the tokenized documents
dictionary = corpora.Dictionary(tokenized_docs)

# create a bag-of-words representation of the documents
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

# train an LDA model on the corpus
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=dictionary,
                                            num_topics=NUM_TOPICS,
                                            random_state=42,
                                            passes=10)

# Top words of every topic, computed once. get_topic_terms already returns the
# terms ordered by descending probability, so no extra sort is needed.
top_words_by_topic = {
    topic_id: [dictionary[word_id]
               for word_id, _ in lda_model.get_topic_terms(topic_id, topn=TOP_N_WORDS)]
    for topic_id in range(NUM_TOPICS)
}

# label each document with the top words of its most probable topic
topic_labels = []
for bow in corpus:
    doc_topics = lda_model.get_document_topics(bow)
    dominant_topic, _ = max(doc_topics, key=lambda pair: pair[1])
    # Copy so rows sharing a topic do not share one list object.
    topic_labels.append(list(top_words_by_topic[dominant_topic]))

# add the topic labels as a new column in the dataframe
df['topic'] = topic_labels
df

In [None]:
# Peek at a single cell by position: row 64, column 4 (both zero-based).
# NOTE(review): which column sits at position 4 depends on df's column order — confirm.
df.iat[64, 4]

In [None]:
# Characters to build a per-character DataFrame for.
# The dialogue table spells the character 'PAULA'; the filter below is an exact
# match, so the previous 'PAUL' entry could not select her rows.
chosen_names = ['NICOLE', 'TANYA', 'RACHEL', 'SHANE', 'ARMOND', 'PAULA', 'OLIVIA']

# One frame per character: every row where the character is either side of the
# exchange. .copy() makes each frame independent of `df`, so later edits to a
# character frame neither touch `df` nor trigger SettingWithCopyWarning.
character_dfs = {
    name: df[(df['initiator'] == name) | (df['responder'] == name)].copy()
    for name in chosen_names
}

# Publish each frame as a `<NAME>_df` global (e.g. NICOLE_df), because the
# cells below refer to the frames by those names.
for name, character_df in character_dfs.items():
    globals()[f'{name}_df'] = character_df

In [None]:
# Display the rows in which NICOLE is the initiator or the responder.
NICOLE_df

In [None]:
# Display the rows in which TANYA is the initiator or the responder.
TANYA_df