# Thematic Analysis

This notebook contains Python code samples for analyzing interview transcript data, 
focusing on themes of trust and reliability.

We will perform various operations including data loading, filtering, sentiment analysis, 
keyword identification, and data aggregation.

In [5]:
import pandas as pd
from IPython.display import display as dp

# Load the annotation CSV into `df`, the DataFrame that the later cells of this notebook read from.
# No index_col is passed, so the CSV's leading unnamed column is loaded as a regular 'Unnamed: 0' column.
file_path = 'data/annotations.csv'  # Update with your file path
df = pd.read_csv(file_path)

# Print a plain-text preview of the first rows of the dataframe
print(df.head())

   Unnamed: 0        speaker                  theme  \
0           0  Robert.Lehman  Purchasing Experience   
1           1  Robert.Lehman   Educational Policies   
2           2  Robert.Lehman      Digital Resources   
3           3  Robert.Lehman      Budget and Timing   
4           4  Robert.Lehman          Buying Habits   

                                             context  sentiment_score brand  \
0  Robert mentions the difficulties of purchasing...             -0.4   NaN   
1  Robert discusses how educational policies infl...             -0.3   NaN   
2  Robert talks about the shift towards digital t...              0.1   NaN   
3  Robert explains the budgeting process within t...              0.0   NaN   
4  Robert explains his decision to spend out of p...              0.2   NaN   

  identified_purchases start_time end_time                    email  ...  \
0                   []      06:04    07:03  robert.lehman@pgcps.org  ...   
1                   []      06:04    07:03

In [14]:
# Load the trust-themes CSV; index_col=0 makes the file's first column the DataFrame index.
trust_themes_df = pd.read_csv('data/trust_themes_anon.csv', index_col=0)
# Bare last expression: the notebook renders the frame as the cell output.
trust_themes_df

Unnamed: 0,theme,context,sentiment_score,brand,identified_purchases,start_time,end_time,ResponseId,FirstName,LastName,...,BioRad Familiarity,BioCorp Familiarity,Amazon Familiarity,Nasco Familiarity,Frey/School Specialty Familiarity,Primary Vendor,Top Vendor Qualities,Years in Eduacation,interview_id,snippet_anon
0,Purchasing Experience,Robert mentions the difficulties of purchasing...,-0.4,,[],06:04,07:03,R_2YLFeZ1mpUDw7L9,Robert,Lehman,...,Aware of (don't use),Aware of (don't use),Aware of (don't use),Aware of (don't use),Never heard of,"Carolina Biological,Flinn Scientific",District approved vendor,4-9 years,23,Interviewer 1 (06:04):\n\nOf range are you usu...
4,Buying Habits,Robert explains his decision to spend out of p...,0.2,,[],03:21,03:44,R_2YLFeZ1mpUDw7L9,Robert,Lehman,...,Aware of (don't use),Aware of (don't use),Aware of (don't use),Aware of (don't use),Never heard of,"Carolina Biological,Flinn Scientific",District approved vendor,4-9 years,23,Educator (03:21):\n\nReason that I chose to do...
5,Vendor Comparison,Flynn has a better selection for the use of Ve...,0.3,Flinn Scientific,[ProductPurchaseDetail(product_name='Vernier s...,16:46,17:18,R_2YLFeZ1mpUDw7L9,Robert,Lehman,...,Aware of (don't use),Aware of (don't use),Aware of (don't use),Aware of (don't use),Never heard of,"Carolina Biological,Flinn Scientific",District approved vendor,4-9 years,23,"Educator (16:46):\n\nFlynn in my opinion, has ..."
6,Carolina Purchases,"Carolina usually provides the kits, like ones ...",0.0,Carolina Biological Supply,[ProductPurchaseDetail(product_name='kits cont...,17:18,18:03,R_2YLFeZ1mpUDw7L9,Robert,Lehman,...,Aware of (don't use),Aware of (don't use),Aware of (don't use),Aware of (don't use),Never heard of,"Carolina Biological,Flinn Scientific",District approved vendor,4-9 years,23,Educator (17:18):\n\nCarolina usually are the ...
7,Product Quality,He prefers real scientific exploration over th...,0.2,,[],13:54,15:24,R_2YLFeZ1mpUDw7L9,Robert,Lehman,...,Aware of (don't use),Aware of (don't use),Aware of (don't use),Aware of (don't use),Never heard of,"Carolina Biological,Flinn Scientific",District approved vendor,4-9 years,23,"Educator (13:54):\n\nWell, the kits, well in p..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,Customer Experience,Has had experiences with kits that were not or...,-0.4,,[],44:29,44:29,R_11bIskMPuh55EKW,Gregory,Ruber,...,Aware of (don't use),Never heard of,Current Vendor,Current Vendor,Never heard of,"Carolina Biological,Amazon","District approved vendor,Free shipping (unlimi...",4-9 years,9,Educator (44:29):\n\nI definitely have ordered...
897,Customer Experience,Desires a website with a collection of accessi...,0.8,,[],47:41,47:41,R_11bIskMPuh55EKW,Gregory,Ruber,...,Aware of (don't use),Never heard of,Current Vendor,Current Vendor,Never heard of,"Carolina Biological,Amazon","District approved vendor,Free shipping (unlimi...",4-9 years,9,Educator (47:41):\n\nGuess it would just be in...
899,Customer Service,Would utilize a safety video included in a kit...,0.6,,[],50:13,50:13,R_11bIskMPuh55EKW,Gregory,Ruber,...,Aware of (don't use),Never heard of,Current Vendor,Current Vendor,Never heard of,"Carolina Biological,Amazon","District approved vendor,Free shipping (unlimi...",4-9 years,9,"Educator (50:13):\n\nSure. I'd use that, espec..."
900,Product Quality,"Prefers diversity of results in experiments, a...",0.5,,[],51:20,51:20,R_11bIskMPuh55EKW,Gregory,Ruber,...,Aware of (don't use),Never heard of,Current Vendor,Current Vendor,Never heard of,"Carolina Biological,Amazon","District approved vendor,Free shipping (unlimi...",4-9 years,9,"Educator (51:20):\n\nI mean, obviously I would..."


In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

# Extracting the snippets for NLP analysis (rows with a missing snippet are dropped)
snippets = trust_themes_df['snippet_anon'].dropna()

# Setting up a CountVectorizer for text processing:
# terms found in more than 95% of the snippets or in fewer than 2 snippets are ignored,
# and scikit-learn's built-in English stop-word list is removed.
# NOTE(review): speaker labels and timestamps (e.g. "Interviewer 1 (06:04):") are still part of
# each snippet here, and "interviewer" shows up in every topic of the output below -- consider
# stripping those lines / extending the stop-word list before vectorizing.
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
transformed_data = vectorizer.fit_transform(snippets)

# Using Latent Dirichlet Allocation for topic modeling
n_topics = 5  # Assuming 5 main topics for initial exploration
# random_state is fixed so that re-running the cell reproduces the same topics.
lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
lda.fit(transformed_data)

# Function to display top words for each topic
def display_topics(model, feature_names, no_top_words):
    """Summarise every topic of a fitted model by its highest-weighted terms.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight vectors that support ``argsort()``.
        feature_names: Indexable collection mapping a column index to its term.
        no_top_words: Number of top-weighted terms to keep for each topic.

    Returns:
        dict: Maps ``"Topic <i>:"`` to a space-separated string of that
        topic's terms, heaviest first.
    """
    summary = {}
    for idx, weights in enumerate(model.components_):
        # argsort() is ascending: reverse it, then keep the leading entries.
        ranked = weights.argsort()[::-1][:no_top_words]
        label = "Topic %d:" % idx
        summary[label] = " ".join(feature_names[j] for j in ranked)
    return summary

# Number of highest-weighted terms to list for each topic.
no_top_words = 10
# Map each fitted LDA topic to its top terms, using the vectorizer's vocabulary for the term names.
topics = display_topics(lda, vectorizer.get_feature_names_out(), no_top_words)

# Bare last expression: the notebook renders the dict as the cell output.
topics


{'Topic 0:': 'like just interviewer know don okay really think going ve',
 'Topic 1:': 'interviewer like just amazon okay carolina think order things don',
 'Topic 2:': 'science interviewer really like lot teacher school ve stuff just',
 'Topic 3:': 'things like interviewer just kind year use going ve okay',
 'Topic 4:': 'just don really interviewer like think school know yeah need'}

### Word Frequency Analysis

Clean up the text and create a word list.

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import re

# NOTE(review): no cell shown in this notebook reads this module-level `text`;
# get_sentences() and get_words() below build their own text from df['snippet'].
text = ''.join(df['snippet'])
def remove_lines_with_timestamps(text):
    """Drop speaker and timestamp lines from a transcript string.

    A line is removed (together with its trailing newline, if any) when it
    contains the name "Daylene" or "Kimberly", or a parenthesised two-digit
    timestamp such as "(16:46)". The names are matched case-sensitively.

    Args:
        text: Transcript text, possibly spanning several lines.

    Returns:
        str: The remaining text with leading/trailing whitespace stripped.
    """
    line_patterns = (
        # Lines mentioning Daylene or Kimberly
        r'.*?(Daylene|Kimberly).*\n?',
        # Remaining lines that carry a "(mm:ss)"-style timestamp
        r'.*?\(\d{2}:\d{2}\).*\n?',
    )
    cleaned = text
    for pattern in line_patterns:
        cleaned = re.compile(pattern, re.MULTILINE).sub('', cleaned)
    return cleaned.strip()

def _snippet_text(df):
    """Join the non-null ``snippet`` values of ``df`` and drop speaker/timestamp lines.

    Snippets are joined with a newline rather than an empty string: otherwise
    the last line of one snippet fuses with the speaker/timestamp header that
    starts the next one, and ``remove_lines_with_timestamps`` would discard
    that line of speech together with the header. Missing snippets are
    skipped so that ``str.join`` never receives a NaN.
    """
    snippets = df['snippet'].dropna().astype(str)
    # Case is left untouched here: the speaker-name filter is case-sensitive.
    return remove_lines_with_timestamps('\n'.join(snippets))


def _normalize(text):
    """Lower-case ``text`` and strip commas, single letters, digits and punctuation."""
    text = text.lower()
    # Remove commas
    text = text.replace(',', '')
    # Remove single characters that stand between whitespace
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Replace every run of non-word characters (punctuation etc.) with a space
    text = re.sub(r'\W+', ' ', text)
    # Collapse extra whitespace
    return re.sub(r'\s+', ' ', text).strip()


def get_sentences(df):
    """Return the cleaned sentences found in ``df['snippet']``.

    Sentences are split first, while punctuation and capitalisation are still
    present for the sentence tokenizer to use; each sentence is normalised
    afterwards. Splitting after the punctuation has been removed would leave
    the tokenizer with no sentence boundaries to find.

    Args:
        df: DataFrame with a ``snippet`` column of transcript text.

    Returns:
        list[str]: One normalised string per sentence; sentences that are
        empty after cleaning are omitted.
    """
    cleaned = (_normalize(sentence) for sentence in sent_tokenize(_snippet_text(df)))
    return [sentence for sentence in cleaned if sentence]


def get_words(df):
    """Return the cleaned, stop-word-filtered word tokens of ``df['snippet']``.

    Args:
        df: DataFrame with a ``snippet`` column of transcript text.

    Returns:
        list[str]: Lower-cased tokens with NLTK's English stop words removed.
    """
    tokens = word_tokenize(_normalize(_snippet_text(df)))
    # A set gives constant-time membership checks for the filter below.
    stop_words = set(stopwords.words('english'))
    return [token for token in tokens if token not in stop_words]


# Stop-word-filtered tokens of all snippets; the frequency-distribution cell below reads `words`.
words = get_words(df)
sentences = get_sentences(df)
# Preview only the first five entries rather than the whole list.
sentences[:5]

In [None]:
# Frequency distribution
from nltk.probability import FreqDist

# Create the frequency distribution of the remaining tokens
all_tokens = words
fdist = FreqDist(all_tokens)

# FreqDist.N() is the total number of tokens counted;
# FreqDist.B() is the number of distinct tokens (bins).
total_tokens = fdist.N()
unique_tokens = fdist.B()

fdist.plot(10)
fdist.tabulate(10)
print(f"Number of unique tokens: {unique_tokens}")
# The distribution has no notion of documents; the rows of df are the documents it was built from.
print(f"Number of documents: {len(df)}")
print(f"Total number of tokens: {total_tokens}")
if total_tokens:
    # Type-token ratio: distinct tokens divided by all tokens.
    print(f"Lexical diversity: {unique_tokens / total_tokens}")
    print(f"Mean occurrences per unique token: {total_tokens / unique_tokens}")
else:
    # Avoid a ZeroDivisionError when every token was filtered out.
    print("No tokens available; ratio statistics skipped.")



In [None]:
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures


def get_bigrams_from_df(df, keyword='trust'):
    """Rank the bigrams that contain ``keyword`` by pointwise mutual information.

    Args:
        df: DataFrame accepted by ``get_words``.
        keyword: Word that must be one of the two tokens of a bigram for the
            bigram to be kept; compared case-insensitively. Defaults to
            ``'trust'``.

    Returns:
        list: ``((word1, word2), score)`` pairs ordered from the highest to
        the lowest PMI score.
    """
    keyword = keyword.lower()
    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(get_words(df))

    # apply_ngram_filter discards every bigram for which the predicate is true,
    # so only bigrams that include the keyword remain.
    finder.apply_ngram_filter(lambda w1, w2: keyword not in (w1.lower(), w2.lower()))

    # Score the bigrams by PMI (not raw frequency). score_ngrams already
    # returns the pairs sorted from highest to lowest score.
    return finder.score_ngrams(bigram_measures.pmi)

import plotly.express as px

bigrams = get_bigrams_from_df(df)

# TODO: plot the word and strength of association using nltk and plotly

# A cell renders only its final expression, so the preview has to come last;
# show just the strongest associations instead of the full list.
bigrams[:10]




In [None]:
# One row per scored bigram: the association score plus the two tokens of the pair.
# Building from explicit records with named columns also gives a correctly shaped,
# empty frame when no bigram was found (indexing a positional column 0 would raise KeyError).
bigram_df = pd.DataFrame(
    [(score, first, second) for (first, second), score in bigrams],
    columns=['strength', 'word1', 'word2'],
)

# create a column with just the associated word, not trust
bigram_df['word'] = bigram_df['word1'].where(bigram_df['word1'] != 'trust', bigram_df['word2'])

bigram_df.head()

In [None]:
# display as word cloud

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# display word cloud with relative importance taken from the counts in `fdist`
wordcloud = WordCloud().generate_from_frequencies(fdist)

fig, ax = plt.subplots()
ax.imshow(wordcloud, interpolation='bilinear')
# Pixel-coordinate ticks carry no meaning for a word cloud.
ax.axis('off')
ax.set_title('Most frequent words')
plt.show()


In [None]:
# Mean sentiment score for each speaker.
average_sentiment_by_speaker = (
    df.groupby(by='speaker')['sentiment_score']
      .agg('mean')
)

# Show the per-speaker averages as plain text
print(average_sentiment_by_speaker)