In [1]:
import pandas as pd
import gensim.downloader

pd.set_option('display.max_colwidth', None)
w2v = gensim.downloader.load('word2vec-google-news-300')

In [7]:
df = pd.read_csv("../transcripts/mrs-r-codes.csv")

df["Codes"] = df["Codes"].str.lower()
df["Codes"] = df["Codes"].str.replace(r",\s+", ",", regex=True)
df["Codes"] = df["Codes"].str.replace(r"code [0-9]+: ", "", regex=True)
df["Codes"] = df["Codes"].str.strip(",")


In [8]:
def filter_codes(code: str):
    code_df = df[df["Codes"].str.contains(code)]
    codes = code_df["Codes"].str.split(",")
    codes = list(set(codes.explode("Codes")))

    return codes

def rank_by_similarity(code: str, code_list: list):
    code_dict = {}

    for c in code_list:
        try:
            score = w2v.similarity(code, c)
        except KeyError:
            score = 0

        code_dict[c] = score

    code_ranked = pd.DataFrame(code_dict.items(), columns=["Code", "Similarity Score"])
    code_ranked.sort_values("Similarity Score", ascending=False, inplace=True)

    return code_ranked


In [9]:
spouse_codes = filter_codes("spouse")
spouse_ranked = rank_by_similarity("spouse", spouse_codes)
spouse_ranked[:10]


Unnamed: 0,Code,Similarity Score
0,spouse,1.0
24,marriage,0.449706
7,children,0.342192
16,parenting,0.340154
23,happiness,0.278704
18,home,0.245488
12,contentment,0.238908
27,college,0.208411
5,religion,0.199689
22,childhood,0.182481


In [10]:
food_codes = filter_codes("food")
food_ranked = rank_by_similarity("food", food_codes)
food_ranked[:10]

Unnamed: 0,Code,Similarity Score
6,food,1.0
11,gardening,0.273421
9,outdoors,0.217245
7,travel,0.182939
15,worry,0.160577
14,aging,0.072066
16,ww2,0.062056
5,spouse,0.035875
13,cultural appreciation,0.0
12,adventurous spirit,0.0


In [13]:
social_programs_codes = filter_codes("social programs")
social_programs_ranked = rank_by_similarity("social programs", social_programs_codes)
social_programs_ranked[:10]

Unnamed: 0,Code,Similarity Score
0,post-war america,0
1,the depression,0
2,marriage,0
3,imparting wisdom,0
4,gratitude,0
5,spouse,0
6,social programs,0
7,parenting,0
8,hard times,0
9,food,0


In [14]:
travel_codes = filter_codes("travel")
travel_ranked = rank_by_similarity("travel", travel_codes)
travel_ranked[:10]

Unnamed: 0,Code,Similarity Score
2,travel,1.0
5,food,0.182939
16,freedom,0.158407
6,religion,0.10829
14,teaching,0.105806
19,happiness,0.103377
17,aspirations,0.098571
4,children,0.091953
12,parenting,0.091603
18,childhood,0.089115


In [15]:
social_connections_codes = filter_codes("social connections")
social_connections_ranked = rank_by_similarity("social connections", social_connections_codes)
social_programs_ranked[:10]

Unnamed: 0,Code,Similarity Score
0,post-war america,0
1,the depression,0
2,marriage,0
3,imparting wisdom,0
4,gratitude,0
5,spouse,0
6,social programs,0
7,parenting,0
8,hard times,0
9,food,0
