# Talk Like IDEO: similarity scores
- Look at which articles in our dataset use language similar to IDEO's
- 1. Create a single document that combines all IDEO Journal articles
- 2. Early experimentation with gensim dictionaries, BOW models, TF-IDF, and Doc2Vec models
- 3. Create scatter plots of cosine similarity to the IDEO reference and to the "good leads" reference



## NOTE: 
### This was an analysis path we chose not to pursue further.
### A lot of these functions have been improved in subsequent notebooks (especially notebook #9)!




In [None]:
# Jupyter magic
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
# imports
import pandas as pd
import numpy as np

import pathlib

import altair as alt
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import gensim
from gensim.corpora.textcorpus import TextCorpus
from gensim.parsing.preprocessing import STOPWORDS


from sklearn.metrics.pairwise import cosine_similarity

# Load Stopwords

In [None]:
def load_stopwords(stopwords_path = "../1_data/stopwords/stopwords.txt"):
    """Return gensim's built-in stopwords merged with a project word list.

    The file at ``stopwords_path`` is read in full and split on whitespace;
    each resulting word is added to gensim's ``STOPWORDS`` set.
    """
    with open(stopwords_path, "r") as handle:
        extra_words = set(handle.read().split())
    return STOPWORDS | extra_words


# Module-level stopword set; it is bound as the default argument of preprocess().
stopwords = load_stopwords()
print(f"{len(stopwords)} stopwords")


## Load IDEO docs


In [None]:
def read_dir_text_files(data_dir, verbose=True):
    """Read every file that sits directly inside ``data_dir``.

    Sub-directories and bookkeeping files (``.DS_Store``, ``index.txt``,
    ``index.csv``, ``index``) are skipped. Entries are visited in sorted
    path order so the returned lists are reproducible across runs and file
    systems; later cells pair documents with index rows and Doc2Vec tags
    purely by list position.

    Args:
        data_dir: Directory to scan (not recursive).
        verbose: When true, print how many documents were read.

    Returns:
        A ``(files, texts)`` tuple of parallel lists: each file name with
        everything from the first ``.txt`` onwards removed, and the
        corresponding file contents.
    """
    skiplist = {'.DS_Store', "index.txt", "index.csv", "index"}

    files = []
    texts = []
    # glob() yields entries in arbitrary file-system order, hence sorted().
    for article_path in sorted(pathlib.Path(data_dir).glob('*')):
        if article_path.is_dir():
            continue
        fname = article_path.name.split('.txt')[0]
        if fname in skiplist:
            continue
        files.append(fname)
        texts.append(article_path.read_text())
    if verbose:
        print(f"{len(texts)} docs found in {data_dir}")
    return files, texts

def preprocess(text, stopwords=stopwords):
    """Turn raw ``text`` into a list of tokens with stopwords removed.

    Tokenization is delegated to ``gensim.utils.simple_preprocess`` with
    ``deacc=True`` and ``min_len=3``; any resulting token found in
    ``stopwords`` is then dropped. The default ``stopwords`` is the
    module-level set as it existed when this function was defined.
    """
    candidate_tokens = gensim.utils.simple_preprocess(text, deacc=True, min_len=3)
    return [token for token in candidate_tokens if token not in stopwords]


# Directory holding the IDEO Journal text files
ideo_path = '../1_data/IDEO_journal'

# Read all the files: parallel lists of file names and raw texts
ideo_titles, ideo_docs = read_dir_text_files(ideo_path)

# Combine into one big text block so the whole journal can be treated
# as a single reference document
ideo_journal_single_big_text = "\n".join(ideo_docs)

# Preprocess the combined text into tokens (stopwords removed)
ideo_doc_tokens = preprocess(ideo_journal_single_big_text)


## Load Good_Leads docs



In [None]:
# Directory holding the "good lead" article text files
gl_path = '../1_data/good_lead_articles/'

# Read all the files: parallel lists of file names and raw texts
gl_files, gl_docs = read_dir_text_files(gl_path)

# Combine into one big text block so the good leads can be treated
# as a single reference document
gl_single_big_text = "\n".join(gl_docs)

# Preprocess the combined text into tokens; the last line displays the token count
gl_tokens = preprocess(gl_single_big_text)
len(gl_tokens)

## Load & Preprocess all the documents

So far this just reads each document's text and tokenizes it.

It also saves the titles and directories of the docs (for later).

In [None]:
def add_files_from_dir(paths, verbose=True):
    """Read the text files of several directories into flat, parallel lists.

    Args:
        paths: Iterable of directory paths; each is passed to
            ``read_dir_text_files``.
        verbose: Forwarded to ``read_dir_text_files``.

    Returns:
        A ``(all_files, all_docs, all_paths)`` tuple of equal-length lists:
        file names, document texts, and for every document the name of the
        directory it came from.
    """
    all_files, all_docs, all_paths = [], [], []
    for p in paths:
        files, docs = read_dir_text_files(p, verbose=verbose)
        all_files.extend(files)
        all_docs.extend(docs)
        # PurePath(...).name is the final path component with or without a
        # trailing slash; splitting on '/' and taking [-2] only works when
        # the trailing slash is present.
        group = pathlib.PurePath(p).name
        # repeat the directory name once per document read from it
        all_paths.extend([group] * len(docs))

    return all_files, all_docs, all_paths


# grab all the files from these directories and make some tokens
# Directories whose documents make up the corpus that gets scored
input_paths = [
        "../1_data/how_i_built_this/",
#         "../1_data/example_articles/",
        "../1_data/NYT_corner_office/",
        "../1_data/good_lead_articles/",
        "../1_data/current_transformational_client_articles/",
        ]

# Parallel lists: file name, raw text and source directory name per document
files, docs, paths = add_files_from_dir(input_paths)
# One token list per corpus document, in the same order as `docs`
tokens = [preprocess(doc) for doc in docs]
# Per-document token lists for the IDEO articles, then the good-lead
# articles, then the corpus documents
extended_tokens = ([preprocess(doc) for doc in ideo_docs] + 
                   [preprocess(doc) for doc in gl_docs] + 
                   tokens)

# Create an index 
It's going to be a dataframe that stores:
- additional information about a document - interviewee, title, source, year, etc. 
- similarity scores
- file paths (temporary link)


In [None]:
# str(df_index[df_index['year'].isna()].iloc[0])

In [None]:
def assemble_index_files(input_paths):
    """Concatenate the ``index.csv`` files of several directories.

    Directories without an ``index.csv`` are reported and skipped instead
    of crashing the subsequent ``read_csv`` call.

    Args:
        input_paths: Iterable of directory paths that may contain an
            ``index.csv`` with at least a ``filename`` column.

    Returns:
        One DataFrame with the rows of every index found, in input order.
        ``filename`` values are cut at the first ``.txt`` and the
        ``Unnamed: 0`` column (a saved row index) is removed when present.

    Raises:
        FileNotFoundError: If none of the directories contains an
            ``index.csv``.
    """
    indexes = []
    for p in input_paths:
        index_path = pathlib.Path(p) / "index.csv"
        if not index_path.exists():
            print(f"index.csv does not exist in {p}")
            continue

        df_index_part = pd.read_csv(index_path)
        print(p, len(df_index_part))
        indexes.append(df_index_part)

    if not indexes:
        raise FileNotFoundError("no index.csv found in any of the input paths")

    df_index = pd.concat(indexes, axis=0)
    # Normalize to the same extension-less names read_dir_text_files produces
    df_index['filename'] = [str(f).split('.txt')[0] for f in df_index['filename']]
    # errors='ignore': not every index.csv was necessarily saved with its row index
    return df_index.drop(columns=['Unnamed: 0'], errors='ignore')


def check_filename_index_conflicts(df_index):
    """Report filenames listed more than once and keep only their last row.

    Args:
        df_index: DataFrame with a ``filename`` column.

    Returns:
        ``df_index`` itself when every filename is unique; otherwise a
        de-duplicated copy that keeps the last row of each repeated filename.
    """
    counts = df_index['filename'].value_counts()
    repeated = counts[counts > 1]
    if repeated.empty:
        return df_index

    # reporting
    print(f'{len(repeated)} filenames with overlapping index')
    for fname, n_refs in repeated.items():
        print(f"{fname}: {n_refs} references")

    print('\nremoving duplicates from index')
    return df_index.drop_duplicates(subset=['filename'], keep="last")
            

# Load all the index files into one metadata table
df_index_a = assemble_index_files(input_paths)

# Missing years become 0 so the column can be cast to int
df_index_a['year'] = df_index_a['year'].fillna(0).astype(int)

# Keep the original title as 'headline' and build a new display title
# of the form "<name> (<year>)"
new_titles = []
df_index_a.rename(columns={'title': 'headline'}, inplace=True)
for i, row in df_index_a.iterrows():
    name = row['name']
    year = row['year']
    new_titles.append(f"{name} ({year})")
df_index_a['title'] = new_titles

# df_index_a['year'] = df_index_a['year'].astype(int)
# Watch out! Duplicates!
# The "How I Built This" index is messy: a couple of episodes feature the same
# people, so their files overwrite each other.
# Quick fix: remove the duplicates from the index for now. The longer-term fix
# would be to give those files slightly different names.


df_index_a = check_filename_index_conflicts(df_index_a)
df_index_a.head()



## get index to reflect the order of document, text, token lists


In [None]:
# This is a temporary df used to create a match between:
# 1) the names of the files
# 2) the texts (entries of the docs and tokens lists)
# The order in which documents are loaded might not reflect the order in df_index_a,
# so this frame records the load order: one row per document with its
# filename and source directory ('group').
df_index_b = pd.DataFrame([files, paths], index=['filename', 'group']).T
df_index_b.head()

In [None]:
# Check for duplicate filenames: shows every row whose filename already
# appeared in an earlier row of df_index_b
df_index_b[df_index_b['filename'].duplicated()]

### Check overlaps

In [None]:
# Compare the filenames loaded from disk (df_index_b) with the filenames
# listed in the index.csv files (df_index_a).
indexed_names = set(df_index_a['filename'])
loaded_names = set(df_index_b['filename'])

files_not_in_indexed = list(loaded_names - indexed_names)
index_without_files = list(indexed_names - loaded_names)
# The overlap is the set of filenames present in BOTH sources, i.e. the
# intersection; a union would count every filename seen anywhere.
overlap = list(indexed_names & loaded_names)


print(f'{len(files_not_in_indexed)} files not referenced by an index.csv')
print(f'{len(index_without_files)} indexes missing a file')
print()
print(f'{len(overlap)} overlaps')

In [None]:
# i = pd.read_csv('../1_data/current_transformational_client_articles/index.csv')
# i
# Show the index rows whose file was not found on disk
df_index_a.set_index('filename').loc[index_without_files]



In [None]:
# Loaded files that no index.csv references, ordered by source directory
missing_index = df_index_b.set_index('filename').loc[files_not_in_indexed].sort_values(by='group')
# missing_index[missing_index['dir'] == "current_transformational_client_articles"]
missing_index

In [None]:
# Index rows whose file was not found on disk (same lookup as an earlier cell)
df_index_a.set_index('filename').loc[index_without_files]

### Merge to create final index

In [None]:
# 
# 

In [None]:
# df_index = df_index_b.set_index("filename").join(df_index_a.set_index('filename'), how='left')

# Both frames are keyed by filename for the join
a = df_index_a.set_index('filename')
b = df_index_b.set_index("filename")

# Left join on the loaded files (b): every loaded document keeps its row,
# and index metadata (a) is attached where the filename matches.
df_index = b.join(a, how='left')
# Display the three shapes side by side for comparison
df_index.shape, df_index_b.shape, df_index_a.shape

In [None]:
# Show any rows of df_index_b whose filename repeats an earlier row
df_index_b[df_index_b['filename'].duplicated()]
# df_index_b[df_index_b['filename'] == 'marvin_chiefexecutive_2018']

### spotcheck

In [None]:
# Walk the merged index and the load order side by side; any position where
# the filenames differ is printed as an error.
index_order = df_index.index.to_list()
file_order = df_index_b['filename'].to_list()
for i, filename1 in enumerate(index_order):
    filename2 = file_order[i]
    if filename1 == filename2:
        continue
    print('ERROR', i, filename1, filename2)


In [None]:
import random

def check_doc_index_match():
    """Print one random index entry next to the document at the same position.

    Picks a random label from ``df_index``, prints its position and label,
    the ``name`` stored for it, a separator, and then ``docs[position]`` so
    the two can be compared by eye.
    """
    candidates = list(df_index.index)
    (picked,) = random.sample(candidates, 1)
    position = df_index.index.get_loc(picked)
    print(position, picked)
    print(df_index.loc[picked, 'name'])
    print('---')
    print(docs[position])


check_doc_index_match()

## Make gensim dictionary

gensim object that lets us do doc2bow functions.

Keeps track of tokens + ids for each token.



In [None]:
# Could combine
# Build the token <-> id mapping from two "documents": the combined IDEO
# tokens and the combined good-leads tokens.
full_dictionary = gensim.corpora.Dictionary([ideo_doc_tokens, gl_tokens])
# Right now this is just the IDEO words and the Good Leads words...

# add all the tokens from the documents?
# full_dictionary.add_documents(tokens)

# Display the vocabulary size and the object type
len(full_dictionary), type(full_dictionary)

## Make Corpus object & Similarity Model

The corpus is a list where each element is a BOW (a list of word frequencies).

Term Frequency – Inverse Document Frequency (TF-IDF) is also a bag-of-words model, but unlike the regular corpus, TF-IDF down-weights tokens (words) that appear frequently across documents.

In [None]:
# Create bags of words for the IDEO reference, the good-leads reference
# and every corpus document, all using the same dictionary
ideo_bow = full_dictionary.doc2bow(ideo_doc_tokens)
gl_bow = full_dictionary.doc2bow(gl_tokens)
corpus = [full_dictionary.doc2bow(token) for token in tokens]

# Put the two references at the front of the corpus and record their positions
full_corpus = [ideo_bow, gl_bow] + corpus
ref_names = ['ideo', 'gl']
ref_indicies= [0, 1]


In [None]:
def similarity_to_refs(corpus, ref_indicies=None, num_features=None):
    """Score every corpus document against each reference document.

    A ``gensim.similarities.Similarity`` index is built over ``corpus`` and
    queried once per reference document.

    Args:
        corpus: Indexable corpus of sparse vectors that also contains the
            reference documents.
        ref_indicies: Positions of the reference documents inside
            ``corpus``; any sequence of ints. Defaults to ``[0, 1]``.
        num_features: Number of features passed to the similarity index.
            Defaults to ``len(full_dictionary)`` (module-level dictionary).

    Returns:
        A ``(sim_to_corpus, sim_between_refs)`` tuple of lists with one
        entry per reference: the similarities to all non-reference
        documents, and the similarities to the reference documents.
    """
    # Copy into a list: avoids a shared mutable default and keeps the fancy
    # indexing below correct when a tuple is passed.
    ref_indicies = [0, 1] if ref_indicies is None else list(ref_indicies)
    if num_features is None:
        num_features = len(full_dictionary)

    # NOTE(review): output_prefix points into 'workdir/' — confirm that the
    # directory exists before running.
    sims = gensim.similarities.Similarity(output_prefix='workdir/',
                                          corpus=corpus,
                                          num_features=num_features)

    sim_to_corpus = []
    sim_between_refs = []
    for i in ref_indicies:
        similarity_to_refrence = sims[corpus[i]]
        # Sanity check: a document must be (almost) identical to itself
        assert similarity_to_refrence[i] > 0.99
        # Drop the reference positions so the result lines up with the
        # non-reference documents only
        sim_to_corpus.append(np.delete(similarity_to_refrence, ref_indicies))
        sim_between_refs.append(similarity_to_refrence[ref_indicies])

    return sim_to_corpus, sim_between_refs
   
    
    
    
    
    
# Bag-of-words similarity of every document to each reference
sim_name = 'bow'
new_cols = [f"sim_{sim_name}_{ref_name}" for ref_name in ref_names]
sims, ref_sims = similarity_to_refs(corpus=full_corpus, ref_indicies=ref_indicies)
for col, sim in zip(new_cols, sims):
    df_index[col] = sim

    # Histogram of the scores for THIS reference, so the data matches the
    # column name used as the title
    fig, ax = plt.subplots()
    sns.histplot(sim, ax=ax)
    ax.set_title(col)
    plt.show()

# Scatter of the two reference similarities against each other
fig, ax = plt.subplots()
ax.plot(sims[0], sims[1], '.')
plt.show()

In [None]:
# 
# Fit TF-IDF weights on the full corpus (references included) and re-weight it
tf_idf = gensim.models.TfidfModel(full_corpus)

sim_name = 'tfidf'
transformed_corpus = tf_idf[full_corpus]

new_cols = [f"sim_{sim_name}_{ref_name}" for ref_name in ref_names]
sims, ref_sims = similarity_to_refs(corpus=transformed_corpus, ref_indicies=ref_indicies)
for col, sim in zip(new_cols, sims):
    df_index[col] = sim

    # Histogram of the scores for THIS reference, so the data matches the
    # column name used as the title
    fig, ax = plt.subplots()
    sns.histplot(sim, ax=ax)
    ax.set_title(col)
    plt.show()

# Scatter of the two reference similarities against each other
fig, ax = plt.subplots()
ax.plot(sims[0], sims[1], '.')
ax.set_xlabel('sim to ideo')
ax.set_ylabel('sim to leads')
plt.savefig('similarity comparison.png')
plt.show()

In [None]:
def create_interactive_scatter(df, x, y, savename=None):
    """Build an interactive Altair scatter plot with a clickable group legend.

    Args:
        df: DataFrame to plot. Besides the ``x`` and ``y`` columns it must
            provide ``group``, ``url``, ``title`` and ``headline``.
        x: Encoding (column name) for the x axis.
        y: Encoding (column name) for the y axis.
        savename: Optional output path; when given, the chart is also saved
            there.

    Returns:
        The scatter plot and the legend chart, concatenated side by side.
    """
    color_col = 'group'

    selection = alt.selection_multi(fields=[color_col])

    # Selected groups keep their color, everything else is grayed out
    color = alt.condition(selection,
                          alt.Color(f'{color_col}:N', legend=None),
                          alt.value('lightgray'))

    # Chart the frame that was passed in, not the module-level df_index
    scatter = alt.Chart(df).mark_circle(size=60).encode(
        x=x,
        y=y,
        color=color,
        href='url',
        tooltip=["title", 'headline', 'group', 'url']
    ).interactive()

    # A second chart acts as the legend and carries the selection
    legend = alt.Chart(df).mark_point().encode(
        y=alt.Y(f'{color_col}:N', axis=alt.Axis(orient='right')),
        color=color
    ).add_selection(
        selection
    )

    chart = scatter | legend
    if savename is not None:
        chart.save(savename)

    return chart


create_interactive_scatter(df=df_index, x='sim_tfidf_ideo', y='sim_tfidf_gl', 
                           savename='tfidf.html')

In [None]:
# List the columns accumulated on df_index so far
df_index.columns

## Actually pretty bad correlation between similarity scores

In [None]:
# fig, ax = plt.subplots()
# ax.plot(df_index['similarity_bow'], df_index['similarity_tfidf'], '.')
# # ax.set_xlim(0, 1)
# # ax.set_ylim(0,1)
# ax.set_ylabel('tf idf')
# ax.set_xlabel('bow word counts')
# ax.set_title('correlation between similarities')
# plt.show()

# Mess with Doc2Vec

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from scipy.spatial.distance import cosine

In [None]:
# One TaggedDocument per corpus document; the tag is the document's position
# in `tokens`, stored as a string
tagged_data = [TaggedDocument(words=t, tags=[str(i)]) for i, t in enumerate(tokens)]
len(tagged_data)

In [None]:
# Train a Doc2Vec model on the tagged corpus documents and save it to disk.
# NOTE(review): `size=` and `model.iter` look like gensim 3.x names (gensim 4
# uses `vector_size` / `epochs`) — confirm the installed version before changing.
max_epochs = 100
vec_size = 20
alpha = 0.025

# dm=0 — presumably selects the PV-DBOW training algorithm; verify against the gensim docs
model = Doc2Vec(size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =0)
  
model.build_vocab(tagged_data)

# NOTE(review): every pass through this loop calls train() for `model.iter`
# epochs, so the data is seen max_epochs * model.iter times in total. The
# learning rate is managed by hand below; a single train() call with an
# explicit epoch count may be what was intended — confirm before reuse.
for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("d2v_dm0_short.model")
print("Model Saved")

In [None]:
def similarity(doc_vectors, ref_vector):
    """Cosine similarity between ``ref_vector`` and each document vector.

    Returns one score per entry of the module-level ``tokens`` list, in the
    same order; ``doc_vectors[i]`` is looked up by integer position.
    """
    # scipy's `cosine` is a distance, so 1 - distance is the similarity
    return [1 - cosine(ref_vector, doc_vectors[i]) for i in range(len(tokens))]



# model_file = "d2v_short.model"
# model_file = "d2v_long.model"
model_file = "d2v_dm0_short.model"
# model_file = "d2v_dm0_long.model"

# Base name of the model file, reused for the saved chart
sname = model_file.split(".")[0]

model = Doc2Vec.load(model_file)

# Embed the two reference documents with the trained model
ideo_vector = model.infer_vector(ideo_doc_tokens)
gl_vector = model.infer_vector(gl_tokens)

y_name = 'sim_doc2vec_ideo'
x_name = 'sim_doc2vec_gl'

# Each column must hold the similarity to the reference it is named after:
# the "_gl" column is scored against the good-leads vector and the "_ideo"
# column against the IDEO vector.
df_index[x_name] = similarity(doc_vectors=model.docvecs, ref_vector=gl_vector)
df_index[y_name] = similarity(doc_vectors=model.docvecs, ref_vector=ideo_vector)


create_interactive_scatter(df=df_index, x=x_name, y=y_name, 
                           savename=f'{sname}.html')

In [None]:
# Character count vs. token count of the first document
len(docs[0]), len(tokens[0])

In [None]:
# Compare the two similarity measures against the IDEO reference:
# Doc2Vec on the x axis, TF-IDF on the y axis
x_name = 'sim_doc2vec_ideo'
y_name = 'sim_tfidf_ideo'

# Base name of the saved HTML chart
sname = 'ideo_similarity_diff'

create_interactive_scatter(df=df_index, x=x_name, y=y_name, 
                           savename=f'{sname}.html')

In [None]:
# Compare the two similarity measures against the good-leads reference:
# Doc2Vec on the x axis, TF-IDF on the y axis
x_name = 'sim_doc2vec_gl'
y_name = 'sim_tfidf_gl'

# Base name of the saved HTML chart
sname = 'gl_similarity_diff'

create_interactive_scatter(df=df_index, x=x_name, y=y_name, 
                           savename=f'{sname}.html')

In [None]:
# Per-document size statistics, in the same order as `docs` / `tokens`:
# raw character count, token count, and number of distinct tokens.
char_counts = [len(doc) for doc in docs]
token_counts = [len(tok) for tok in tokens]
unique_tokens = [len(set(tok)) for tok in tokens]

df_index['char_count'] = char_counts
df_index['token_counts'] = token_counts
df_index['unique_token'] = unique_tokens


In [None]:
# Restrict the plot to the good-lead articles. `sub_df` has to be defined
# here: the plot below reads it and nothing else in the notebook creates it.
sub_df = df_index[df_index['group'] == 'good_lead_articles']

# fig, ax = plt.subplots()
# ax.plot(sub_df['char_count'], sub_df['sim_tfidf_gl'], '.')
# plt.show()

# Document length (in tokens) against TF-IDF similarity to the good-leads reference
fig, ax = plt.subplots()
ax.plot(sub_df['token_counts'], sub_df['sim_tfidf_gl'], '.')
ax.set_xlabel('token_counts')
ax.set_ylabel('sim_tfidf_gl')
plt.show()

# fig, ax = plt.subplots()
# ax.plot(sub_df['token_counts'], sub_df['char_count'], '.')
# plt.show()

# Viz Customization Code
## TODO: add to python scripts

In [None]:
# from https://github.com/altair-viz/altair/issues/1422

import altair as alt
import pandas as pd

# HTML page that embeds two independent Vega-Lite charts.
# The {placeholders} are filled in by str.format() below: the three library
# versions select the CDN scripts, spec1/spec2 are the charts' JSON specs.
two_charts_template = """
<!DOCTYPE html>
<html>
<head>
  <script src="https://cdn.jsdelivr.net/npm/vega@{vega_version}"></script>
  <script src="https://cdn.jsdelivr.net/npm/vega-lite@{vegalite_version}"></script>
  <script src="https://cdn.jsdelivr.net/npm/vega-embed@{vegaembed_version}"></script>
</head>
<body>

<div id="vis1"></div>
<div id="vis2"></div>

<script type="text/javascript">
  vegaEmbed('#vis1', {spec1}).catch(console.error);
  vegaEmbed('#vis2', {spec2}).catch(console.error);
</script>
</body>
</html>
"""


# Small demo frame: five points on the diagonal
df = pd.DataFrame({'x': range(5), 'y': range(5)})

# The same data drawn once as points and once as a line
chart1 = alt.Chart(df).mark_point().encode(x='x', y='y')
chart2 = alt.Chart(df).mark_line().encode(x='x', y='y')

# Render the template with the Altair-reported library versions and the two
# chart specs, and write the page to charts.html
with open('charts.html', 'w') as f:
    f.write(two_charts_template.format(
        vega_version=alt.VEGA_VERSION,
        vegalite_version=alt.VEGALITE_VERSION,
        vegaembed_version=alt.VEGAEMBED_VERSION,
        spec1=chart1.to_json(indent=None),
        spec2=chart2.to_json(indent=None),
    ))