#### Imports

In [1]:
import pandas as pd
import spacy

# Load the spaCy pipeline once for the whole notebook; top_posts() reuses this
# `nlp` object to parse both the user's query and every post.
# NOTE(review): the 'en_core_web_lg' model package presumably has to be
# installed separately from spaCy itself — confirm for the target environment.
nlp = spacy.load('en_core_web_lg')

In [2]:
import warnings
warnings.filterwarnings('ignore')

#### Reading in Data

In [3]:
# Raw scrape: only the `url` column is kept. top_posts() later looks URLs up in
# this frame by row label.
original_df = pd.read_csv('./data/raw_combined_posts.csv')[['url']]

# DataFrame.shape is (rows, columns), which matches the two placeholders.
print('Dataframe of raw data has {} rows and {} columns.'.format(*original_df.shape))
original_df.head()

Dataframe of raw data has 1403 rows and 1 columns.


Unnamed: 0,url
0,https://www.reddit.com/r/QuantifiedSelf/commen...
1,https://www.reddit.com/r/QuantifiedSelf/commen...
2,https://matiroy.com/writings/How-I-audio-video...
3,https://matiroy.com/writings/Should-I-record-m...
4,https://www.reddit.com/r/QuantifiedSelf/commen...


In [4]:
# Per-column count of missing values.
print("Checking for null values...\n{}\n".format(original_df.isna().sum()))
# Count of rows whose url is exactly the empty string.
print('Checking how many empty string values in url column of dataframe...\n{}'.format(original_df['url'].eq("").sum()))

Checking for null values...
url    0
dtype: int64

Checking how many empty string values in url column of dataframe...
0


In [5]:
# Pre-cleaned post text. top_posts() pairs each row of this frame with the row
# of `original_df` that has the same row number, so the two files are assumed
# to be in the same order.
df = pd.read_csv('./data/cleaned_text.csv')

# DataFrame.shape is (rows, columns), which matches the two placeholders.
print('Dataframe has {} rows and {} columns.'.format(*df.shape))
df.head()

Dataframe has 1403 rows and 1 columns.


Unnamed: 0,text
0,try find Qualia Mind sleep Oura ring mind shar...
1,food app ability retrieve nutritional info tex...
2,audiovideo record life capture bit obtrusive t...
3,major privacy concern little actual benefit pr...
4,introductory stat know study design affect con...


In [6]:
# Per-column count of missing values.
print("Checking for null values...\n{}\n".format(df.isna().sum()))
# Count of rows whose text is exactly the empty string.
print('Checking how many empty string values in text column of dataframe...\n{}'.format(df['text'].eq("").sum()))

Checking for null values...
text    4
dtype: int64

Checking how many empty string values in text column of dataframe...
0


In [7]:
# Replace missing text with a fixed placeholder word so every row is a string
# that nlp() can parse.
# NOTE(review): presumably chosen as a token that will not resemble any real
# query, so these rows never rank — confirm it scores 0 with the loaded model.
df = df.fillna(value="supercalifragilisticexpialidocious")

# Re-run the same two checks to confirm nothing is missing any more.
print("Checking for null values...\n{}\n".format(df.isna().sum()))
print('Checking how many empty string values in text column of dataframe...\n{}'.format(df['text'].eq("").sum()))

Checking for null values...
text    0
dtype: int64

Checking how many empty string values in text column of dataframe...
0


In [8]:
def top_posts(df, query=None, top_n=5, urls=None):
    """Print the URLs of the posts whose text is most similar to a query.

    Each value in ``df['text']`` is parsed with the module-level spaCy
    pipeline ``nlp`` and compared to the query with ``Doc.similarity``.
    Posts with a similarity that is not strictly positive are discarded;
    the rest are ranked from most to least similar.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'text'`` column of strings (no missing values).
    query : str, optional
        What to search for. When omitted, the user is prompted with
        ``input()``, which is the original interactive behaviour. Pass a
        string to make the cell reproducible under Restart & Run All.
    top_n : int, optional
        Maximum number of recommendations to print (default 5).
    urls : pandas.Series, optional
        URLs indexed by the same row labels as ``df``. Defaults to the
        notebook-level ``original_df['url']``.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    if query is None:
        query = input("Describe what you are interested in quantifying. ")
    if urls is None:
        urls = original_df['url']

    # A blank query has nothing to compare against; skip parsing every post.
    if not query.strip():
        print('\nNothing to search for: the description is empty.')
        return

    print('\nSearching {} posts...\n'.format(df.shape[0]))

    main_doc = nlp(query)
    texts = df['text']

    # Walk the frame's real row labels instead of assuming they are 0..n-1,
    # so a filtered or re-indexed frame still pairs each text with its own
    # URL. nlp.pipe parses the texts as a stream and yields Docs in input
    # order, so zipping with the index keeps labels and Docs aligned.
    scored = []
    for label, doc in zip(texts.index, nlp.pipe(texts)):
        score = main_doc.similarity(doc)
        if score > 0:
            scored.append((score, label))

    if not scored:
        print('No similar posts found.')
        return

    # Tuples sort by score first, then by label, highest first.
    scored.sort(reverse=True)

    for score, label in scored[:top_n]:
        print("Recommended url:\n", urls.loc[label], "\nSimilarity Score: ", score)

In [11]:
# Interactive demo: top_posts() prompts for a free-text description and prints
# the best-matching URLs, so this cell blocks until the user answers.
top_posts(df)

Describe what you are interested in quantifying. sleep tracking

Searching 1403 posts...

Recommended url:
 http://myquantifiedbrain.com/ 
Similarity Score:  0.8278694614280998
Recommended url:
 http://www.mobihealthnews.com/content/dont-sleep-it-time-now-digital-tools-improve-sleep 
Similarity Score:  0.8101235994360837
Recommended url:
 https://www.reddit.com/r/QuantifiedSelf/comments/acn7pr/any_recommendations_for_sleep_trackers_that_allow/ 
Similarity Score:  0.7768527479302243
Recommended url:
 https://www.reddit.com/r/QuantifiedSelf/comments/26cnw8/if_i_want_to_track_sleep_should_i_go_for_a_psg_or/ 
Similarity Score:  0.7683254834695593
Recommended url:
 https://hello.is/ 
Similarity Score:  0.7660957715228757
