In [1]:
import pandas as pd
import numpy as np
import json
import random
import datetime
from utils import flatten

In [2]:
# JSON file from parsing the results of the qualtrics survey designed for extracting phenotypes from text.
QUALTRICS_RESULTS_JSON_PATH = "../qualtrics/results_json_files/test_results.json"

# What the survey was built from, the IDs in this refer to unique text strings so that duplicates weren't part of it.
SURVEY_SOURCE_DATA_PATH = "../data/4_binned_and_blocked_texts.csv"

# This is the original data with duplicate text strings because IDs refer to particular SNPs from particular genes.
ORIGINAL_DATA_PATH = "../data/2_snps_and_cleaned_text.csv"

# Where to send the resulting output dataframes as CSV files.
random_six_digit_number = random.randrange(100000,999999)
datetime_str = datetime.datetime.now().strftime('%m_%d_%Y_h%Hm%Ms%S')
key = "{}_{}".format(random_six_digit_number, datetime_str)

RESPONSES_OUTPUT_PATH = "../data/5_processed_survey_responses_{}.csv".format(key)
SNPS_AND_SNIPPETS_OUTPUT_PATH = "../data/6_snps_and_snippets_{}.csv".format(key)

In [4]:
# Make sure the mapping between the unique texts used to create the survey and the original SNP data exists.
original_df = pd.read_csv(ORIGINAL_DATA_PATH)
survey_df = pd.read_csv(SURVEY_SOURCE_DATA_PATH)
text_to_unique_text_id = dict(zip(survey_df.text, survey_df.id))
unique_text_id_to_text = dict(zip(survey_df.id, survey_df.text))
original_df["id"] = original_df["text"].map(text_to_unique_text_id)
original_df.head(20)

Unnamed: 0,gene,snp,text,id
0,AANAT,Rs28936679,"rs28936679, also known as Ala129Thr or A129T (...",1.0
1,AANAT,Rs3760138,Genetic differences in human circadian clock g...,2.0
2,AANAT,Rs4238989,Genetic differences in human circadian clock g...,3.0
3,ABCA1,Rs1800977,The -14C->T polymorphism rs1800977 of the ABCA...,4.0
4,ABCA1,Rs1883025,Apolipoprotein E levels in cerebrospinal fluid...,5.0
5,ABCA1,Rs2020927,"rs2297404, rs2230808, and rs2020927 haplotype ...",6.0
6,ABCA1,Rs2066714,Apolipoprotein E levels in cerebrospinal fluid...,7.0
7,ABCA1,Rs2066715,Apolipoprotein E levels in cerebrospinal fluid...,8.0
8,ABCA1,Rs2230806,"rs2230806, also known as Arg219Lys or R219K, i...",9.0
9,ABCA1,Rs2230808,"rs2297404, rs2230808, and rs2020927 haplotype ...",10.0


In [5]:
# Create a dictionary using the JSON data output from a qualtrics survey.
with open(QUALTRICS_RESULTS_JSON_PATH) as f:
    responses = json.load(f)
responses

[{'response_id': 'R_ylqfW7N5mTbCAdr',
  'recorded_datetime': '2020-09-18 08:51:41',
  'status': 'IP Address',
  'progress': 100,
  'is_finished': 'True',
  'duration': 57,
  'hilights': [{'qid': 5379,
    'selection': 'Von Willebrand disease',
    'selection_index': 10},
   {'qid': 1088, 'selection': 'Cystic Fibrosis', 'selection_index': 0},
   {'qid': 1087, 'selection': 'ystic Fibrosi', 'selection_index': 1},
   {'qid': 4307,
    'selection': 'Hirschsprung disease',
    'selection_index': 156}]},
 {'response_id': 'R_cNnkm3qGUXV4il3',
  'recorded_datetime': '2020-09-18 08:52:30',
  'status': 'IP Address',
  'progress': 100,
  'is_finished': 'True',
  'duration': 40,
  'hilights': []},
 {'response_id': 'R_2ZPwp88WejcgblC',
  'recorded_datetime': '2020-09-18 08:58:38',
  'status': 'IP Address',
  'progress': 100,
  'is_finished': 'True',
  'duration': 132,
  'hilights': []},
 {'response_id': 'R_2EFfqG0VkHP6Eef',
  'recorded_datetime': '2020-09-18 09:01:08',
  'status': 'IP Address',
  'p

In [7]:
# Load the original source CSV that was used in creating the survey. This will used to check against the results.
# We want to make sure there is no discrepency in which IDs are referring to which texts.
source_df = pd.read_csv(SURVEY_SOURCE_DATA_PATH)
#source_df.reset_index(drop=False, inplace=True)

# Question IDs in the actual survey will use the 
#source_df["qid"] = source_df["index"]
#source_df.drop(labels=["index"], axis=1, inplace=True)
#qid_to_source_text = {i:text for i,text in zip(source_df["qid"].values, source_df["text"].values)}
#qid_to_unique_text_id = {qid:i for qid,i in zip(source_df["qid"].values, source_df["id"].values)}
source_df.head(10)

Unnamed: 0,id,text,bin_id,bin_size,block_id,block_size,block_sample
0,1169,smoking,1,988,1,247,5
1,3702,Phenylketonuriars62514952,1,988,1,247,5
2,3701,Phenylketonuriars5030860,1,988,1,247,5
3,3700,Phenylketonuriars5030859,1,988,1,247,5
4,3699,Phenylketonuriars5030856,1,988,1,247,5
5,3698,Phenylketonuriars5030851,1,988,1,247,5
6,3697,Phenylketonuriars5030850,1,988,1,247,5
7,3696,Phenylketonuriars5030847,1,988,1,247,5
8,3695,Phenylketonuriars5030846,1,988,1,247,5
9,3694,Phenylketonuriars5030843,1,988,1,247,5


In [19]:
# Put the responses into a dataframe.
row_tuples = []
for response in responses:
    
    # Metadata associated with this response.
    response_id = response["response_id"]
    recorded_datetime = response["recorded_datetime"]
    status = response["status"]
    progress = response["progress"]
    is_finished = response["is_finished"]
    duration = response["duration"]


    # The actual highlighted text strings from this response.
    for hilight in response["hilights"]:

        # The information about this one particular highlight.
        qid = hilight["qid"]
        hilighted_text = hilight["selection"]
        index_of_first_selected_char = hilight["selection_index"]
        source_text = unique_text_id_to_text[qid]        

        # First check that the question IDs are correct so we know what the source text was for this question.
        # Then additionally make sure the location of the highlight makes sense as well.
        assert hilighted_text in source_text
        #assert source_text[index_of_first_selected_char:index_of_first_selected_char+len(hilighted_text)] == hilighted_text
        indices_match = (source_text[index_of_first_selected_char:index_of_first_selected_char+len(hilighted_text)] == hilighted_text)
        
        
        # Add this as a row.
        row_tuples.append((response_id, recorded_datetime, status, progress, is_finished, duration, qid, hilighted_text, indices_match))

columns = ["response_id", "recorded_datetime", "status", "progress", "is_finished", "duration", "id", "snippet", "idx_match"]
df = pd.DataFrame(row_tuples, columns=columns)
df.to_csv(RESPONSES_OUTPUT_PATH, index=False)
df.head(20)

Unnamed: 0,response_id,recorded_datetime,status,progress,is_finished,duration,id,snippet,idx_match
0,R_ylqfW7N5mTbCAdr,2020-09-18 08:51:41,IP Address,100,True,57,5379,Von Willebrand disease,True
1,R_ylqfW7N5mTbCAdr,2020-09-18 08:51:41,IP Address,100,True,57,1088,Cystic Fibrosis,True
2,R_ylqfW7N5mTbCAdr,2020-09-18 08:51:41,IP Address,100,True,57,1087,ystic Fibrosi,True
3,R_ylqfW7N5mTbCAdr,2020-09-18 08:51:41,IP Address,100,True,57,4307,Hirschsprung disease,True
4,R_2EFfqG0VkHP6Eef,2020-09-18 09:01:08,IP Address,100,True,143,1883,Marfan syndrome,True
5,R_cHBI2Cg42MmRZSN,2020-09-18 09:06:27,IP Address,100,True,211,2164,gwas,True
6,R_cHBI2Cg42MmRZSN,2020-09-18 09:06:27,IP Address,100,True,211,2164,spatial recognition,True
7,R_cHBI2Cg42MmRZSN,2020-09-18 09:06:27,IP Address,100,True,211,916,leprosy,True
8,R_pN3Kc5bjVbkY4j7,2020-09-18 09:13:59,IP Address,100,True,388,1100,Cystic Fibrosis,True
9,R_pN3Kc5bjVbkY4j7,2020-09-18 09:13:59,IP Address,100,True,388,1080,Cystic Fibrosis,True


In [20]:
# Create a mapping from unique text IDs to all of the text snippets that were hilighted in these survey results.
unique_text_id_to_text_list = {}
for unique_text_id,row_indices in df.groupby("id", axis=0).groups.items():
    hilighted_texts_list = list(df.iloc[row_indices]["snippet"].values)
    unique_text_id_to_text_list[unique_text_id] = hilighted_texts_list
print(unique_text_id_to_text_list)

{22: ['later age at onset and shorter disease course', "Alzheimer's disease"], 112: ['diamine oxidase serum activities', 'lower DAO mRNA expressio', 'reduced DAO activity'], 248: [' or W278X)This recessively inherited mutation is considered the most com'], 276: ['hereditary fructosuria'], 304: ['pathogenic for the infantile form of hypophosphatasia'], 385: ['y be related to longevity. One study found that Ashkenazi who lived to '], 450: ['polypoidal choroidal vasculopathy'], 567: ['slightly lower (0.8-0.87x) risk for adult ADHD'], 568: [' testicular cance'], 713: ['breast cancer'], 741: ['pathogenic for breast cancer'], 748: ['breast cancer'], 772: ['pathogenic for breast cancer'], 879: ['pathogenic'], 900: ['rippling muscle disease', 'Novel missense mutation in the caveolin-3 gene in a Belgian family with rippling mu', 'cle disease.'], 916: ['leprosy'], 935: ['recurrent venous thromboembolism'], 948: ["Alzheimer's disease associated"], 986: ['pathogenic rare mutation for hereditary di

In [21]:
# Use that mapping to create a version of the original dataframe with just the hilighted text snippets.
subset_df = original_df[original_df["id"].isin(unique_text_id_to_text_list.keys())]
subset_df["n"] = subset_df["id"].map(lambda x: len(unique_text_id_to_text_list[x]))
text_snippets = flatten([unique_text_id_to_text_list[i] for i in subset_df["id"].values])

# Extend the dataframe to duplicate each row n times where n is the number of text hilight results from the surveys.
modified_df = subset_df.reindex(np.repeat(subset_df.index.values, subset_df["n"]), method="ffill")

# Make sure that the extension occured as expected based on the number of text snippets, and add them as a new column.
assert len(modified_df) == len(text_snippets)
modified_df["snippet"] = text_snippets
modified_df.head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,gene,snp,text,id,n,snippet
25,ABCA7,Rs3764650,"Alzheimer's disease associated, based on large...",22.0,2,later age at onset and shorter disease course
25,ABCA7,Rs3764650,"Alzheimer's disease associated, based on large...",22.0,2,Alzheimer's disease
130,ABP1,Rs2268999,PMID 21488903 Association of single nucleotide...,112.0,3,diamine oxidase serum activities
130,ABP1,Rs2268999,PMID 21488903 Association of single nucleotide...,112.0,3,lower DAO mRNA expressio
130,ABP1,Rs2268999,PMID 21488903 Association of single nucleotide...,112.0,3,reduced DAO activity
273,AIPL1,Rs62637014,aka c.834G>A (p.Trp278Ter or W278X)This recess...,248.0,1,or W278X)This recessively inherited mutation ...
304,ALDOB,I5012663,rs1800546hereditary fructosuria,276.0,1,hereditary fructosuria
332,ALPL,I5002774,"i5002774, also known as c.211C>T or p.R71C, is...",304.0,1,pathogenic for the infantile form of hypophosp...
393,AOC1,Rs2268999,PMID 21488903 Association of single nucleotide...,112.0,3,diamine oxidase serum activities
393,AOC1,Rs2268999,PMID 21488903 Association of single nucleotide...,112.0,3,lower DAO mRNA expressio


In [23]:
# Save that dataframe as a new CSV file with just the final cleaned text snippets for each gene and SNP.
modified_df = modified_df[["gene","snp","snippet"]]
modified_df.to_csv(SNPS_AND_SNIPPETS_OUTPUT_PATH, index=False)
modified_df.tail(20)

Unnamed: 0,gene,snp,snippet
5588,SNAP25,Rs362988,eastern Indian attention deficit hyperactivity...
5609,SNX9,Rs78503206,thinking cilantro tastes like soap
5633,SORL1,Rs1784931,reased (or decreased) risk for Alzheimer's dis...
5650,SOX5,Rs10842262,associated with non-obstructive azoospermia
5655,SPIB,Rs3745516,1.46 times higher odds of primary biliary cirr...
5670,SRD5A2,Rs523349,ovarian cancer
5680,SSUH2,Rs116840785,rippling muscle disease
5680,SSUH2,Rs116840785,Novel missense mutation in the caveolin-3 gene...
5680,SSUH2,Rs116840785,cle disease.
5984,TNNI3,I5007731,familial hypertrophic cardiomyopathy
