In [1]:
import pandas as pd
import numpy as np
import json
import random
import datetime
from utils import flatten

In [2]:
# JSON file from parsing the results of the qualtrics survey designed for extracting phenotypes from text.
QUALTRICS_RESULTS_JSON_PATH = "../qualtrics/results_json_files/some_new_results.json"

# What the survey was built from, the IDs in this refer to unique text strings so that duplicates weren't part of it.
SURVEY_SOURCE_DATA_PATH = "../data/4_binned_and_blocked_texts.csv"

# This is the original data with duplicate text strings because IDs refer to particular SNPs from particular genes.
ORIGINAL_DATA_PATH = "../data/2_snps_and_cleaned_text.csv"

# Where to send the resulting output dataframes as CSV files.
random_six_digit_number = random.randrange(100000,999999)
datetime_str = datetime.datetime.now().strftime('%m_%d_%Y_h%Hm%Ms%S')
key = "{}_{}".format(random_six_digit_number, datetime_str)

RESPONSES_OUTPUT_PATH = "../data/5_processed_survey_responses_{}.csv".format(key)
SNPS_AND_SNIPPETS_OUTPUT_PATH = "../data/6_snps_and_snippets_{}.csv".format(key)

In [3]:
# Make sure the mapping between the unique texts used to create the survey and the original SNP data exists.
original_df = pd.read_csv(ORIGINAL_DATA_PATH)
survey_df = pd.read_csv(SURVEY_SOURCE_DATA_PATH)
text_to_unique_text_id = dict(zip(survey_df.text, survey_df.id))
unique_text_id_to_text = dict(zip(survey_df.id, survey_df.text))
original_df["id"] = original_df["text"].map(text_to_unique_text_id)
original_df.head(20)

Unnamed: 0,gene,snp,text,id
0,AANAT,Rs28936679,"rs28936679, also known as Ala129Thr or A129T (...",1.0
1,AANAT,Rs3760138,Genetic differences in human circadian clock g...,2.0
2,AANAT,Rs4238989,Genetic differences in human circadian clock g...,3.0
3,ABCA1,Rs1800977,The -14C->T polymorphism rs1800977 of the ABCA...,4.0
4,ABCA1,Rs1883025,Apolipoprotein E levels in cerebrospinal fluid...,5.0
5,ABCA1,Rs2020927,"rs2297404, rs2230808, and rs2020927 haplotype ...",6.0
6,ABCA1,Rs2066714,Apolipoprotein E levels in cerebrospinal fluid...,7.0
7,ABCA1,Rs2066715,Apolipoprotein E levels in cerebrospinal fluid...,8.0
8,ABCA1,Rs2230806,"rs2230806, also known as Arg219Lys or R219K, i...",9.0
9,ABCA1,Rs2230808,"rs2297404, rs2230808, and rs2020927 haplotype ...",10.0


In [10]:
# Create a dictionary using the JSON data output from a qualtrics survey.
with open(QUALTRICS_RESULTS_JSON_PATH) as f:
    responses = json.load(f)
responses

[{'response_id': 'R_1IvdZOg9fDRD6vW',
  'recorded_datetime': '2020-09-16 13:13:20',
  'status': 'Survey Preview',
  'progress': 100,
  'is_finished': 'True',
  'duration': 18,
  'hilights': [{'qid': 244, 'selection': '131240.0003', 'selection_index': 10},
   {'qid': 314, 'selection': '23andMe name', 'selection_index': 0},
   {'qid': 548, 'selection': 'glomerulosclerosis', 'selection_index': 16},
   {'qid': 926, 'selection': 'Niemann-Pick Disease', 'selection_index': 35},
   {'qid': 1080, 'selection': '9Ala)23andMe nam', 'selection_index': 17}]},
 {'response_id': 'R_SJNBFBJwcA8l6VP',
  'recorded_datetime': '2020-09-17 11:02:58',
  'status': 'Survey Preview',
  'progress': 100,
  'is_finished': 'True',
  'duration': 84,
  'hilights': [{'qid': 3695,
    'selection': 'ylketonuriars50',
    'selection_index': 4},
   {'qid': 3902, 'selection': 'PKHD1', 'selection_index': 15},
   {'qid': 3227, 'selection': 'c.271dupA', 'selection_index': 4},
   {'qid': 1397, 'selection': 'hyperplasiars1513445

In [5]:
# Load the original source CSV that was used in creating the survey. This will used to check against the results.
# We want to make sure there is no discrepency in which IDs are referring to which texts.
source_df = pd.read_csv(SURVEY_SOURCE_DATA_PATH)
#source_df.reset_index(drop=False, inplace=True)

# Question IDs in the actual survey will use the 
#source_df["qid"] = source_df["index"]
#source_df.drop(labels=["index"], axis=1, inplace=True)
#qid_to_source_text = {i:text for i,text in zip(source_df["qid"].values, source_df["text"].values)}
#qid_to_unique_text_id = {qid:i for qid,i in zip(source_df["qid"].values, source_df["id"].values)}
source_df.head(10)

Unnamed: 0,id,text,bin_id,bin_size,block_id,block_size,block_sample
0,3710,Phenylketonuriars118203925,1,989,1,248,5
1,3711,Phenylketonuriars62516095,1,989,1,248,5
2,3712,Phenylketonuriars62508588,1,989,1,248,5
3,1304,schizophrenia|,1,989,1,248,5
4,1169,smoking,1,989,1,248,5
5,3708,Phenylketonuriars62516092,1,989,1,248,5
6,3517,gs252,1,989,1,248,5
7,3639,freckling,1,989,1,248,5
8,3642,colorblindness,1,989,1,248,5
9,540,c.348-9_351del,1,989,1,248,5


In [6]:
# Put the responses into a dataframe.
row_tuples = []
for response in responses:
    
    # Metadata associated with this response.
    response_id = response["response_id"]
    recorded_datetime = response["recorded_datetime"]
    status = response["status"]
    progress = response["progress"]
    is_finished = response["is_finished"]
    duration = response["duration"]
    
    # The actual highlighted text strings from this response.
    for hilight in response["hilights"]:
        
        # The information about this one particular highlight.
        qid = hilight["qid"]
        hilighted_text = hilight["selection"]
        index_of_first_selected_char = hilight["selection_index"]
        source_text = unique_text_id_to_text[qid]        
        
        # First check that the question IDs are correct so we know what the source text was for this question.
        # Then additionally make sure the location of the highlight makes sense as well.
        assert hilighted_text in source_text
        assert source_text[index_of_first_selected_char:index_of_first_selected_char+len(hilighted_text)] == hilighted_text
    
        # Add this as a row.
        row_tuples.append((response_id, recorded_datetime, status, progress, is_finished, duration, qid, hilighted_text))
        
columns = ["response_id", "recorded_datetime", "status", "progress", "is_finished", "duration", "id", "snippet"]
df = pd.DataFrame(row_tuples, columns=columns)
df.to_csv(RESPONSES_OUTPUT_PATH, index=False)
df.head(20)

Unnamed: 0,response_id,recorded_datetime,status,progress,is_finished,duration,id,snippet
0,R_SJNBFBJwcA8l6VP,2020-09-17 11:02:58,Survey Preview,100,True,84,3695,ylketonuriars50
1,R_SJNBFBJwcA8l6VP,2020-09-17 11:02:58,Survey Preview,100,True,84,3902,PKHD1
2,R_SJNBFBJwcA8l6VP,2020-09-17 11:02:58,Survey Preview,100,True,84,3227,c.271dupA
3,R_SJNBFBJwcA8l6VP,2020-09-17 11:02:58,Survey Preview,100,True,84,1397,hyperplasiars151344504
4,R_SJNBFBJwcA8l6VP,2020-09-17 11:02:58,Survey Preview,100,True,84,1111,Cystic Fibrosisrs
5,R_SJNBFBJwcA8l6VP,2020-09-17 11:02:58,Survey Preview,100,True,84,3352,homocystinuria
6,R_SJNBFBJwcA8l6VP,2020-09-17 11:02:58,Survey Preview,100,True,84,3352,Smoking and having low folate level
7,R_SJNBFBJwcA8l6VP,2020-09-17 11:02:58,Survey Preview,100,True,84,3352,migraine
8,R_SJNBFBJwcA8l6VP,2020-09-17 11:02:58,Survey Preview,100,True,84,3352,cardiovascular disease
9,R_SJNBFBJwcA8l6VP,2020-09-17 11:02:58,Survey Preview,100,True,84,3352,Genetic impairments in folate enzymes


In [7]:
# Create a mapping from unique text IDs to all of the text snippets that were hilighted in these survey results.
unique_text_id_to_text_list = {}
for unique_text_id,row_indices in df.groupby("id", axis=0).groups.items():
    hilighted_texts_list = list(df.iloc[row_indices]["snippet"].values)
    unique_text_id_to_text_list[unique_text_id] = hilighted_texts_list
print(unique_text_id_to_text_list)

{1111: ['Cystic Fibrosisrs'], 1397: ['hyperplasiars151344504'], 3227: ['c.271dupA'], 3352: ['homocystinuria', 'Smoking and having low folate level', 'migraine', 'cardiovascular disease', 'Genetic impairments in folate enzymes'], 3695: ['ylketonuriars50'], 3902: ['PKHD1']}


In [8]:
# Use that mapping to create a version of the original dataframe with just the hilighted text snippets.
subset_df = original_df[original_df["id"].isin(unique_text_id_to_text_list.keys())]
subset_df["n"] = subset_df["id"].map(lambda x: len(unique_text_id_to_text_list[x]))
text_snippets = flatten([unique_text_id_to_text_list[i] for i in subset_df["id"].values])

# Extend the dataframe to duplicate each row n times where n is the number of text hilight results from the surveys.
modified_df = subset_df.reindex(np.repeat(subset_df.index.values, subset_df["n"]), method="ffill")

# Make sure that the extension occured as expected based on the number of text snippets, and add them as a new column.
assert len(modified_df) == len(text_snippets)
modified_df["snippet"] = text_snippets
modified_df.head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,gene,snp,text,id,n,snippet
1199,CFTR,I5006053,Cystic Fibrosisrs121908757,1111.0,1,Cystic Fibrosisrs
1517,CYP21A2,I5005431,Congenital adrenal hyperplasiars151344504,1397.0,1,hyperplasiars151344504
3821,MMACHC,Rs398124292,aka c.271dupA,3227.0,1,c.271dupA
3995,MTHFR,Rs1801133,rs1801133 is a SNP that is relatively common a...,3352.0,5,homocystinuria
3995,MTHFR,Rs1801133,rs1801133 is a SNP that is relatively common a...,3352.0,5,Smoking and having low folate level
3995,MTHFR,Rs1801133,rs1801133 is a SNP that is relatively common a...,3352.0,5,migraine
3995,MTHFR,Rs1801133,rs1801133 is a SNP that is relatively common a...,3352.0,5,cardiovascular disease
3995,MTHFR,Rs1801133,rs1801133 is a SNP that is relatively common a...,3352.0,5,Genetic impairments in folate enzymes
4398,PAH,I3003398,Phenylketonuriars5030846,3695.0,1,ylketonuriars50
4618,PKHD1,I5000041,rs137852949see PKHD1,3902.0,1,PKHD1


In [9]:
# Save that dataframe as a new CSV file with just the final cleaned text snippets for each gene and SNP.
modified_df = modified_df[["gene","snp","snippet"]]
modified_df.to_csv(SNPS_AND_SNIPPETS_OUTPUT_PATH, index=False)
modified_df.head(20)

Unnamed: 0,gene,snp,snippet
1199,CFTR,I5006053,Cystic Fibrosisrs
1517,CYP21A2,I5005431,hyperplasiars151344504
3821,MMACHC,Rs398124292,c.271dupA
3995,MTHFR,Rs1801133,homocystinuria
3995,MTHFR,Rs1801133,Smoking and having low folate level
3995,MTHFR,Rs1801133,migraine
3995,MTHFR,Rs1801133,cardiovascular disease
3995,MTHFR,Rs1801133,Genetic impairments in folate enzymes
4398,PAH,I3003398,ylketonuriars50
4618,PKHD1,I5000041,PKHD1
