In [1]:
import pandas as pd
import seaborn as sn
import math
import re
import os
from nltk import word_tokenize

In [2]:
# Every file this notebook touches sits in the data directory one level up.
_DATA_DIR = os.path.join("..", "data")

# Input: the file this notebook reads.
INPUT_PATH = os.path.join(_DATA_DIR, "2_snps_and_cleaned_text.csv")

# Outputs: the files this notebook writes.
ORIGINAL_DATA_WITH_IDS_PATH = os.path.join(_DATA_DIR, "3_snps_and_cleaned_text_with_ids.csv")
IDS_AND_UNIQUE_TEXTS_PATH = os.path.join(_DATA_DIR, "4_ids_and_unique_texts.csv")

### Preparing mappings between text datasets
The purpose of this section is to load the dataset of cleaned texts and then create a new dataset that contains only the texts we actually want to send to a survey. This means getting rid of texts with very short strings that we know won't get any annotations, and also getting rid of duplicate texts: any annotations that apply to one copy of a text would apply to all of its copies, so we can eliminate this redundancy on the survey side. In creating this new dataset, it is important to retain a mapping back to the original gene and SNPs that each text originates from, so that we can carry the annotations resulting from the surveys all the way back to the biological data we want to associate them with.

In [3]:
# Read the cleaned-text dataset from disk and preview its first ten rows.
original_df = pd.read_csv(filepath_or_buffer=INPUT_PATH)
original_df.head(n=10)

Unnamed: 0,gene,snp,text
0,AANAT,Rs28936679,"rs28936679, also known as Ala129Thr or A129T (..."
1,AANAT,Rs3760138,Genetic differences in human circadian clock g...
2,AANAT,Rs4238989,Genetic differences in human circadian clock g...
3,ABCA1,Rs1800977,The -14C->T polymorphism rs1800977 of the ABCA...
4,ABCA1,Rs1883025,Apolipoprotein E levels in cerebrospinal fluid...
5,ABCA1,Rs2020927,"rs2297404, rs2230808, and rs2020927 haplotype ..."
6,ABCA1,Rs2066714,Apolipoprotein E levels in cerebrospinal fluid...
7,ABCA1,Rs2066715,Apolipoprotein E levels in cerebrospinal fluid...
8,ABCA1,Rs2230806,"rs2230806, also known as Arg219Lys or R219K, i..."
9,ABCA1,Rs2230808,"rs2297404, rs2230808, and rs2020927 haplotype ..."


In [4]:
# Show the (rows, columns) dimensions of the loaded data for reference.
original_df.shape

(6535, 3)

In [5]:
# Build the table of distinct texts. Asterisks in each text are first swapped for spaces (this overwrites the
# "text" column of original_df), then every distinct text gets an integer ID counting up from 1 in order of
# first appearance. The resulting table is written to disk and previewed.
original_df["text"] = original_df["text"].map(lambda text: text.replace("*", " "))
unique_texts = original_df["text"].unique()
df = pd.DataFrame({"id": range(1, len(unique_texts) + 1), "text": unique_texts})
df.to_csv(IDS_AND_UNIQUE_TEXTS_PATH, index=False)
df.head(10)

Unnamed: 0,id,text
0,1,"rs28936679, also known as Ala129Thr or A129T (..."
1,2,Genetic differences in human circadian clock g...
2,3,Genetic differences in human circadian clock g...
3,4,The -14C->T polymorphism rs1800977 of the ABCA...
4,5,Apolipoprotein E levels in cerebrospinal fluid...
5,6,"rs2297404, rs2230808, and rs2020927 haplotype ..."
6,7,Apolipoprotein E levels in cerebrospinal fluid...
7,8,Apolipoprotein E levels in cerebrospinal fluid...
8,9,"rs2230806, also known as Arg219Lys or R219K, i..."
9,10,"rs2297404, rs2230808, and rs2020927 haplotype ..."


In [6]:
# Attach the unique-text ID to every row of a copy of the original data, so each gene/SNP row records which
# unique text it corresponds to.
# TODO (carried over from the original note): add a step here that saves the mapping between IDs and row indices
# in the 1_snps_and_scraped_text so we can get back to it later. That mapping will then get updated in the fourth
# notebook that builds the final dataset files.
print(original_df.shape)
print(df.shape)
original_df_with_ids = original_df.copy(deep=True)
# Build the text -> ID lookup a single time. Constructing it inside the lambda rebuilt the whole dictionary for
# every row, making this step quadratic in the size of the data.
text_to_id = dict(zip(df["text"].values, df["id"].values))
# Index the dictionary directly so a text without an ID raises KeyError instead of silently becoming NaN.
original_df_with_ids["id"] = original_df_with_ids["text"].map(lambda text: text_to_id[text])
original_df_with_ids.head(10)

(6535, 3)
(5451, 2)


Unnamed: 0,gene,snp,text,id
0,AANAT,Rs28936679,"rs28936679, also known as Ala129Thr or A129T (...",1
1,AANAT,Rs3760138,Genetic differences in human circadian clock g...,2
2,AANAT,Rs4238989,Genetic differences in human circadian clock g...,3
3,ABCA1,Rs1800977,The -14C->T polymorphism rs1800977 of the ABCA...,4
4,ABCA1,Rs1883025,Apolipoprotein E levels in cerebrospinal fluid...,5
5,ABCA1,Rs2020927,"rs2297404, rs2230808, and rs2020927 haplotype ...",6
6,ABCA1,Rs2066714,Apolipoprotein E levels in cerebrospinal fluid...,7
7,ABCA1,Rs2066715,Apolipoprotein E levels in cerebrospinal fluid...,8
8,ABCA1,Rs2230806,"rs2230806, also known as Arg219Lys or R219K, i...",9
9,ABCA1,Rs2230808,"rs2297404, rs2230808, and rs2020927 haplotype ...",10


In [7]:
# Sanity check before saving: the IDs attached to the copy of the original data must contain exactly as many
# distinct values as the ID column of the unique-text table. Once that holds, write out the cleaned texts for
# each gene and SNP together with the IDs that refer to the unique texts.
ids_in_original = original_df_with_ids["id"].unique()
ids_in_unique_texts = df["id"].unique()
assert len(ids_in_original) == len(ids_in_unique_texts)
original_df_with_ids.to_csv(ORIGINAL_DATA_WITH_IDS_PATH, index=False)
original_df.shape

(6535, 3)