### Make Final Datasets
This notebook creates the final datasets of snippets and contexts by combining the output files from multiple survey responses. The paths to the output files in the data directory that should be row-stacked are defined here, and the combined, de-duplicated tables are written out as the final tab-separated files.

In [5]:
from pathlib import Path
import pandas as pd
import random
import os
import datetime

In [6]:
def rowstack_dataframes(paths, outfile_path):
    """Row-stack several CSV files into one de-duplicated table and save it.

    Each file in ``paths`` is read with ``pd.read_csv`` (comma-separated),
    the frames are concatenated in the order given, rows that duplicate an
    earlier row in every column are dropped, and the index is renumbered
    from 0. The combined table is then written to ``outfile_path`` as a
    tab-separated file without the index column.

    Parameters
    ----------
    paths : iterable of str or path-like
        CSV files to combine. Must contain at least one entry.
    outfile_path : str or path-like
        Destination of the tab-separated output file.

    Returns
    -------
    pandas.DataFrame
        The combined, de-duplicated table that was written to disk.

    Raises
    ------
    ValueError
        If ``paths`` is empty.
    """
    # Materialise once so generators work and emptiness can be checked up front.
    paths = list(paths)
    if not paths:
        # pd.concat would also raise ValueError here, but with an opaque
        # "No objects to concatenate" message that hides the real cause.
        raise ValueError(
            "rowstack_dataframes needs at least one input path; got none."
        )

    # Chain instead of mutating in place: the pipeline reads top-down and
    # never leaves a half-processed frame behind.
    combined = (
        pd.concat([pd.read_csv(path) for path in paths])
        .drop_duplicates()
        .reset_index(drop=True)
    )
    combined.to_csv(outfile_path, index=False, sep="\t")
    return combined

In [7]:
# Assemble the complete gene / SNP / text-snippet dataset from every survey export.
paths = [
    "../data/S301881_snps_and_snippets_09_25_2020_h23m31s22.csv",  # survey 1
    "../data/S195681_snps_and_snippets_10_01_2020_h23m32s48.csv",  # survey 2
]
name = "../genes_snps_snippets.tsv"  # destination of the combined table
df = rowstack_dataframes(paths=paths, outfile_path=name)

In [10]:
# Preview the first rows of the combined snippets table.
df.head()

Unnamed: 0,gene,snp,snippet
0,AANAT,Rs3760138,autism
1,ABCA1,Rs2230808,Alzheimer's disease
2,ABCA1,Rs2230808,polymorphisms and Alzheimer's disease.
3,ABCA1,Rs2230808,schizophrenia and related brain changes.
4,ABCA1,Rs2230808,polymorphism


In [11]:
# (rows, columns) of the combined snippets table after duplicate rows were dropped.
df.shape

(22767, 3)

In [12]:
# Number of distinct genes in the snippets table (a missing value, if any, counts as one).
df["gene"].nunique(dropna=False)

1251

In [13]:
# Assemble the complete gene / SNP / context-sentence dataset from every survey export.
paths = [
    "../data/S301881_snps_and_contexts_09_25_2020_h23m31s22.csv",  # survey 1
    "../data/S195681_snps_and_contexts_10_01_2020_h23m32s48.csv",  # survey 2
]
name = "../genes_snps_contexts.tsv"  # destination of the combined table
df = rowstack_dataframes(paths=paths, outfile_path=name)

In [14]:
# Preview the first rows of the combined contexts table.
df.head()

Unnamed: 0,gene,snp,context
0,AANAT,Rs3760138,Examination of association of genes in the ser...
1,ABCA1,Rs2297404,"rs2297404, rs2230808, and rs2020927 haplotype ..."
2,ABCA1,Rs2230808,A polymorphism of the ABCA1 gene confers susce...
3,ABCA1,Rs2230806,Increase in HDL-C concentration by a dietary p...
4,ABCA1,Rs4149268,G allele is associated with 0.82mg/dl increase...


In [15]:
# (rows, columns) of the combined contexts table after duplicate rows were dropped.
df.shape

(5918, 3)

In [16]:
# Number of distinct genes in the contexts table (a missing value, if any, counts as one).
df["gene"].nunique(dropna=False)

1247