In [0]:
%sh
# Upload the HGMD 2022Q4 text files from the project folder to the notebook's
# S3 prefix. Everything is excluded first, then only names matching
# hg38_HGMD2022Q4*txt are re-included.
SRC_DIR=~/cavatica/projects/yiran/pnoc008-annovar-annotation/
DEST_URI=s3://kf-strides-variant-parquet-prd/notebooks/ad7a1e3b-f732-41c4-be11-f6938f4323e5/
aws s3 cp "$SRC_DIR" "$DEST_URI" \
    --recursive \
    --exclude "*" \
    --include "hg38_HGMD2022Q4*txt"

In [1]:
import org.apache.spark.sql.types.LongType
import org.apache.spark.sql.types.FloatType
import org.apache.spark.sql.types.DoubleType
import org.apache.spark.sql.types.BooleanType

/** Builds the full s3a:// URI of `f` under this notebook's S3 prefix.
  *
  * Defined before `loadCustomTable` so that the helper already exists when the
  * definitions are compiled one statement at a time.
  *
  * @param f path relative to the notebook prefix, e.g. "tables/my_table"
  * @return the absolute s3a:// URI for that path
  */
def myFile(f: String): String =
    s"s3a://kf-strides-variant-parquet-prd/notebooks/ad7a1e3b-f732-41c4-be11-f6938f4323e5/$f"

/** Reads the Parquet data stored under `tables/<tableName>` in the notebook
  * prefix and registers it as a temporary view named `tableName`, replacing any
  * existing view of that name.
  *
  * @param tableName name of the directory under `tables/` and of the resulting view
  */
def loadCustomTable(tableName: String): Unit = {
    spark.read.parquet(myFile(s"tables/$tableName")).createOrReplaceTempView(tableName)
}

In [2]:
// Convert the HGMD 2022Q4 "gene lite" tab-separated file into a Parquet table
// and register it as a temporary view.
//
// To look at the raw input first, run in a %sh paragraph:
//   head ~/cavatica/projects/yiran/pnoc008-annovar-annotation/hg38_HGMD2022Q4_gene_lite.VWB.txt

val geneLiteTable = "hg38_HGMD2022Q4_gene_lite"

val geneLiteRaw = spark.read.option("sep", "\t")
    .option("header", "true")
    .csv(myFile("hg38_HGMD2022Q4_gene_lite.txt"))

// The first input column is split on "_"; element 0 becomes entrez_gene_id
// and element 1 becomes symbol.
val geneLiteIdParts = split($"#EntrezGeneID_GeneSymbol", "_")

val geneLiteParsed = geneLiteRaw
    .withColumnRenamed("Chr", "chromosome")
    // Removes every occurrence of "chr" from the chromosome value.
    .withColumn("chromosome", regexp_replace($"chromosome", "chr", ""))
    .withColumn("start", $"Start".cast(LongType))
    .withColumn("end", $"End".cast(LongType))
    .withColumn("entrez_gene_id", geneLiteIdParts(0))
    .withColumn("symbol", geneLiteIdParts(1))
    // Phenotypes is a comma-separated list; stored as an array column.
    .withColumn("variant_class", split($"Phenotypes", ","))
    .drop("Chr", "Phenotypes", "#EntrezGeneID_GeneSymbol")

// Written as a single output file, replacing any previous version of the table.
geneLiteParsed
    .coalesce(1)
    .write.mode("overwrite")
    .parquet(myFile(s"tables/$geneLiteTable"))

loadCustomTable(geneLiteTable)

// Quick check, in a %sql paragraph:
//   SELECT * FROM hg38_HGMD2022Q4_gene_lite LIMIT 5

In [3]:
// Convert the HGMD 2022Q4 "gene sorted" tab-separated file into a Parquet table
// and register it as a temporary view.
//
// To look at the raw input first, run in a %sh paragraph:
//   head ~/cavatica/projects/yiran/pnoc008-annovar-annotation/hg38_HGMD2022Q4_gene_sorted.VWB.txt

val geneSortedTable = "hg38_HGMD2022Q4_gene_sorted"

val geneSortedRaw = spark.read.option("sep", "\t")
    .option("header", "true")
    .csv(myFile("hg38_HGMD2022Q4_gene_sorted.txt"))

// The first input column is split on "_"; element 0 becomes entrez_gene_id
// and element 1 becomes symbol.
val geneSortedIdParts = split($"#EntrezGeneID_GeneSymbol", "_")

// Phenotypes is split on "~"; each class below is read from a fixed position.
val geneSortedPhenParts = split($"Phenotypes", "~")

// (output column, position within the "~"-separated Phenotypes value,
//  pattern whose first group captures the text inside the square brackets)
val geneSortedClassSpecs = Seq(
    ("DM",  0, "^DM\\[([^\\]]*)?\\]?"),
    ("DM?", 1, "^DM\\?\\[([^\\]]*)?\\]?"),
    ("DP",  2, "^DP\\[([^\\]]*)?\\]?"),
    ("DFP", 3, "^DFP\\[([^\\]]*)?\\]?"),
    ("FP",  4, "^FP\\[([^\\]]*)?\\]?"),
    ("R",   5, "^R\\[([^\\]]*)?\\]?")
)

val geneSortedBase = geneSortedRaw
    .withColumnRenamed("Chr", "chromosome")
    // Removes every occurrence of "chr" from the chromosome value.
    .withColumn("chromosome", regexp_replace($"chromosome", "chr", ""))
    .withColumn("start", $"Start".cast(LongType))
    .withColumn("end", $"End".cast(LongType))
    .withColumn("entrez_gene_id", geneSortedIdParts(0))
    .withColumn("symbol", geneSortedIdParts(1))

// Add one array column per class: the captured bracket contents split on ",".
val geneSortedParsed = geneSortedClassSpecs.foldLeft(geneSortedBase) {
    case (df, (columnName, position, pattern)) =>
        df.withColumn(columnName, split(regexp_extract(geneSortedPhenParts(position), pattern, 1), ","))
}

// Written as a single output file, replacing any previous version of the table.
geneSortedParsed
    .drop("Chr", "Phenotypes", "#EntrezGeneID_GeneSymbol")
    .coalesce(1)
    .write.mode("overwrite")
    .parquet(myFile(s"tables/$geneSortedTable"))

loadCustomTable(geneSortedTable)

// Quick check, in a %sql paragraph:
//   SELECT * FROM hg38_HGMD2022Q4_gene_sorted LIMIT 5

In [4]:
// Convert the HGMD 2022Q4 variant tab-separated file into a Parquet table and
// register it as a temporary view.

val variantTable = "hg38_HGMD2022Q4_variant"

// "." in the input is read as null.
val variantRaw = spark.read.option("sep", "\t").option("header", "true")
    .option("nullValue", ".")
    .csv(myFile("hg38_HGMD2022Q4_variant.txt"))

// Input column name -> output column name.
val variantRenames = Seq(
    "#Chr" -> "chromosome",
    "Ref" -> "reference",
    "Alt" -> "alternate",
    "HGMD2022Q4_ID" -> "id",
    "HGMD2022Q4_CLASS" -> "variant_class",
    "HGMD2022Q4_MUT" -> "mut",
    "HGMD2022Q4_GENE" -> "symbol",
    "HGMD2022Q4_STRAND" -> "strand",
    "HGMD2022Q4_DNA" -> "dna",
    "HGMD2022Q4_PROT" -> "prot",
    "HGMD2022Q4_DB" -> "db",
    "HGMD2022Q4_PHEN" -> "phen"
)

val variantRenamed = variantRenames.foldLeft(variantRaw) {
    case (df, (from, to)) => df.withColumnRenamed(from, to)
}

// NOTE(review): the source columns HGMD2022Q4_RANKSCORE, HGMD2022Q4_END and
// HGMD2022Q4_SVLEN are not dropped, so they remain in the output next to the
// typed copies added below -- confirm this is intended.
val variantTyped = variantRenamed
    // Removes every occurrence of "chr" from the chromosome value.
    .withColumn("chromosome", regexp_replace($"chromosome", "chr", ""))
    .withColumn("start", $"Start".cast(LongType))
    .withColumn("end", $"End".cast(LongType))
    .withColumn("rankscore", $"HGMD2022Q4_RANKSCORE".cast(FloatType))
    .withColumn("variant_end", $"HGMD2022Q4_END".cast(LongType))
    .withColumn("svlen", $"HGMD2022Q4_SVLEN".cast(LongType))

// Written as a single output file, replacing any previous version of the table.
variantTyped
    .coalesce(1)
    .write.mode("overwrite")
    .parquet(myFile(s"tables/$variantTable"))

loadCustomTable(variantTable)

In [5]:
%sh
# Copy each generated Parquet table from S3 into the project folder, staging it
# in a scratch directory under the current working directory.
#
# Stop at the first failing command (so a failed download is never copied on)
# and treat unset variables as errors.
set -eu

S3_TABLES=s3://kf-strides-variant-parquet-prd/notebooks/ad7a1e3b-f732-41c4-be11-f6938f4323e5/tables
DEST_DIR=~/cavatica/projects/yiran/variant-workbench-testing/

for FILE in hg38_HGMD2022Q4_gene_lite hg38_HGMD2022Q4_gene_sorted hg38_HGMD2022Q4_variant; do
    # Start from an empty staging directory so files left by an earlier,
    # interrupted run are not mixed into this copy; -p keeps re-runs from failing.
    rm -rf "./$FILE"
    mkdir -p "./$FILE"
    aws s3 cp --recursive "$S3_TABLES/$FILE" "./$FILE/"
    # NOTE(review): if $DEST_DIR/$FILE already exists, cp -r adds to it rather
    # than replacing it -- confirm whether older files there should be cleared.
    cp -r "./$FILE" "$DEST_DIR"
    # Remove the staging copy from the same location it was created in.
    rm -rf "./$FILE"
done