In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import mean, min, max, when, split, col
from pyspark.sql.functions import concat, col, lit
import pyspark.sql.functions as F


### For spark section

In [2]:
# Initialize SparkSession
spark = SparkSession.builder.appName("txt to DataFrame").getOrCreate()

# Load CSV file into DataFrame
df = spark.read.csv("/homes/zhe/Desktop/programming/p5/exam/dbNSFP4.9a.txt.gz.SMALL",sep='\t', header=True, inferSchema=True)

# Show the DataFrame content
df.show()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/14 09:31:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/11/14 09:31:25 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+----+------------+---+---+-----+-----+------------+--------+-----------------+--------+-----------------+-----+--------+---------------+--------------------+-----------------+-----------+----------------+-------------+-------------+------------+------------+---------+---------+------+-------------+---+-------------+----------+--------+--------+----------------+----------------+---------------+--------+------------------+---------------------+----------+------------------------+---------+------------+--------------------------+-----------+--------------------+------------------------+-------------------+--------------------+------------------------+-------------------+---------+-----------------------+--------+---------+--------------------+----------------------------------+-------------------+--------------------+------------------+----------------------+--------------------------+---------------------+------------+--------------------------+-----------+-------------+----------------

### What are average (mean), minimum and maximum score for the column SIFT_score?

In [3]:
from pyspark.sql.functions import mean, min, max, when, split, col

# Replace '.' with null
df = df.withColumn("SIFT_score", when(col("SIFT_score") == ".", None).otherwise(col("SIFT_score")))

# Split multiple scores and take the first one
df = df.withColumn("SIFT_score", split(col("SIFT_score"), ";").getItem(0))

# Convert to double
df = df.withColumn("SIFT_score", col("SIFT_score").cast("double"))

# Recalculate statistics
sift_mean = df.select(mean("SIFT_score")).collect()[0][0]
sift_min = df.select(min("SIFT_score")).collect()[0][0]
sift_max = df.select(max("SIFT_score")).collect()[0][0]

print(f"Mean SIFT score: {sift_mean}")
print(f"Minimum SIFT score: {sift_min}")
print(f"Maximum SIFT score: {sift_max}")



                                                                                

Mean SIFT score: 0.09752028439983244
Minimum SIFT score: 0.0
Maximum SIFT score: 1.0


### Merge together the values from hg19_chr and hg19_pos(1-based) columns into a new column of strings called hg19_chr_pos with the format chr_pos and add it to your dataframe. Remove the original hg19_chr and hg19_pos(1-based) columns. [10 pts]

In [None]:
from pyspark.sql.functions import concat, col, lit

# Step 1 & 2: Merge values and create new column
df1 = df.withColumn("hg19_chr_pos", 
                   concat(col("hg19_pos(1-based)").cast("string"), lit("_"),col("hg19_chr") ))

# Step 3: Remove original columns
columns_to_drop = ["hg19_chr", "hg19_pos(1-based)"]
df1 = df1.drop(*columns_to_drop)

# Verify the changes
df1.select("hg19_chr_pos").show(5)
print(df1.columns)


+------------+
|hg19_chr_pos|
+------------+
|     65565_1|
|     65565_1|
|     65565_1|
|     65566_1|
|     65566_1|
+------------+
only showing top 5 rows

['#chr', 'pos(1-based)', 'ref', 'alt', 'aaref', 'aaalt', 'rs_dbSNP', 'hg18_chr', 'hg18_pos(1-based)', 'aapos', 'genename', 'Ensembl_geneid', 'Ensembl_transcriptid', 'Ensembl_proteinid', 'Uniprot_acc', 'Uniprot_entry', 'HGVSc_ANNOVAR', 'HGVSp_ANNOVAR', 'HGVSc_snpEff', 'HGVSp_snpEff', 'HGVSc_VEP', 'HGVSp_VEP', 'APPRIS', 'GENCODE_basic', 'TSL', 'VEP_canonical', 'cds_strand', 'refcodon', 'codonpos', 'codon_degeneracy', 'Ancestral_allele', 'AltaiNeandertal', 'Denisova', 'VindijiaNeandertal', 'ChagyrskayaNeandertal', 'SIFT_score', 'SIFT_converted_rankscore', 'SIFT_pred', 'SIFT4G_score', 'SIFT4G_converted_rankscore', 'SIFT4G_pred', 'Polyphen2_HDIV_score', 'Polyphen2_HDIV_rankscore', 'Polyphen2_HDIV_pred', 'Polyphen2_HVAR_score', 'Polyphen2_HVAR_rankscore', 'Polyphen2_HVAR_pred', 'LRT_score', 'LRT_converted_rankscore', 'LRT_pred', 'LRT_

#### For which value of "codonpos" are the most effects predicted across all SNPs in the file? Does this make biological sense, argue why or why not. [15 pts]

In [5]:
pred_columns = [col for col in df.columns if col.endswith('_pred')]

print(pred_columns)
print(f"Number of prediction columns: {len(pred_columns)}")

df_pred = df.select(['codonpos']  + pred_columns)
df_pred.show(40)

['SIFT_pred', 'SIFT4G_pred', 'Polyphen2_HDIV_pred', 'Polyphen2_HVAR_pred', 'LRT_pred', 'MutationTaster_pred', 'MutationAssessor_pred', 'FATHMM_pred', 'PROVEAN_pred', 'MetaSVM_pred', 'MetaLR_pred', 'MetaRNN_pred', 'M-CAP_pred', 'PrimateAI_pred', 'DEOGEN2_pred', 'BayesDel_addAF_pred', 'BayesDel_noAF_pred', 'ClinPred_pred', 'LIST-S2_pred', 'ESM1b_pred', 'EVE_Class10_pred', 'EVE_Class20_pred', 'EVE_Class25_pred', 'EVE_Class30_pred', 'EVE_Class40_pred', 'EVE_Class50_pred', 'EVE_Class60_pred', 'EVE_Class70_pred', 'EVE_Class75_pred', 'EVE_Class80_pred', 'EVE_Class90_pred', 'AlphaMissense_pred', 'Aloft_pred', 'fathmm-MKL_coding_pred', 'fathmm-XF_coding_pred']
Number of prediction columns: 35
+--------+---------+-----------+-------------------+-------------------+--------+-------------------+---------------------+-----------+------------+------------+-----------+------------+----------+--------------+------------+-------------------+------------------+-------------+------------+----------+-----

In [6]:
# Select the relevant columns
rankscore_columns = [col for col in df.columns if col.endswith('_rankscore')]

# Create the score_df
score_df = df.select(['codonpos']  + rankscore_columns)

# Create a new column 'avg_rankscore' to store the average of non-missing values
df_with_avg = score_df.withColumn(
    'avg_rankscore', 
    F.expr(f"""
        aggregate(
            array({', '.join([f"if(`{col}` != '.' AND `{col}` IS NOT NULL, cast(`{col}` as double), NULL)" for col in rankscore_columns])}),
            0D, 
            (acc, x) -> acc + coalesce(x, 0D), 
            acc -> acc / size(array({', '.join([f"`{col}`" for col in rankscore_columns])}))
        )
    """)
)

# Show the result
df2 = df_with_avg.select('codonpos', 'avg_rankscore')
df2.show(5)
# Get the row with the maximum avg_rankscore
df2.sort(df2["avg_rankscore"].desc()).show(10)


+--------+-------------------+
|codonpos|      avg_rankscore|
+--------+-------------------+
|       1|0.04117350877192984|
|       1|0.04025929824561404|
|       1|0.04101175438596492|
|       2|0.04809070175438597|
|       2|0.04925754385964913|
+--------+-------------------+
only showing top 5 rows

+--------------------+------------------+
|            codonpos|     avg_rankscore|
+--------------------+------------------+
|2;2;2;2;2;2;2;2;2...|0.6062366666666668|
|2;2;2;2;2;2;2;2;2...|0.5996326315789475|
|2;2;2;2;2;2;2;2;2...|0.5963924561403509|
|1;1;1;1;1;1;1;1;1...|0.5938461403508772|
|2;2;2;2;2;2;2;2;2...|0.5895950877192984|
|2;2;2;2;2;2;2;2;2...|0.5827424561403509|
|1;1;1;1;1;1;1;1;1...|0.5810947368421054|
|2;2;2;2;2;2;2;2;2...|0.5785180701754385|
|2;2;2;2;2;2;2;2;2...|0.5774235087719299|
|2;2;2;2;2;2;2;2;2...|0.5765431578947369|
+--------------------+------------------+
only showing top 10 rows



#### Codon Position 2 Shows the Most Predicted Effects:
Becuase it has the highest mean at all 57 predictions score.
This observation makes biological sense. The second position of a codon is often the most critical in determining the amino acid that will be incorporated into the protein. Changes in this position are highly likely to result in a different amino acid being coded for, which can significantly alter the protein's structure and function.
Biological Significance:

    The genetic code is structured in such a way that the second position of a codon is often the most informative about the chemical properties of the amino acid it encodes.
    Changes in the second position frequently lead to more dramatic shifts in amino acid properties (e.g., from a hydrophobic to a hydrophilic amino acid) compared to changes in other positions