In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm 
from statsmodels.formula.api import ols

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [None]:
# Polygenic scores: the 'SCORE1_AVG' column is exposed under the shorter name 'PRS'.
prs = pd.read_table('DYX_auto.sscore').rename(columns={'SCORE1_AVG': 'PRS'})

# Phenotype table. 'status' is 0 where 'dosage' equals 0 and 1 for every other
# value (a missing dosage does not equal 0, so it is coded as 1 as well).
ukb = pd.read_csv("merged.csv")
ukb['status'] = (ukb['dosage'] != 0).astype('int64')

In [None]:
# pandas and numpy are already imported in the notebook's first cell.
import matplotlib.pyplot as plt

# The score column is read as 'PRS': the data-loading cell renames
# 'SCORE1_AVG' to 'PRS' in place, so indexing `prs` with the old label
# raises KeyError when the notebook is run top to bottom.
fig, ax = plt.subplots(figsize=(10, 6))

# Main histogram of the score distribution
n, bins, patches = ax.hist(prs['PRS'], bins=30, edgecolor='black', alpha=0.7, color='#9cff33')

# 10th and 90th percentiles of the scores
p10 = np.percentile(prs['PRS'], 10)
p90 = np.percentile(prs['PRS'], 90)

# Mark both percentiles with dashed vertical lines
ax.axvline(x=p10, color='red', linestyle='--', label='10th percentile')
ax.axvline(x=p90, color='blue', linestyle='--', label='90th percentile')

# Titles, labels, legend and a light grid
ax.set_title('Distribution of PRS')
ax.set_xlabel('PRS')
ax.set_ylabel('Frequency')
ax.legend()
ax.grid(True, alpha=0.3)

# plt.savefig('./figures/prs_distribution.png')
plt.show()

In [None]:
# Maps dataset field identifiers (keys) to human-readable descriptions (values).
# calculate_means() reads only the values: each description, with spaces
# replaced by underscores, is the column-name prefix it averages over, and the
# descriptions 'IID', 'sex' and 'TDI' are skipped there.
# NOTE(review): the keys look like UK Biobank field IDs; four of them carry a
# 'p' prefix and the others do not — confirm which form is intended.
covariates = {
    # General
    'eid': 'IID', 
    'p31': 'sex',
    'p21022': 'Age at recruitment',
    'p22189': 'TDI',
    'p22006': 'Genetic ethnic grouping',
    # Category 120: Numeric memory (Cognitive function online)
    '20240': 'Maximum digits remembered correctly',
    # Category 1358: Broken letter recognition
    '20139': 'Number of letters correctly identified',
       
    # Category 100027: Fluid intelligence / reasoning
    '20016': 'Fluid intelligence score',
    
    # Category 100029: Numeric memory
    # Same description as key '20240', so both keys resolve to the same
    # column prefix and therefore to a single mean column.
    '4282': 'Maximum digits remembered correctly',

    # Category 100032: Reaction time
    '20023': 'Mean time to correctly identify matches',

    # Category 503: Tower rearranging
    '21004': 'Number of puzzles correct',
    # Category 504: Picture vocabulary
    '26302': 'Specific cognitive ability (AS)',
    '6364': 'Vocabulary level',
    # Category 501: Matrix pattern completion
    # NOTE(review): this description begins with the text of key '21004'
    # ('Number of puzzles correct'), which matters for prefix-based matching.
    '6373': 'Number of puzzles correctly solved',
}

In [None]:
def calculate_means(covariates, df=None):
    """Compute one row-wise mean per covariate description.

    Each description in ``covariates`` (except 'IID', 'sex' and 'TDI') is
    turned into a column-name prefix by replacing spaces with underscores.
    All columns of ``df`` whose name starts with that prefix are averaged
    row by row.

    A column is assigned to the most specific prefix only: when one prefix is
    the beginning of another (e.g. ``Number_of_puzzles_correct`` and
    ``Number_of_puzzles_correctly_solved``), columns matching the longer
    prefix are excluded from the shorter one, so the two measures are not
    averaged together.

    Parameters
    ----------
    covariates : dict
        Mapping whose values are the descriptions; the keys are not used.
    df : pandas.DataFrame, optional
        Frame holding the columns to average. When omitted, the module-level
        ``ukb`` frame is looked up at call time, so a reloaded ``ukb`` is
        picked up without re-defining this function.

    Returns
    -------
    dict
        ``{"mean_<prefix>": Series}`` in order of first appearance of each
        description. A prefix that matches no column still gets an entry
        (the mean over zero columns).
    """
    if df is None:
        df = ukb

    skipped = {'IID', 'sex', 'TDI'}

    # Unique prefixes in first-seen order (descriptions may repeat across keys).
    prefixes = []
    for description in covariates.values():
        if description in skipped:
            continue
        prefix = description.replace(' ', '_')
        if prefix not in prefixes:
            prefixes.append(prefix)

    means = {}
    for prefix in prefixes:
        # Longer configured prefixes that extend this one own their columns.
        more_specific = tuple(
            other for other in prefixes
            if other != prefix and other.startswith(prefix)
        )
        selected = [
            column for column in df.columns
            if isinstance(column, str)
            and column.startswith(prefix)
            and not column.startswith(more_specific)  # empty tuple -> False
        ]
        means[f"mean_{prefix}"] = df[selected].mean(axis=1)
    return means

# Append the per-row mean columns to the phenotype table (aligned on the index).
mean_frame = pd.DataFrame(calculate_means(covariates))
merged_means = pd.concat([ukb, mean_frame], axis=1)

# Keep the descriptive columns plus every column whose name starts with 'mean',
# then discard columns that contain no data at all.
descriptive = ['IID', 'status', 'sex', 'TDI']
is_mean_col = merged_means.columns.str.startswith('mean')
mean_cols = merged_means.columns[is_mean_col].tolist()
cols = [*descriptive, *mean_cols]
ukb_means = merged_means.loc[:, cols].dropna(axis='columns', how='all')

# Inner merge with the scores on the columns the two frames share.
df = pd.merge(ukb_means, prs[['IID', 'PRS']], how='inner')

In [None]:
# For each cognitive field: compare the top and bottom PRS deciles, crossed with
# status, using a two-way type-II ANOVA on the field's mean score.
fields = ["Number of word pairs correctly associated",
          "Maximum digits remembered correctly",
          "Number of letters correctly identified"]

# Type-II ANOVA table per field, keyed by the field description.
dfs = {}

# Loop-invariant: descriptive + mean columns, without columns that are all NaN.
ukb_means = merged_means[cols].dropna(axis=1, how='all')

for field in fields:
    phenotype = 'mean_' + field.replace(' ', '_')

    # A mean column exists only for descriptions listed in `covariates`, and
    # all-NaN columns were dropped above. Report and skip a missing phenotype
    # instead of stopping the whole loop with a KeyError.
    if phenotype not in ukb_means.columns:
        print(f"{field}: column '{phenotype}' is not available, skipping")
        print("-"*100)
        continue

    df = ukb_means[['IID', 'status', phenotype]].merge(
        prs[['IID', 'PRS']], how='inner', on='IID'
    )

    # Cut-offs use every merged score, i.e. before rows with a missing
    # phenotype are dropped. nanpercentile ignores missing scores, so a single
    # NaN cannot turn both thresholds into NaN.
    p10 = np.nanpercentile(df['PRS'], 10)
    p90 = np.nanpercentile(df['PRS'], 90)

    # First matching condition wins, so 'top_10%' takes precedence over
    # 'bottom_10%' when the two thresholds coincide.
    df['PRS_percentile'] = np.select(
        [df['PRS'] >= p90, df['PRS'] <= p10],
        ['top_10%', 'bottom_10%'],
        default='middle_80%',
    )
    df = df.dropna()

    # Only the two extreme groups enter the model.
    df_filtered = df[df['PRS_percentile'].isin(['top_10%', 'bottom_10%'])].copy()
    df_filtered['PRS_percentile'] = df_filtered['PRS_percentile'].astype('category')
    df_filtered['status'] = df_filtered['status'].astype('category')

    # observed=False keeps empty group/status combinations visible as zero counts.
    print(df_filtered.groupby(['PRS_percentile', 'status'], observed=False).size())
    print("-"*100)

    model = ols(
        f"{phenotype} ~ C(PRS_percentile) * C(status)",
        data=df_filtered
    ).fit()

    print(f"{field}:")
    anova_results = sm.stats.anova_lm(model, typ=2)
    dfs[field] = anova_results
    print(anova_results)
    print("-"*100)

In [None]:
def run_anova(phenotype, df):
    """Return the type-II ANOVA table for one phenotype.

    Fits an ordinary-least-squares model of ``phenotype`` on the categorical
    factors ``PRS_percentile`` and ``status`` and their interaction, then
    passes the fitted model to ``anova_lm`` with ``typ=2``.

    Parameters
    ----------
    phenotype : str
        Name placed on the left-hand side of the model formula.
    df : DataFrame
        Data handed to ``ols``; it has to provide the ``phenotype``,
        ``PRS_percentile`` and ``status`` columns the formula refers to.
    """
    formula = f"{phenotype} ~ C(PRS_percentile) * C(status)"
    fitted = ols(formula, data=df).fit()
    return sm.stats.anova_lm(fitted, typ=2)