In [20]:
import pandas as pd
from sqlalchemy import create_engine

# Function to read VCF file into DataFrame


def read_vcf(file: str) -> pd.DataFrame:
    num_header = 0
    with open(file) as f:
        for line in f:
            if line.startswith("##"):
                num_header += 1
            elif line.startswith("#"):
                num_header += 1
                break
    # Read the VCF, assuming the column names are on the last line of the headers
    vcf = pd.read_csv(file, sep="\t", skiprows=num_header, dtype=str)
    vcf = vcf.rename(columns={"#CHROM": "CHROM"})
    return vcf

# Function to upload DataFrame to PostgreSQL


def upload_to_postgres(df: pd.DataFrame, table_name: str, database_url: str):
    # Create SQLAlchemy engine
    engine = create_engine(database_url)
    # Upload data to PostgreSQL table
    df.to_sql(table_name, engine, if_exists='replace',
              index=False, method='multi', chunksize=1000)
    print(f"Data uploaded to table '{table_name}' in the PostgreSQL database.")


# Example usage
database_url = 'postgresql://postgres:admin@localhost:5432/biologicalsamples'
table_name = 'genomic_data'
file_path = 'GBS_filtered_SNPs.vcf'
df = read_vcf(file_path)
upload_to_postgres(df, table_name, database_url)

Data uploaded to table 'genomic_data' in the PostgreSQL database.


In [24]:
df.shape

(210651, 67)

In [18]:
df["INFO"].apply(lambda x: x[x.index("AF="):].split(";")[
                 0].split("=")[1] if "AF=" in x else ".")

0         0.196
1         0.304
2         0.477
3         0.477
4         0.489
          ...  
210647    0.040
210648    0.040
210649    0.040
210650    0.068
210651    0.043
Name: INFO, Length: 210652, dtype: object