# Import the data

In [None]:
import pandas as pd

# Load the semicolon-separated phenotype export into a DataFrame
df = pd.read_csv('../phenotypes_202211021922.csv', sep=';')
# Bare expression so the notebook renders the table as the cell output
df

# Create a dictionary that maps each column name to its count of non-dash values

In [None]:
# Metadata columns that are left out of the count
ignore_cols = ["user_id", "genotype_filename", "date_of_birth", "chrom_sex", "openhumans_name"]

# For every remaining column, count the cells holding anything other than a dash
non_dash_counts = {
    column: sum(1 for cell in df[column].tolist() if cell != "-")
    for column in df.columns
    if column not in ignore_cols
}

non_dash_counts

# Maximum value in dictionary

In [None]:
# Key (column name) with the largest non-dash count; the first such key wins on ties
key_max = max(non_dash_counts, key=non_dash_counts.get)

# The count stored under that key is the maximum value of the dictionary
max_value = non_dash_counts[key_max]


# The 40 columns with the most non-dash values

In [None]:
import heapq
from heapq import nlargest

# Keys of the 40 highest counts, ordered from the largest count downwards
max_40_values = sorted(non_dash_counts, key=non_dash_counts.get, reverse=True)[:40]



# Number of non-dash values in the five columns that are used for further analysis

In [None]:
# Look up and print the non-dash counts of the 5 columns picked from the top 40
picked_columns = (
    "Lactose intolerance",
    "Nicotine dependence",
    "Blood type",
    "Aspirin Allergy",
    "mtDNA Haplogroup (PhyloTree)",
)
picked_counts = [non_dash_counts.get(column) for column in picked_columns]
Lactose_intolerance, Nicotine_dependence, Blood_type, Aspirin_Allergy, Haplogroup = picked_counts

# One count per line, in the same order as the columns above
for picked_count in picked_counts:
    print(picked_count)

# Phenotype characterization

# Lactose intolerance

###  I'm lactose tolerant -> 0
###  I'm lactose intolerant -> 1



In [None]:
# Column values as a plain Python list
LI = df["Lactose intolerance"].to_list()

# Show the distinct answers that occur in the column
print(set(LI))

# Answers that are interpreted as "lactose tolerant"
lactose_tolerant = ["Lactose tolerant", "lactose-tolerant", "False", "lactose tolerant", "No"]

# Print one verdict per row: tolerant, no entry ("-"), or intolerant (every other answer)
for filename, answer in zip(df["genotype_filename"], LI):
    if answer in lactose_tolerant:
        verdict = "--> I'm lactose tolerant"
    elif answer == "-":
        verdict = "no value"
    else:
        verdict = "--> I'm lactose intolerant"
    print(filename, verdict)



# Store genotype filename in array

In [None]:
# Lists collecting the genotype filenames of each phenotype class
Lactose_Tolerant = []
Lactose_no_value = []
Lactose_Intolerant = []

# Choose the target list for each row first, then append the filename once
for filename, answer in zip(df["genotype_filename"], LI):
    if answer in lactose_tolerant:
        bucket = Lactose_Tolerant
    elif answer == "-":
        bucket = Lactose_no_value
    else:
        bucket = Lactose_Intolerant
    bucket.append(filename)


In [None]:
# Intolerant files come first (label 1), followed by the tolerant ones (label 0)
names = Lactose_Intolerant + Lactose_Tolerant
is_intolerant = [1] * len(Lactose_Intolerant) + [0] * len(Lactose_Tolerant)

# Dataframe with the binary phenotype classification
lactose_dataframe = pd.DataFrame({'name': names, 'is_intolerant': is_intolerant})

# dataframe to csv
#lactose_dataframe.to_csv("..\AD/Lactose_Intolerance.csv", index=False)

# csv with only 23andme names
# NOTE: str.contains treats the pattern as a regular expression by default,
# so each "." matches any character, not only a literal dot.
matches_23andme = lactose_dataframe['name'].str.contains('.23andme.')
ld = lactose_dataframe[matches_23andme]
# Drop the exome VCF files, which also match the pattern above
matches_exome_vcf = ld["name"].str.contains("23andme-exome-vcf.")
final_lactose_dataframe = ld[~matches_exome_vcf]

final_lactose_dataframe.to_csv("final_lactose_intolerance_df.csv", index=False)

# csv with only .vcf files
ld_vcf = lactose_dataframe[lactose_dataframe['name'].str.contains('23andme-exome-vcf.')]
ld_vcf.to_csv("vcf_final_lactose_intolerance_df.csv", index=False)

# Nicotine dependence

### I'm not addicted -> 0
### I was/am addicted -> 1

In [None]:
# Column values as a plain Python list
ND = df["Nicotine dependence"].to_list()

# Show the distinct answers that occur in the column
print(set(ND))

# Answers that are interpreted as "not addicted"
not_smoking = ["Non-smoker", "Never-non-smoker", "Don't smoke", "Never-non-smoker ", "Never - smoker ", "Never", "Have tried cigarettes - current non smoker  no addiction to nicotine"]

# Print one verdict per row: not addicted, no entry ("-" or "na"), or addicted (every other answer)
for filename, answer in zip(df["genotype_filename"], ND):
    if answer in not_smoking:
        verdict = "--> I'm not addicted"
    elif answer in ("-", "na"):
        verdict = "no value"
    else:
        verdict = "--> I was/am addicted"
    print(filename, verdict)


# Store genotype filename in array

In [None]:
# Lists collecting the genotype filenames of each phenotype class
No_Smoker = []
Smoking_no_value = []
Smoker = []

# Choose the target list for each row first, then append the filename once
for filename, answer in zip(df["genotype_filename"], ND):
    if answer in not_smoking:
        bucket = No_Smoker
    elif answer in ("-", "na"):
        bucket = Smoking_no_value
    else:
        bucket = Smoker
    bucket.append(filename)

In [None]:
# Build the binary nicotine-dependence table and export it as CSV files.
# Label convention of this section: not addicted -> 0, was/am addicted -> 1.
# FIX: the labels were previously swapped (non-smokers got 1, smokers got 0),
# which contradicted both this convention and the column name "is_addicted".

# Non-smokers are listed first (label 0), followed by the smokers (label 1)
names = No_Smoker + Smoker
is_addicted = [0] * len(No_Smoker) + [1] * len(Smoker)

# Dataframe with the binary phenotype classification
nicotine_dataframe = pd.DataFrame({'name': names, 'is_addicted': is_addicted})

# dataframe to csv
#nicotine_dataframe.to_csv("..\AD/Nicotine_Dependence.csv",index=False)

# csv with only 23andme names
# NOTE: str.contains treats the pattern as a regular expression by default,
# so each "." matches any character, not only a literal dot.
nd = nicotine_dataframe[nicotine_dataframe['name'].str.contains('.23andme.')]
# Drop the exome VCF files, which also match the pattern above
final_nicotine_dataframe = nd[~nd["name"].str.contains("23andme-exome-vcf.")]

final_nicotine_dataframe.to_csv("final_nicotine_dependence_df.csv", index=False)

# csv with only .vcf files
nd_vcf = nicotine_dataframe[nicotine_dataframe['name'].str.contains('23andme-exome-vcf.')]
nd_vcf.to_csv("vcf_final_nicotine_dependence_df.csv", index=False)

# Blood type

### A -> 1
### B -> 2
### O -> 3
### AB -> 4

In [None]:
# Column values as a plain Python list
BT = df["Blood type"].to_list()

# Show the distinct answers that occur in the BT list
print(set(BT))

# Answers that are interpreted as blood type A, O, B and AB respectively
A = ["A-", 'A+ (AO/+-) Non-Secretor', 'A+ (A+/0+) ', 'ABO  Kidd:  AG \tJk(a+b+)', 'A/o -/-',  'A+', 'A+ Secretor / Saliva Non Secretor!!', 'A+ (ao/+-) non-secretor']
O = ['O-', 'O/O +/-', 'o rh negative', 'O+', 'O rh negative', 'O/o +/-']
B = ['b+', 'B-', 'B+']
AB = ['AB-', 'AB+', 'A2B+', 'Ab-', 'Ab+']

# Groups are tested in this order; an answer found in none of them counts as "no value"
blood_type_verdicts = ((A, "--> A"), (O, "--> O"), (B, "--> B"), (AB, "--> AB"))

for filename, answer in zip(df["genotype_filename"], BT):
    verdict = next(
        (text for members, text in blood_type_verdicts if answer in members),
        "no value",
    )
    print(filename, verdict)


# Store genotype filename in array

In [None]:
# Lists collecting the genotype filenames of each blood type class
A_Bloodtype = []
O_Bloodtype = []
B_Bloodtype = []
AB_Bloodtype = []
Bloodtype_no_value = []

# Answer groups paired with their target list, tested in this order
blood_type_buckets = (
    (A, A_Bloodtype),
    (O, O_Bloodtype),
    (B, B_Bloodtype),
    (AB, AB_Bloodtype),
)

for filename, answer in zip(df["genotype_filename"], BT):
    # Answers found in none of the groups are collected as "no value"
    bucket = next(
        (target for members, target in blood_type_buckets if answer in members),
        Bloodtype_no_value,
    )
    bucket.append(filename)

In [None]:
# Rows are ordered A (1), B (2), O (3), AB (4)
names = A_Bloodtype + B_Bloodtype + O_Bloodtype + AB_Bloodtype
has_Bloodtype = (
    [1] * len(A_Bloodtype)
    + [2] * len(B_Bloodtype)
    + [3] * len(O_Bloodtype)
    + [4] * len(AB_Bloodtype)
)

# Dataframe with the classification (1-4)
Bloodtype_dataframe = pd.DataFrame({'name': names, 'has_Bloodtype': has_Bloodtype})

# dataframe to csv
#Bloodtype_dataframe.to_csv("..\AD/Bloodtype.csv", index=False)

# csv with only 23andme names
# NOTE: str.contains treats the pattern as a regular expression by default,
# so each "." matches any character, not only a literal dot.
matches_23andme = Bloodtype_dataframe['name'].str.contains('.23andme.')
bd = Bloodtype_dataframe[matches_23andme]
# Drop the exome VCF files, which also match the pattern above
matches_exome_vcf = bd["name"].str.contains("23andme-exome-vcf.")
final_bloodtype_dataframe = bd[~matches_exome_vcf]

final_bloodtype_dataframe.to_csv("final_bloodtype_df.csv", index=False)

# csv with only .vcf files
bd_vcf = Bloodtype_dataframe[Bloodtype_dataframe['name'].str.contains('23andme-exome-vcf.')]
bd_vcf.to_csv("vcf_final_bloodtype_df.csv", index=False)

# Aspirin Allergy

### No -> 0
### I'm allergic/sensitive/intolerant -> 1

In [None]:
# Column values as a plain Python list
AA = df["Aspirin Allergy"].to_list()

# Show the distinct answers that occur in the AA list
print(set(AA))

# Print one verdict per row: "No", no entry ("-" or "Unknown"), or allergic (every other answer)
for filename, answer in zip(df["genotype_filename"], AA):
    if answer == "No":
        print(filename, "-->", answer)
    elif answer in ("-", "Unknown"):
        print(filename, "no value")
    else:
        print(filename, "--> I'm allergic/sensitive/intolerant")



# Store genotype filename in array

In [None]:
# Lists collecting the genotype filenames of each phenotype class
Aspirin_Allergy = []
Aspirin_no_value = []
Aspirin_no_Allergy = []

# Choose the target list for each row first, then append the filename once
for filename, answer in zip(df["genotype_filename"], AA):
    if answer == "No":
        bucket = Aspirin_no_Allergy
    elif answer in ("-", "Unknown"):
        bucket = Aspirin_no_value
    else:
        bucket = Aspirin_Allergy
    bucket.append(filename)

In [None]:
# Allergic files come first (label 1), followed by the non-allergic ones (label 0)
names = Aspirin_Allergy + Aspirin_no_Allergy
is_allergic = [1] * len(Aspirin_Allergy) + [0] * len(Aspirin_no_Allergy)

# Dataframe with the binary classification
Aspirin_dataframe = pd.DataFrame({'name': names, 'is_allergic': is_allergic})

# dataframe to csv
#Aspirin_dataframe.to_csv("..\AD/Aspirin_Allergy.csv", index=False)

# csv with only 23andme names
# NOTE: str.contains treats the pattern as a regular expression by default,
# so each "." matches any character, not only a literal dot.
matches_23andme = Aspirin_dataframe['name'].str.contains('.23andme.')
ad = Aspirin_dataframe[matches_23andme]
# Drop the exome VCF files, which also match the pattern above
matches_exome_vcf = ad["name"].str.contains("23andme-exome-vcf.")
final_aspirin_dataframe = ad[~matches_exome_vcf]

final_aspirin_dataframe.to_csv("final_aspirin_allergy_df.csv", index=False)

# csv with only .vcf files
ad_vcf = Aspirin_dataframe[Aspirin_dataframe['name'].str.contains('23andme-exome-vcf.')]
ad_vcf.to_csv("vcf_aspirin_allergy_df.csv", index=False)

# Haplogroup
### Haplo L -> 1
### Haplo M -> 2
### Haplo N -> 3
### Haplo R -> 4

In [None]:
# Column values as a plain Python list
mH = df["mtDNA Haplogroup (PhyloTree)"].to_list()

# Show the distinct answers that occur in the mH list
print(set(mH))

# Answers that are assigned to macro-haplogroup L, M and N respectively
Macro_haplogroup_L = ['L3e2a1b1', 'L1b1a', 'L2a1c', 'L2b1a', 'L0a1a', 'L1b1a10', 'L2a1c1', 'L2b2']
Macro_haplogroup_M = ['M33a2', 'C5c1a', 'M1a3a', 'M2a1a', 'D5c', 'M9a1a1a4', 'm7b3a', 'D4', 'C1b12', 'D1e']
Macro_haplogroup_N = ['I3a', 'A2', 'I1a1', 'I2a', 'X2b', 'X2b4', 'I2 ', 'X2m', 'X2-G225A', 'S2', 'X2C1', 'i3a1', 'W4A1', 'N1a1a1a', 'X2e2a']

# Answers that carry no usable haplogroup information
haplogroup_no_value_answers = (
    "-",
    "I don't know ",
    'You act like we all are geneologist',
    'X, X2, X2c, X2c1, N, L1,2,3,4&6',
)

# Groups are tested in this order; an answer found in none of them falls back to R
haplogroup_verdicts = (
    (haplogroup_no_value_answers, "--> no value"),
    (Macro_haplogroup_L, "--> Macro-haplogroup L"),
    (Macro_haplogroup_M, "--> Macro-haplogroup M"),
    (Macro_haplogroup_N, "--> Macro-haplogroup N"),
)

for filename, answer in zip(df["genotype_filename"], mH):
    verdict = next(
        (text for members, text in haplogroup_verdicts if answer in members),
        "--> Macro-haplogroup R",
    )
    print(filename, verdict)


# Store genotype filename in array

In [None]:
# Lists collecting the genotype filenames of each haplogroup class
Haplo_L = []
Haplo_M = []
Haplo_N = []
Haplo_R = []
Haplo_no_value = []

# Answers that carry no usable haplogroup information
haplogroup_no_value_answers = (
    "-",
    "I don't know ",
    'You act like we all are geneologist',
    'X, X2, X2c, X2c1, N, L1,2,3,4&6',
)

# Answer groups paired with their target list, tested in this order
haplogroup_buckets = (
    (haplogroup_no_value_answers, Haplo_no_value),
    (Macro_haplogroup_L, Haplo_L),
    (Macro_haplogroup_M, Haplo_M),
    (Macro_haplogroup_N, Haplo_N),
)

for filename, answer in zip(df["genotype_filename"], mH):
    # Answers found in none of the groups are collected under haplogroup R
    bucket = next(
        (target for members, target in haplogroup_buckets if answer in members),
        Haplo_R,
    )
    bucket.append(filename)



In [None]:
# Rows are ordered L (1), M (2), N (3), R (4)
names = Haplo_L + Haplo_M + Haplo_N + Haplo_R
has_Haplogroup = (
    [1] * len(Haplo_L)
    + [2] * len(Haplo_M)
    + [3] * len(Haplo_N)
    + [4] * len(Haplo_R)
)

# Dataframe with the classification (1-4)
Haplogroup_dataframe = pd.DataFrame({'name': names, 'has_Haplogroup': has_Haplogroup})

# dataframe to csv
#Haplogroup_dataframe.to_csv("..\AD/mtDNA_Haplogroup.csv", index= False)

# csv with only 23andme names
# NOTE: str.contains treats the pattern as a regular expression by default,
# so each "." matches any character, not only a literal dot.
matches_23andme = Haplogroup_dataframe['name'].str.contains('.23andme.')
hd = Haplogroup_dataframe[matches_23andme]
# Drop the exome VCF files, which also match the pattern above
matches_exome_vcf = hd["name"].str.contains("23andme-exome-vcf.")
final_haplogroup_dataframe = hd[~matches_exome_vcf]

final_haplogroup_dataframe.to_csv("final_mtDNA_Haplogroup_df.csv", index=False)

# csv with only .vcf files
hd_vcf = Haplogroup_dataframe[Haplogroup_dataframe['name'].str.contains('23andme-exome-vcf.')]
hd_vcf.to_csv("vcf_final_mtDNA_Haplogroup_df.csv", index=False)