# Import the data

In [1]:
import pandas as pd

# Load the semicolon-separated phenotype export and display it
df = pd.read_csv('../phenotypes_202211021922.csv', sep=';')
df

Unnamed: 0,user_id,genotype_filename,date_of_birth,chrom_sex,openhumans_name,Retrognathia (Marfan Syndrome),Eye pigmentation,Vegetarianism/Preference for Meat,Form of foot,Eye color,...,Y-Haplogroup,"white skin, dark blond",Blood type AB Rh+,Side effects of streptomycin,Adverse effects of glucose injections,Acetaminophen Side Effects,"Ibuprofen, NSAIDs",Aftermath of the flu and loss of smell,Acetaminophen allergy,Premature Ejaculation
0,4134,4134.23andme.2800,rather not say,rather not say,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
1,885,885.23andme-exome-vcf.994,1982,XY,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
2,919,919.ftdna-illumina.455,rather not say,rather not say,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
3,2953,2953.ftdna-illumina.1885,1993,XY,-,-,-,-,-,Dark brown,...,-,-,-,-,-,-,-,-,-,-
4,6598,6598.23andme.5001,rather not say,rather not say,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6458,10929,10929.ftdna-illumina.9030,rather not say,rather not say,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
6459,10934,10934.23andme.9032,rather not say,rather not say,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
6460,10935,10935.23andme.9033,rather not say,rather not say,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
6461,10936,10936.ancestry.9035,rather not say,rather not say,-,-,-,-,-,-,...,Q-BZ,-,-,-,-,-,-,-,-,-


# Create a dictionary mapping each column name to its count of non-dash values

In [2]:
# Metadata columns that are skipped when counting answers
ignore_cols = ["user_id", "genotype_filename", "date_of_birth", "chrom_sex", "openhumans_name"]

# Map every remaining column name to the number of entries that are not the
# "-" placeholder. The comparison is done per column in one vectorized step
# instead of walking every cell in Python; int() keeps plain Python integers
# so the dictionary displays exactly as before.
non_dash_counts = {
    col: int(df[col].ne("-").sum())
    for col in df.columns
    if col not in ignore_cols
}

non_dash_counts

{'Retrognathia (Marfan Syndrome)': 57,
 'Eye pigmentation ': 117,
 'Vegetarianism/Preference for Meat': 267,
 'Form of foot': 43,
 'Eye color': 1703,
 'A+': 119,
 'MSG tastes...': 116,
 'Boldness type': 169,
 'Lynch Syndrome': 41,
 'Early Onset Heart Disease': 32,
 'black': 40,
 'Interstitial cystitis': 38,
 'Jewish Ancestry': 315,
 'Response to Enbrel': 51,
 'Bicuspid aortic valve': 27,
 'Hemochromotosis': 43,
 'ACT science': 19,
 'Hypomagnesemia': 35,
 'ACT math': 30,
 '12': 20,
 'Prolapsed Organ': 31,
 'vi/vim or Emacs': 33,
 'Heart Problems - Long QT Syndrome': 17,
 'chronically sore neck glands': 46,
 'Urticaria': 29,
 'R1b1a2a1a1b': 112,
 'ACT reading': 28,
 'Cramps': 146,
 'Number of Neanderthal variants': 51,
 'Dental decay': 75,
 'Miscarriage/Spontaneous Abortion': 54,
 'Thrombosis': 25,
 'Resistance To Infectious Disease': 41,
 'Enhanced Hippocampal Volume': 19,
 'Significantly increased Risk of Heart Disease': 30,
 'Amount of Body Hair (Male)': 58,
 "Morton's Toe": 214,
 'Pr

# Maximum value in dictionary

In [3]:
# Largest non-dash count found in any column
max_value = max(non_dash_counts.values())

# Name of the column holding that largest count (the first one wins on ties)
key_max = max(non_dash_counts, key=non_dash_counts.get)


# 40 columns with the most non dash values

In [4]:
import heapq
from heapq import nlargest

# Names of the 40 columns with the highest non-dash counts, largest first
max_40_values = nlargest(40, non_dash_counts, key=non_dash_counts.get)



# Number of non-dash values in the five columns used for further analysis

In [5]:
# Look up the non-dash counts of the five phenotypes picked from the top 40
Lactose_intolerance = non_dash_counts.get("Lactose intolerance")
Nicotine_dependence = non_dash_counts.get("Nicotine dependence")
Blood_type = non_dash_counts.get("Blood type")
Aspirin_Allergy = non_dash_counts.get("Aspirin Allergy")
Haplogroup = non_dash_counts.get("mtDNA Haplogroup (PhyloTree)")

# Print one count per line, in the same order as the lookups above
print(
    Lactose_intolerance,
    Nicotine_dependence,
    Blood_type,
    Aspirin_Allergy,
    Haplogroup,
    sep="\n",
)

738
671
550
446
371


# Phenotype characterization

# Lactose intolerance

###  I'm lactose tolerant -> 0
###  I'm lactose intolerant -> 1



In [6]:
# Raw free-text answers of every participant for this phenotype
LI = df["Lactose intolerance"].to_list()

# Show the distinct answers so they can be sorted into classes by hand
print(set(LI))

# Answers that count as "lactose tolerant".
# "Lactose-tolerant" (capital L, hyphen) occurs in the data but was missing
# from this list, so those participants ended up in the intolerant class.
lactose_tolerant = ["Lactose tolerant", "Lactose-tolerant", "lactose-tolerant", "False", "lactose tolerant", "No"]

# Report the class of every participant: tolerant, no entry ("-"), or
# intolerant for every other answer.
# NOTE(review): the catch-all branch also labels non-answers such as
# 'rs4988235' or 'AA' as intolerant - confirm whether that is intended.
for filename, entry in zip(df["genotype_filename"], LI):
    if entry in lactose_tolerant:
        print(filename, "--> I'm lactose tolerant")
    elif entry == "-":
        print(filename, "no value")
    else:
        print(filename, "--> I'm lactose intolerant")



{'No', 'Problems if i drink more 1 liter milk', 'Genetically tolerant-extrememly lactose intolerant', 'Partially lactose intolerant - surfaces with a lot of dairy products in one day', 'Severe gi pain ', 'lactose tolerant till my 40s then became very intolerant.  Genetics says I am Intolerant.', 'Genetically tolerant but am intolerant', 'rs4988235', 'Lactose-tolerant', 'Lactose-intolerant', '-', 'Lactose intolerant', ' allergic to all forms of dairy ', 'Genetically intolerant but drink raw milk and eat lots of dairy.', 'lactose-tolerant', 'lactose-intolerant', 'False', 'Only with cows milk products', 'Lactose tolerant till my 40s then became very intolerant.  genetics says i am intolerant.', 'No but I seem to have less inflammation in joints if avoid milk products', ' Allergic to all forms of dairy ', 'Genetically intolerant, partial intolerance.', 'Lactose tolerant', 'severe GI pain ', 'lactose tolerant', 'AA', 'Slight lactose intolerance', 'Lactose intolerance appearing with age at m

# Store genotype filename in array

In [7]:
# Genotype filenames grouped by lactose class
Lactose_Tolerant, Lactose_no_value, Lactose_Intolerant = [], [], []

for filename, answer in zip(df["genotype_filename"], LI):
    # Pick the destination list first, then append once
    if answer in lactose_tolerant:
        bucket = Lactose_Tolerant
    elif answer == "-":
        bucket = Lactose_no_value
    else:
        bucket = Lactose_Intolerant
    bucket.append(filename)


In [8]:
# Binary phenotype table: intolerant participants first (1), then tolerant ones (0)
names = Lactose_Intolerant + Lactose_Tolerant
is_intolerant = [1] * len(Lactose_Intolerant) + [0] * len(Lactose_Tolerant)
lactose_dataframe = pd.DataFrame({'name': names, 'is_intolerant': is_intolerant})

# Keep only 23andme array files.
# regex=False makes the dots literal; str.contains treats the pattern as a
# regular expression by default, where '.' matches any character.
ld = lactose_dataframe[lactose_dataframe['name'].str.contains('.23andme.', regex=False, na=False)]
# Exome VCF files are exported separately below, so drop them here
final_lactose_dataframe = ld[~ld["name"].str.contains("23andme-exome-vcf.", regex=False, na=False)]

final_lactose_dataframe.to_csv("final_lactose_intolerance_df.csv", index = False)

# Separate CSV containing only the exome .vcf files
ld_vcf = lactose_dataframe[lactose_dataframe['name'].str.contains('23andme-exome-vcf.', regex=False, na=False)]
ld_vcf.to_csv("vcf_final_lactose_intolerance_df.csv", index = False)

# Nicotine dependence

### I'm not addicted -> 0
### I was/am addicted -> 1

In [9]:
# Raw free-text answers of every participant for this phenotype
ND = df["Nicotine dependence"].to_list()

# Show the distinct answers so they can be sorted into classes by hand
print(set(ND))

# Answers that count as "not addicted"
# NOTE(review): answers such as 'Social smoker, no addiction' are not listed
# here and therefore fall into the addicted class - confirm this is intended.
not_smoking = ["Non-smoker", "Never-non-smoker", "Don't smoke", "Never-non-smoker ", "Never - smoker ", "Never", "Have tried cigarettes - current non smoker  no addiction to nicotine"]

# Report the class of every participant: not addicted, no entry ("-" or "na"),
# or addicted for every other answer
for filename, answer in zip(df["genotype_filename"], ND):
    if answer in not_smoking:
        verdict = "--> I'm not addicted"
    elif answer in ("-", "na"):
        verdict = "no value"
    else:
        verdict = "--> I was/am addicted"
    print(filename, verdict)


{'Occasional Cigar', 'Ex-Smoker. 25 cigarettes/day', 'Ex-smoker, 7 cigarettes/day', 'Social smoker, no addiction', 'Vape nicotine', 'ex-smoker, 7 cigarettes/day', 'Ex social smoker', 'na', 'Smoker. 10 cigarettes/day', 'Gg', 'dipper, <1 can/day', 'Smoker, 2 a day.', 'Social Smoker, No addiction', 'Smoker. 15 cigarettes/day. Trying to quit.', 'Binge Smoker - 2 or 3 times a week, 10-20 at a time.', 'Ex-smoker, still use nicotine gum', 'Ex-smoker. 15 cigarettes/day', 'Never-non-smoker ', 'Ex-smoker. 60 cigarettes a day', 'Snuff', 'Ex-smoker. 25 cigarettes/day', 'Dipper, <1 can/day', 'Ex smoker. 40 cigarettes a day', 'Current smoker. ~10/day', '-', 'Ex-Smoker. 15 cigarettes/day', 'Ex-smoker, 2 a day', 'ex smoker  3 cigarettes/day', 'Smoker. 15 cigarettes/day. trying to quit.', 'Ex smoker  3 cigarettes/day', 'Ex-smoker, Still Use Nicotine Gum', 'Ex-Smoker. 15-16 cigarettes/day', 'Never - smoker ', 'Ex smoker - now vape', 'Smoker, 20 a day.', 'Ex smoker  3 cigarettes/Day', 'ex-smoker for 11 y

# Store genotype filename in array

In [10]:
# Genotype filenames grouped by nicotine class
No_Smoker, Smoking_no_value, Smoker = [], [], []

for filename, answer in zip(df["genotype_filename"], ND):
    # Pick the destination list first, then append once
    if answer in not_smoking:
        bucket = No_Smoker
    elif answer in ("-", "na"):
        bucket = Smoking_no_value
    else:
        bucket = Smoker
    bucket.append(filename)

In [21]:
# Binary phenotype table for nicotine dependence.
# Encoding follows the section header: not addicted -> 0, was/am addicted -> 1.
# The labels were previously swapped (non-smokers written as 1, smokers as 0).
names = No_Smoker + Smoker
is_addicted = [0] * len(No_Smoker) + [1] * len(Smoker)
nicotine_dataframe = pd.DataFrame({'name': names, 'is_addicted': is_addicted})

# Keep only 23andme array files.
# regex=False makes the dots literal; str.contains treats the pattern as a
# regular expression by default, where '.' matches any character.
nd = nicotine_dataframe[nicotine_dataframe['name'].str.contains('.23andme.', regex=False, na=False)]
# Exome VCF files are exported separately below, so drop them here
final_nicotine_dataframe = nd[~nd["name"].str.contains("23andme-exome-vcf.", regex=False, na=False)]

final_nicotine_dataframe.to_csv("final_nicotine_dependence_df.csv", index = False)

# Separate CSV containing only the exome .vcf files
nd_vcf = nicotine_dataframe[nicotine_dataframe['name'].str.contains('23andme-exome-vcf.', regex=False, na=False)]
nd_vcf.to_csv("vcf_final_nicotine_dependence_df.csv", index = False)

# Blood type

### A -> 1
### B -> 2
### O -> 3
### AB -> 4

In [12]:
# Raw free-text blood type answers of every participant
BT = df["Blood type"].to_list()

# Show the distinct answers so they can be sorted into classes by hand
print(set(BT))

# Hand-made mapping of the free-text answers to the ABO groups A, O, B and AB.
# 'A+ (a/o) ' occurs in the data but was missing from every list, so that
# answer was reported as "no value"; it is now part of group A.
# NOTE(review): 'ABO  Kidd:  AG ...' does not obviously state group A - confirm.
A = ["A-", 'A+ (AO/+-) Non-Secretor', 'A+ (A+/0+) ', 'A+ (a/o) ', 'ABO  Kidd:  AG \tJk(a+b+)', 'A/o -/-',  'A+', 'A+ Secretor / Saliva Non Secretor!!', 'A+ (ao/+-) non-secretor']
O = ['O-', 'O/O +/-', 'o rh negative', 'O+', 'O rh negative', 'O/o +/-']
B = ['b+', 'B-', 'B+']
AB = ['AB-', 'AB+', 'A2B+', 'Ab-', 'Ab+']

# Report the group of every participant; anything not listed above
# (including "-" and 'Dont know') is reported as "no value"
for filename, entry in zip(df["genotype_filename"], BT):
    if entry in A:
        label = "--> A"
    elif entry in O:
        label = "--> O"
    elif entry in B:
        label = "--> B"
    elif entry in AB:
        label = "--> AB"
    else:
        label = "no value"
    print(filename, label)


{'A2B+', 'A-', 'B+', 'O+', 'A+ Secretor / Saliva Non Secretor!!', 'ABO  Kidd:  AG \tJk(a+b+)', '-', 'Ab-', 'A+ (a/o) ', 'O/o +/-', 'O-', 'o rh negative', 'Dont know', 'b+', 'B-', 'A+ (ao/+-) non-secretor', 'A+ (AO/+-) Non-Secretor', 'O rh negative', 'O/O +/-', 'Ab+', 'AB-', 'A/o -/-', 'AB+', 'A+ (A+/0+) ', 'A+'}
4134.23andme.2800 no value
885.23andme-exome-vcf.994 no value
919.ftdna-illumina.455 no value
2953.ftdna-illumina.1885 no value
6598.23andme.5001 no value
3110.23andme.2003 no value
3181.23andme.2066 no value
4121.23andme.2789 --> B
4157.ancestry.2818 no value
4152.23andme.2815 no value
4148.23andme.2812 no value
4146.23andme.2810 no value
4142.23andme.2808 no value
4140.23andme.2806 no value
4138.23andme.2805 no value
4137.23andme.2804 no value
4135.ftdna-illumina.2801 no value
4130.23andme.2797 --> A
4129.23andme.2796 no value
4440.ancestry.3043 no value
6601.23andme.5003 no value
4126.23andme.2792 no value
4125.23andme.2791 no value
4120.23andme.2788 no value
4441.23andme.30

# Store genotype filename in array

In [13]:
# Genotype filenames grouped by blood type class
A_Bloodtype, O_Bloodtype, B_Bloodtype, AB_Bloodtype = [], [], [], []
Bloodtype_no_value = []

for filename, blood_type in zip(df["genotype_filename"], BT):
    # Pick the destination list first, then append once
    if blood_type in A:
        bucket = A_Bloodtype
    elif blood_type in O:
        bucket = O_Bloodtype
    elif blood_type in B:
        bucket = B_Bloodtype
    elif blood_type in AB:
        bucket = AB_Bloodtype
    else:
        bucket = Bloodtype_no_value
    bucket.append(filename)

In [22]:
# Blood type table using the codes from the section header:
# A -> 1, B -> 2, O -> 3, AB -> 4 (rows are written in that order)
names = A_Bloodtype + B_Bloodtype + O_Bloodtype + AB_Bloodtype
has_Bloodtype = (
    [1] * len(A_Bloodtype)
    + [2] * len(B_Bloodtype)
    + [3] * len(O_Bloodtype)
    + [4] * len(AB_Bloodtype)
)
Bloodtype_dataframe = pd.DataFrame({'name': names, 'has_Bloodtype': has_Bloodtype})

# Keep only 23andme array files.
# regex=False makes the dots literal; str.contains treats the pattern as a
# regular expression by default, where '.' matches any character.
bd = Bloodtype_dataframe[Bloodtype_dataframe['name'].str.contains('.23andme.', regex=False, na=False)]
# Exome VCF files are exported separately below, so drop them here
final_bloodtype_dataframe = bd[~bd["name"].str.contains("23andme-exome-vcf.", regex=False, na=False)]

final_bloodtype_dataframe.to_csv("final_bloodtype_df.csv", index = False)

# Separate CSV containing only the exome .vcf files
bd_vcf = Bloodtype_dataframe[Bloodtype_dataframe['name'].str.contains('23andme-exome-vcf.', regex=False, na=False)]
bd_vcf.to_csv("vcf_final_bloodtype_df.csv", index = False)

# Aspirin Allergy

### No -> 0
### I'm allergic/sensitive/intolerant -> 1

In [15]:
# Raw free-text answers of every participant for this phenotype
AA = df["Aspirin Allergy"].to_list()

# Show the distinct answers so they can be sorted into classes by hand
print(set(AA))

# Report the class of every participant: "No" is echoed as given,
# "-" / "Unknown" count as no entry, every other answer counts as allergic
for filename, answer in zip(df["genotype_filename"], AA):
    if answer == "No":
        print(filename, "-->", answer)
    elif answer in ("-", "Unknown"):
        print(filename, "no value")
    else:
        print(filename, "--> I'm allergic/sensitive/intolerant")



{'Sensitive to aspirin', 'No', 'Yes', 'GI bleed from any NSAIDs', 'Unknown', 'sensitive to aspirin', '-', 'Intolerant but not allergic', 'intolerant but not allergic', 'Gi bleed from any nsaids', 'Was allergic until 35 years old and suddenly was not.'}
4134.23andme.2800 no value
885.23andme-exome-vcf.994 no value
919.ftdna-illumina.455 no value
2953.ftdna-illumina.1885 no value
6598.23andme.5001 no value
3110.23andme.2003 no value
3181.23andme.2066 no value
4121.23andme.2789 no value
4157.ancestry.2818 no value
4152.23andme.2815 no value
4148.23andme.2812 no value
4146.23andme.2810 no value
4142.23andme.2808 no value
4140.23andme.2806 no value
4138.23andme.2805 no value
4137.23andme.2804 no value
4135.ftdna-illumina.2801 no value
4130.23andme.2797 no value
4129.23andme.2796 no value
4440.ancestry.3043 no value
6601.23andme.5003 no value
4126.23andme.2792 no value
4125.23andme.2791 no value
4120.23andme.2788 no value
4441.23andme.3044 no value
4119.23andme.2786 no value
4116.ancestry.27

# Store genotype filename in array

In [16]:
# Genotype filenames grouped by aspirin allergy class
Aspirin_Allergy, Aspirin_no_value, Aspirin_no_Allergy = [], [], []

for filename, answer in zip(df["genotype_filename"], AA):
    # Pick the destination list first, then append once
    if answer == "No":
        bucket = Aspirin_no_Allergy
    elif answer in ("-", "Unknown"):
        bucket = Aspirin_no_value
    else:
        bucket = Aspirin_Allergy
    bucket.append(filename)

In [23]:
# Binary phenotype table: allergic participants first (1), then non-allergic ones (0)
names = Aspirin_Allergy + Aspirin_no_Allergy
is_allergic = [1] * len(Aspirin_Allergy) + [0] * len(Aspirin_no_Allergy)
Aspirin_dataframe = pd.DataFrame({'name': names, 'is_allergic': is_allergic})

# Keep only 23andme array files.
# regex=False makes the dots literal; str.contains treats the pattern as a
# regular expression by default, where '.' matches any character.
ad = Aspirin_dataframe[Aspirin_dataframe['name'].str.contains('.23andme.', regex=False, na=False)]
# Exome VCF files are exported separately below, so drop them here
final_aspirin_dataframe = ad[~ad["name"].str.contains("23andme-exome-vcf.", regex=False, na=False)]

final_aspirin_dataframe.to_csv("final_aspirin_allergy_df.csv", index = False)

# Separate CSV containing only the exome .vcf files
ad_vcf = Aspirin_dataframe[Aspirin_dataframe['name'].str.contains('23andme-exome-vcf.', regex=False, na=False)]
ad_vcf.to_csv("vcf_aspirin_allergy_df.csv", index = False)

# Haplogroup
### Haplo L -> 1
### Haplo M -> 2
### Haplo N -> 3
### Haplo R -> 4

In [18]:
# Raw free-text mtDNA haplogroup answers of every participant
mH = df["mtDNA Haplogroup (PhyloTree)"].to_list()

# Show the distinct answers so they can be sorted into classes by hand
print(set(mH))

# Hand-made mapping of the answers to the macro-haplogroups L, M and N;
# every answer that is listed nowhere is treated as macro-haplogroup R.
Macro_haplogroup_L = ['L3e2a1b1', 'L1b1a', 'L2a1c', 'L2b1a', 'L0a1a', 'L1b1a10', 'L2a1c1', 'L2b2']
Macro_haplogroup_M = ['M33a2', 'C5c1a', 'M1a3a', 'M2a1a', 'D5c', 'M9a1a1a4', 'm7b3a', 'D4', 'C1b12', 'D1e']
# The bare answer 'I' occurs in the data but was missing here, so it fell into
# the catch-all R class although the I subclades are already listed under N.
Macro_haplogroup_N = ['I', 'I3a', 'A2', 'I1a1', 'I2a', 'X2b', 'X2b4', 'I2 ', 'X2m', 'X2-G225A', 'S2', 'X2C1', 'i3a1', 'W4A1', 'N1a1a1a', 'X2e2a']

# Answers that carry no usable haplogroup information
Haplogroup_no_value_answers = (
    "-",
    "I don't know ",
    'You act like we all are geneologist',
    'X, X2, X2c, X2c1, N, L1,2,3,4&6',
)

# Report the macro-haplogroup of every participant
for filename, entry in zip(df["genotype_filename"], mH):
    if entry in Haplogroup_no_value_answers:
        label = "--> no value"
    elif entry in Macro_haplogroup_L:
        label = "--> Macro-haplogroup L"
    elif entry in Macro_haplogroup_M:
        label = "--> Macro-haplogroup M"
    elif entry in Macro_haplogroup_N:
        label = "--> Macro-haplogroup N"
    else:
        # Catch-all: anything not listed above is reported as R
        label = "--> Macro-haplogroup R"
    print(filename, label)


{'H1b1', 'X2C1', 'V1a', 'U5a1d2a', 'J1b1a', 'U4C1', 'H1n', 'R8a1a3, many H1a calls', 'H1b2', 'h1a1', 'H1ax', 'U5b1d1', 'H15a1', 'W4A1', 'H', 'H1b', 'K1a3a3', 'U8a1', 'I', 'H67', 'J1c5', 'L3e2a1b1', 'J2B1', 'k2b1a1', 'j2a2b2', 'J2a1a1a3', 'H1h1', 'm7b3a', 'K2A6', 'I2 ', 'H2a2b4', 'T2b7a1', 'K1c1b', 'H1h', "H, H30 on James Lick's site", 'J2a1a1a2', 'H1e1a', 'K1a1b1a', 'HV6', 'H7', 'U7b', 'H66a', 'U5b2a2b1', 'H2a1', 'j1b1a', 'X2-G225A', 'P9a2', 'H7b4', 'D5c', 'S2', 'H4a1', 'J1C', 'K2a', 'D1e', 'U3a1c1', 'U2e1*', '-', 'X2b', 'R0a', "H29 from James Lick's website", 'H1', 'X2b4', 'U5b2a1b', 'H23', 'H6', 'H10a1', 'V', 'U5b1d1b', 'H36a', 'H7f', 'U3b', 'T2c1', 'H17', 'i3a1', 'H11a2a', 'H3f', 'U5a1a1h', 'H18', 'R0a2', 'K1a4', 'I3a', 'X2e2a', 'T2b7a2', 'C5c1a', 'M2a1a', 'HV0', 'HV1B2', 'U4c1a', 'H6a1a', 'H4', 'I2a', 'U4a3', 'T2B2', 'j1c ', 'J1c1 ', 'K1c2', 'J2a1a1b', 'H7b6', 'T2b', 'B2c', 'V3', 'J1c3', 'M9a1a1a4', 'U5a1b1', 'H5a1', 'L1b1a', 'H2a5b2', 'H1c', 'J2a1a', 'K1c1', 'U5a1a1', 'U5a2b', 'H7

# Store genotype filename in array

In [19]:
# Genotype filenames grouped by macro-haplogroup
Haplo_L, Haplo_M, Haplo_N, Haplo_R = [], [], [], []
Haplo_no_value = []

for filename, haplogroup in zip(df["genotype_filename"], mH):
    # Pick the destination list first, then append once
    if haplogroup in ("-", "I don't know ", 'You act like we all are geneologist', 'X, X2, X2c, X2c1, N, L1,2,3,4&6'):
        bucket = Haplo_no_value
    elif haplogroup in Macro_haplogroup_L:
        bucket = Haplo_L
    elif haplogroup in Macro_haplogroup_M:
        bucket = Haplo_M
    elif haplogroup in Macro_haplogroup_N:
        bucket = Haplo_N
    else:
        # Everything not listed in L, M or N is collected as R
        bucket = Haplo_R
    bucket.append(filename)



In [24]:
# Haplogroup table using the codes from the section header:
# L -> 1, M -> 2, N -> 3, R -> 4 (rows are written in that order)
names = Haplo_L + Haplo_M + Haplo_N + Haplo_R
has_Haplogroup = (
    [1] * len(Haplo_L)
    + [2] * len(Haplo_M)
    + [3] * len(Haplo_N)
    + [4] * len(Haplo_R)
)
Haplogroup_dataframe = pd.DataFrame({'name': names, 'has_Haplogroup': has_Haplogroup})

# Keep only 23andme array files.
# regex=False makes the dots literal; str.contains treats the pattern as a
# regular expression by default, where '.' matches any character.
hd = Haplogroup_dataframe[Haplogroup_dataframe['name'].str.contains('.23andme.', regex=False, na=False)]
# Exome VCF files are exported separately below, so drop them here
final_haplogroup_dataframe = hd[~hd["name"].str.contains("23andme-exome-vcf.", regex=False, na=False)]

final_haplogroup_dataframe.to_csv("final_mtDNA_Haplogroup_df.csv", index = False)

# Separate CSV containing only the exome .vcf files
hd_vcf = Haplogroup_dataframe[Haplogroup_dataframe['name'].str.contains('23andme-exome-vcf.', regex=False, na=False)]
hd_vcf.to_csv("vcf_final_mtDNA_Haplogroup_df.csv", index = False)