# Nicotine Dependence
## Import the dataframe which contains the usernames and the corresponding phenotype

In [None]:
import pandas as pd

# Read the phenotype table and keep the untouched "name" values for the renaming step.
df = pd.read_csv('final_nicotine_dependence_df.csv')
raw_names = df.loc[:, "name"].tolist()
df

## Change names in column "name"

In [None]:
# NOTE(review): temp_user, processed_names and sample_string are not read anywhere
# in the visible notebook — confirm they are unused before removing them.
temp_user = None

processed_names = []

# Presumably an example of the raw name format that clean_name handles — TODO confirm.
sample_string = "836.23andme.413"

# replace ".23andme." with "_file" and add "user" in front
def clean_name(filename: str):
    """Convert a raw name into the "user<id>_file<id>" form.

    Every occurrence of ".23andme." is replaced by "_file" and the literal
    prefix "user" is put in front, e.g. "836.23andme.413" becomes
    "user836_file413".
    """
    separator = ".23andme."
    # split + join on a non-empty separator is the same as str.replace
    return "user" + "_file".join(filename.split(separator))
# Clean every raw name and write the cleaned names back into the "name" column.
new_names = [clean_name(raw_name) for raw_name in raw_names]
df["name"] = new_names

df


## Access files in directory
### all files

In [None]:
import os 

# Print the full path of every regular file in the patient directory.
directory = "D:/Leah Data/patientFiles"
for entry in os.scandir(directory):
    if not entry.is_file():
        continue
    print(entry.path)

### files which end with 23andme.txt and 23andme-exome-vcf.txt

In [None]:
# Print the directory entries whose name ends with one of the two 23andMe suffixes.
wanted_suffixes = ("23andme.txt", "23andme-exome-vcf.txt")
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(wanted_suffixes):
        print(file)

### 23andme.txt files from dataframe, patients nicotine dependence

In [None]:
# Store the names of the files that belong to the patients in the dataframe.
#
# A file belongs to a patient when its name starts with "<cleaned name>_"
# (file names look like "user1024_file510_yearofbirth_1971_sex_XX.23andme.txt").
# The trailing "_" matters: without it the prefix "user1_file1" would also
# match "user1_file10_..." and pull in another patient's file.
# The prefix tuple is built once instead of once per directory entry.
patient_prefixes = tuple(name + "_" for name in new_names)

patient_files_nicotine = [
    file for file in os.listdir(directory) if file.startswith(patient_prefixes)
]

print(len(patient_files_nicotine))

## Iterating over files to clean them and save them as parquet

In [None]:
import os
import pandas as pd
from time import sleep

#check if lines include a Hash
def hasHash(line: str):
    """Return True when ``line`` contains a "#" character anywhere, else False."""
    return line.find("#") != -1

#get RSID from textfile
def getRSID(line: str):
    """Return the text in front of the first tab of ``line``.

    Raises:
        ValueError: if ``line`` contains no tab character.
    """
    first_tab = line.index("\t")
    return line[0:first_tab]

#get genotype from textfile
def getGenotype(line: str):
    """Return the last tab-separated field of ``line`` (the genotype call).

    The original implementation sliced with fixed negative offsets, which
    only works when the line ends with a newline and the genotype is one or
    two characters long. The last line of a file without a trailing newline
    therefore produced a truncated or wrong value, and lines shorter than
    three characters raised IndexError.

    The line ending ("\\n" or "\\r\\n") is stripped first, then everything
    after the last tab is returned. A line without any tab is returned
    whole (minus the line ending).
    """
    return line.rstrip("\r\n").rsplit("\t", 1)[-1]

#get position from textfile
def getPosition(line: str):
    """Return the third tab-separated field of ``line``.

    Returns None when the line has fewer than two tabs.

    Raises:
        ValueError: if there is no further tab after the third field.
    """
    first_tab = line.find("\t")
    if first_tab == -1:
        return None

    second_tab = line.find("\t", first_tab + 1)
    if second_tab == -1:
        return None

    # Everything behind the second tab; the field ends at the next tab.
    remainder = line[second_tab + 1:]
    return remainder[:remainder.index("\t")]


file_location = 'D:/Leah Data/patientFiles/'

successOpen = 0
failOpen = 0
failOpenArray = []

# Safety switch: set to True to (re)generate the parquet files.
run = False

if run:

    # Convert every selected patient file into a parquet file with the
    # columns rsid / position / genotype.
    for fileName in patient_files_nicotine:
        rsid = []
        position = []
        genotype = []

        print("Trying to open: ", fileName)

        try:
            # Stream the file line by line instead of loading it completely.
            with open(file_location + fileName) as nicotinefile:
                for line in nicotinefile:
                    # Lines containing "#" are skipped.
                    if not hasHash(line):
                        rsid.append(getRSID(line))
                        genotype.append(getGenotype(line))
                        position.append(getPosition(line))

            # Save the extracted columns as a parquet file.
            df_nicotine = pd.DataFrame(
                data={"rsid": rsid, "position": position, "genotype": genotype}
            )
            name = "D:/nicotine_parquets/" + str(fileName) + ".parquet"
            df_nicotine.to_parquet(name)

            successOpen = successOpen + 1
            # Free the frame before the next (large) file is processed.
            del df_nicotine
        except Exception as error:
            # Keep going with the next file, but record and show why this one
            # failed. `Exception` (instead of a bare except) lets Ctrl-C /
            # kernel interrupts stop the loop.
            failOpen += 1
            failOpenArray.append(fileName)
            print("Failed to process: ", fileName, "-", repr(error))

    print(f"Success opens: {successOpen}\nFailed opens: {failOpen}")

    print("File names of all failed files:")
    print(failOpenArray)

## Data Cleaning

In [None]:
import pandas as pd

#Tracker to track RSIDs
# NOTE(review): not filled anywhere in this cell; the counting cell further down
# re-creates RSID_Tracker before using it.
RSID_Tracker = {}

#Condition if rsid starts with "r"
def rsidIsAllowed(rsid: str):
    """Return True when ``rsid`` is a string that starts with "r".

    Empty strings and non-string values (e.g. None for a missing entry)
    return False instead of raising IndexError / TypeError, so one
    malformed row cannot abort the cleaning of a whole file.
    """
    return isinstance(rsid, str) and rsid.startswith("r")

#Condition if genotype has an empty value or is smaller than 2 letters
def genotypeIsAllowed(genomeValue: str):
    """Return False for the placeholder "--" and for values shorter than two
    characters; return True for everything else."""
    if genomeValue == "--":
        return False
    return len(genomeValue) >= 2

import os


# Safety switch: set to True to filter the parquet files.
read = False

clean_file_direction = "D:/nicotine_parquets/"


if read:
    common_rsid_counter = 0

    # Open every parquet file, keep only the rows whose genotype and rsid
    # pass the two checks, and write the result back under the same name.
    for filename in os.scandir("D:/nicotine_parquets"):
        print(filename, "opened! Operating now...")
        patient = pd.read_parquet(filename)

        # Column contents as plain lists
        OG_genotype = patient["genotype"].tolist()
        OG_RSID = patient["rsid"].tolist()
        OG_pos = patient["position"].tolist()

        print(f"found {len(OG_genotype)} genotypes")

        # Walk the three columns in parallel and keep the accepted rows.
        kept_rows = [
            (snp_id, snp_pos, call)
            for snp_id, snp_pos, call in zip(OG_RSID, OG_pos, OG_genotype)
            if genotypeIsAllowed(call) and rsidIsAllowed(snp_id)
        ]
        newRSID = [row[0] for row in kept_rows]
        newPos = [row[1] for row in kept_rows]
        newGenotype = [row[2] for row in kept_rows]

        # Store the filtered rows as a parquet file
        new_data = {"rsid": newRSID, "position": newPos, "genotype": newGenotype}
        df_new_patient = pd.DataFrame(data=new_data)

        df_new_patient.to_parquet(clean_file_direction + filename.name)
        print("Stored!")

## Count all RSIDs and unique RSIDs

In [None]:
# Flat list with every rsid of every patient (duplicates across patients kept).
allRSIDS_nicotine = []

clean_file_direction = "D:/nicotine_parquets/"
# Safety switch: set to True to re-read all parquet files.
read = False

if read:

    # Iterate through the parquet files
    for filename in os.scandir(clean_file_direction):
        print(filename , 'OPEN')
        # Only the rsid column is needed here, so only that column is loaded.
        file = pd.read_parquet(filename, columns=["rsid"])

        # Add every rsid of this patient in one step.
        allRSIDS_nicotine.extend(file["rsid"].tolist())

        print("Done. Next:")

    # len() of a list cannot fail, so no try/except is needed around it.
    print(len(allRSIDS_nicotine))

In [2]:
# Create .csv file with all listed RSIDS 
import pandas as pd 
# NOTE(review): the export below is commented out, but the following cell reads
# "ALL_RSIDS_nicotine.csv". Re-enable these lines when that file has to be
# (re)generated from allRSIDS_nicotine.
#r = pd.DataFrame({"all_rsids":allRSIDS_nicotine})
#r

#r.to_csv("ALL_RSIDS_nicotine.csv", index=False)

## List rsids in patients and count them

In [None]:
# Maps each rsid to the number of times it appears in the CSV.
RSID_Tracker = {}

frame = pd.read_csv("ALL_RSIDS_nicotine.csv")

print("Opening done.")
print("Counting now.")

# Safety switch: set to True to recount and rewrite the JSON file.
read = False
if read:

    # Count how often every rsid occurs
    for rsid in frame["all_rsids"].tolist():
        RSID_Tracker[rsid] = RSID_Tracker.get(rsid, 0) + 1

    import json

    # Save the counts as a JSON file
    print("Done. Attempting to save as JSON...")
    with open('counted_RSIDS_nicotine.json', 'w') as fp:
        json.dump(RSID_Tracker, fp)

    print("Done!")

## rsids which occur in at least 97% of patients

In [None]:
import json
import pandas as pd

# Number of patients an rsid has to occur in to reach a minimum occurrence of 97%.
# NOTE(review): 418 is presumably the number of patient files — confirm it still
# matches the number of parquet files that were counted.
N_PATIENTS = 418
MIN_SHARE_PERCENT = 97
# Integer ceiling of N_PATIENTS * MIN_SHARE_PERCENT / 100 (406 for 418 patients).
# Unlike int(x) + 1 this does not overshoot when the share is a whole number.
n_minimum_occurence = (N_PATIENTS * MIN_SHARE_PERCENT + 99) // 100
print(n_minimum_occurence)

# Open file
print("Opening JSON FILE")

# `with` closes the file even if parsing fails.
with open("counted_RSIDS_nicotine.json") as f:
    data = json.load(f)

print(len(data))

# List of rsids that reach the threshold. The keys of a dict are already
# unique, so no "not in list" check (quadratic on a large list) is needed.
most_common_rsids_nicotine_patients = [
    key for key, count in data.items() if count >= n_minimum_occurence
]

print(len(most_common_rsids_nicotine_patients))

# Safety switch: set to True to write the CSV with the most common rsids.
store = False

if store:
    print("Done. Attempting to store....")
    common_rsids = pd.DataFrame({"common_rsids":most_common_rsids_nicotine_patients})
    common_rsids.to_csv("common_rsids_nicotine.csv", index=False)
    print("Done storing.")

## Get username of patients with most common rsids

In [None]:
# Read the stored common rsids back in.
c = pd.read_csv("common_rsids_nicotine.csv")

# Plain list of the common rsids; the bare len() call has no visible effect mid-cell.
commons = c["common_rsids"].tolist()
len(commons)

# Result containers
user_with_common_rsids, user_names = [], []

# Switch for the block below
run = True

if run:

    def isFit(patient_rsid_list: list, necessary: list):
        """Return True when every rsid in ``necessary`` occurs in ``patient_rsid_list``.

        Uses a set-based subset test instead of ``value in list`` for each
        required rsid, which was O(len(necessary) * len(patient_rsid_list)).
        An empty ``necessary`` list now returns True (nothing is missing);
        the original divided by its length and raised ZeroDivisionError.
        """
        return set(necessary).issubset(patient_rsid_list)

    clean_file_direction = "D:/nicotine_parquets/"

    # Check every cleaned parquet file and remember the ones that contain
    # all common rsids.
    for j, filename in enumerate(os.scandir(clean_file_direction)):

        # Open file and read
        print(j, filename, 'OPEN')

        file = pd.read_parquet(filename)

        if isFit(file["rsid"].tolist(),commons):
            print(filename,"is fit")
            user_names.append(filename.name)
        else:
            print(filename," NOT FIT !!!!!!")

    print(user_names)

    # Store the names of the fitting files
    users = pd.DataFrame({"userID":user_names})

    users.to_csv("5k_Nicotine_Users_Cleaned.csv", index=False)

: 

## Get username, rsid and genotype and store it in a dataframe

In [1]:
import pandas as pd
import numpy as np
import warnings
import time

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

def get_genotype_values(patientID: str, target_snps: list):
    """Return the genotypes of one patient for the given rsids, in order.

    Reads ``clean_file_direction + patientID`` as a parquet file; the
    module-level names ``clean_file_direction`` and ``user`` must exist when
    the function is called.

    Args:
        patientID: file name of the patient's parquet file.
        target_snps: rsids to look up.

    Returns:
        A list with one entry per element of ``target_snps``: the patient's
        genotype for that rsid, or ``np.nan`` when the patient file does not
        contain it (a message is printed in that case).
    """
    Userfile = pd.read_parquet(clean_file_direction + patientID)
    print(f"Opened {user.index(patientID)}/{len(user)}: {patientID}.")

    start_timestamp = time.time()

    # Build an rsid -> genotype lookup once. The original searched the rsid
    # list with list.index() for every target SNP, i.e. O(n * m) per patient.
    # setdefault keeps the FIRST occurrence of a duplicated rsid, which is
    # what list.index() returned.
    genotype_by_rsid = {}
    for rsid, genotype in zip(Userfile["rsid"].tolist(), Userfile["genotype"].tolist()):
        genotype_by_rsid.setdefault(rsid, genotype)

    new_row_vals = []
    for gene in target_snps:
        if gene in genotype_by_rsid:
            new_row_vals.append(genotype_by_rsid[gene])
        else:
            print(f"ERROR: {gene} couldn't identify val for {patientID}")
            new_row_vals.append(np.nan)

    # Fixed-point formatting; slicing str(float) breaks for very small
    # durations that are rendered in scientific notation.
    process_time = f"{time.time() - start_timestamp:.2f}"
    print(f"This file took {process_time} seconds.\n")

    return new_row_vals
   


users = pd.read_csv('5k_Nicotine_Users_Cleaned.csv')
c = pd.read_csv("common_rsids_nicotine.csv")
user = users["userID"].tolist()
GenoTypes = pd.DataFrame({"userID": user})

clean_file_direction = "D:/nicotine_parquets/"
print("Constructing empty dataframe structure...")
col_vals = c["common_rsids"].tolist()

# One list of genotypes per patient, in the order of col_vals.
genotype_rows = [get_genotype_values(patient_id, col_vals) for patient_id in user]

# Kept for inspection: one list of genotypes per row, aligned with GenoTypes.
gen_vals = pd.Series(genotype_rows, index=GenoTypes.index, dtype=object)
print(gen_vals)

# Build all genotype columns in one step. The original first assigned the
# whole Series of lists to every column and then overwrote each cell with
# .at, which is slow and fragments the frame.
genotype_columns = pd.DataFrame(genotype_rows, columns=col_vals, index=GenoTypes.index)
GenoTypes = pd.concat([GenoTypes, genotype_columns], axis=1)
print("Finished construction.")

GenoTypes.to_csv("Nicotine_Dependence_DF.csv", index=False)
print("Storing done.")

Constructing empty dataframe structure...
Opened 0/404: user10020_file8283_yearofbirth_unknown_sex_unknown.23andme.txt.parquet.
This file took 16.36 seconds.

Opened 1/404: user1020_file507_yearofbirth_unknown_sex_unknown.23andme.txt.parquet.
This file took 28.44 seconds.

Opened 2/404: user1024_file510_yearofbirth_1971_sex_XX.23andme.txt.parquet.
This file took 28.02 seconds.

Opened 3/404: user1028_file514_yearofbirth_unknown_sex_unknown.23andme.txt.parquet.
This file took 28.68 seconds.

Opened 4/404: user1029_file515_yearofbirth_unknown_sex_unknown.23andme.txt.parquet.
This file took 28.51 seconds.

Opened 5/404: user10300_file8552_yearofbirth_unknown_sex_unknown.23andme.txt.parquet.
This file took 17.56 seconds.

Opened 6/404: user10374_file8624_yearofbirth_1973_sex_XX.23andme.txt.parquet.
This file took 28.79 seconds.

Opened 7/404: user1038_file520_yearofbirth_unknown_sex_unknown.23andme.txt.parquet.
This file took 28.46 seconds.

Opened 8/404: user1039_file521_yearofbirth_1958_

In [2]:
## THIS IS THE FINAL DATAFRAME 
# Reload "Nicotine_Dependence_DF.csv" (written by the previous cell) and display it.
finished_nicotine_df = pd.read_csv("Nicotine_Dependence_DF.csv")
finished_nicotine_df

Unnamed: 0,userID,rs2455144,rs693734,rs351615,rs12083131,rs12035499,rs10799145,rs7534404,rs630153,rs770718,...,rs133662,rs4823776,rs132231,rs130191,rs6009503,rs134461,rs137878,rs8142229,rs131749,rs9616812
0,user10020_file8283_yearofbirth_unknown_sex_unk...,AG,TT,AG,GG,GG,GG,AG,GG,CT,...,GT,GG,GG,CT,AG,CT,AG,CC,CC,CC
1,user1020_file507_yearofbirth_unknown_sex_unkno...,GG,CT,AG,AG,AG,TT,GG,GG,CC,...,GT,GG,AG,CT,GG,CT,AG,CT,CT,CT
2,user1024_file510_yearofbirth_1971_sex_XX.23and...,AA,TT,AA,AG,AG,GG,GG,AG,CC,...,GG,GG,AG,CT,AG,TT,GG,CC,CC,TT
3,user1028_file514_yearofbirth_unknown_sex_unkno...,AG,CT,AA,GG,GG,GT,GG,AG,CT,...,GT,AG,GG,TT,AG,TT,GG,CT,CT,CC
4,user1029_file515_yearofbirth_unknown_sex_unkno...,AA,TT,AA,AG,AG,GT,GG,GG,CC,...,GT,AG,AG,CC,AA,TT,GG,CT,CC,CT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,user9822_file8081_yearofbirth_1997_sex_XY.23an...,GG,TT,AG,AG,AA,GG,AG,GG,CC,...,GT,GG,GG,TT,AG,CC,GG,CC,CC,CC
400,user9869_file8128_yearofbirth_unknown_sex_unkn...,AG,CT,AA,AA,AG,TT,AA,GG,CT,...,GT,GG,AG,CC,AG,CC,GG,TT,CT,TT
401,user990_file492_yearofbirth_unknown_sex_unknow...,AG,TT,AG,GG,GG,GG,GG,AG,CC,...,TT,AG,AG,CC,GG,CT,GG,CT,CT,CC
402,user9928_file8189_yearofbirth_unknown_sex_unkn...,GG,CT,AA,AG,GG,TT,AG,AG,TT,...,GT,GG,AG,CT,AG,TT,AG,CC,CC,CT
