## **1.7_DATASET_masters_remove_atypical_genotype_calls.ipynb**

### GOALS of this script:
* part two of the whole merging workflow for the masters
* numbering in accordance with workflow_europe_maize_dataset_prep_masters
* checked the columns of the 600k array info file
* combination/check of:
    * merging_script_final-Copy1.ipynb
    * merging_script_final.ipynb
    * the corresponding number steps
* OUT: filtered_xxx_for_masters.csv

### *Import packages*

In [None]:
import gzip
import allel
import pandas as pd
import numpy as np
import tskit
import tsinfer
import sys
import json
import csv
from IPython.display import SVG
from IPython.display import HTML
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from openpyxl import Workbook
from progressbar import ProgressBar
# Select the seaborn plot style: first "white", then "ticks".
# NOTE(review): the second call presumably replaces the style set by the
# first, which would make the "white" call redundant — confirm before removing.
sns.set_style('white')
sns.set_style('ticks')

### *Import files*

In [None]:
# Load the Unterseer 2016 elite table; the file is tab-separated despite
# its .csv suffix.
final_columns_unterseer_2016_elite = pd.read_csv(
    "final_columns_unterseer_2016_elite_for_masters.csv",
    sep="\t",
)

In [None]:
# Show the loaded Unterseer 2016 elite table as the cell output.
final_columns_unterseer_2016_elite

In [None]:
# Load the Unterseer 2016 landraces table; the file is tab-separated despite
# its .csv suffix.
final_columns_unterseer_2016_landraces = pd.read_csv(
    "final_columns_unterseer_2016_landraces_for_masters.csv",
    sep="\t",
)

In [None]:
# Show the loaded Unterseer 2016 landraces table as the cell output.
final_columns_unterseer_2016_landraces

In [None]:
# Load the Mayer 2017 landraces table; the file is tab-separated despite
# its .csv suffix.
final_columns_mayer_2017_landraces = pd.read_csv(
    "final_columns_mayer_2017_landraces_for_masters.csv",
    sep="\t",
)

In [None]:
# Show the loaded Mayer 2017 landraces table as the cell output.
final_columns_mayer_2017_landraces

In [None]:
# Load the Mayer 2020 DH-lines table; the file is tab-separated despite
# its .csv suffix.
final_columns_mayer_2020_dhlines = pd.read_csv(
    "final_columns_mayer_2020_dhlines_for_masters.csv",
    sep="\t",
)

In [None]:
# Show the loaded Mayer 2020 DH-lines table as the cell output.
final_columns_mayer_2020_dhlines

### *1.7.1 Get an array of the markers from array.txt that are associated with atypical genotype calls*

In [None]:
# Load the 600k array info table (tab-separated). dtype=str asks pandas to
# read every column as text rather than inferring numeric types.
check_strand = pd.read_csv(
    "/Users/kschul38/Documents/tsinfer-project/data/2_processed/europe_maize_dataset_600k/600k_array_info_header_removed.txt",
    sep="\t",
    dtype=str,
)
# Show the table as the cell output.
check_strand

**Print the header for the 600k array info file**

In [None]:
# Number of leading lines to read from the raw 600k array info file.
N = 16

In [None]:
# Read the first N lines of the raw 600k array info file into `header`.
# islice stops quietly at end-of-file, so a file with fewer than N lines
# yields a shorter list instead of raising StopIteration from next().
from itertools import islice

with open("/Users/kschul38/Documents/tsinfer-project/data/1_raw/600k_array_info_raw.txt") as myfile:
    header = list(islice(myfile, N))

In [None]:
# Show the list of header lines read from the raw array info file.
header

**Check the columns**

In [None]:
#Counter(check_strand["ID"]==check_strand["probe_set_ID"])

In [None]:
#Counter(check_strand["organism"])

In [None]:
#Counter(check_strand["genome_acc"])

In [None]:
#Counter(check_strand["chromosome"])

In [None]:
# Tally how often each value occurs in the "allele_A" column.
Counter(check_strand["allele_A"])

In [None]:
# Tally how often each value occurs in the "allele_B" column.
Counter(check_strand["allele_B"])

#### *Create boolean to filter the markers*

In [None]:
# Extract the "alleles_f" column of the array info table as a flat array,
# one entry per row of check_strand.
genotype_call = (
    check_strand["alleles_f"]
    .values
    .ravel()
)

In [None]:
# Sanity check: the number of genotype calls extracted (one per table row).
len(genotype_call)

In [None]:
# Collect the distinct genotype-call values; pd.unique keeps them in order
# of first appearance, which the positional removal in a later cell relies on.
unique_values_array = pd.unique(genotype_call)

In [None]:
#unique_values_array

In [None]:
#look at complete list
#unique_values_array

In [None]:
# Drop the first six distinct values (treated here as the "typical" genotype
# calls) so that only the atypical calls, e.g. indels, remain.
# NOTE(review): this removal is purely positional — it assumes the six
# typical calls are the first six distinct values encountered in the file.
# Re-check the printed list if the input file or its row order changes.
unique_values_array = np.delete(unique_values_array, list(range(6)))

In [None]:
#check that it only contains a-typical genotype calls
#unique_values_array

In [None]:
# Boolean mask over the "alleles_f" column:
#   True  -> the call is not in the atypical list (typical call)
#   False -> the call is in the atypical list
array_marker_filter_table = ~(
    check_strand["alleles_f"].isin(unique_values_array)
)

In [None]:
# Show the typical/atypical boolean mask as the cell output.
array_marker_filter_table

In [None]:
# Count the True (typical) and False (atypical) entries of the mask.
Counter(array_marker_filter_table)

#### *Apply the boolean to the array-marker column*

In [None]:
# Start from the full "ID" column of the array info table. Despite the
# variable name, this still holds ALL marker IDs until the mask is applied
# in a later cell.
array_non_typical_genotype_calls_marker = check_strand["ID"]

In [None]:
#list of all marker IDs
#array_non_typical_genotype_calls_marker

In [None]:
# Convert the marker-ID Series into a plain NumPy array.
array_non_typical_genotype_calls_marker = (
    array_non_typical_genotype_calls_marker.to_numpy()
)

In [None]:
#array of all marker IDs
#array_non_typical_genotype_calls_marker

In [None]:
# Keep only the marker IDs whose call is atypical: the mask is inverted so
# that True marks atypical rows, then used to index the ID array.
array_non_typical_genotype_calls_marker = array_non_typical_genotype_calls_marker[
    (~array_marker_filter_table).to_numpy()
]

In [None]:
# Show the marker IDs that are associated with atypical genotype calls.
array_non_typical_genotype_calls_marker

In [None]:
# Number of marker IDs associated with atypical genotype calls.
len(array_non_typical_genotype_calls_marker)

**Reapply the boolean to the 600k array info txt**

In [None]:
# Row mask for the array info table: True where the row's "ID" is not one of
# the atypical-call marker IDs.
filter_array_600k = ~(
    check_strand["ID"].isin(array_non_typical_genotype_calls_marker)
)

In [None]:
# Count rows to keep (True) versus rows to drop (False).
Counter(filter_array_600k)

In [None]:
# Keep only the array info rows selected by the boolean row mask.
filtered_array_600k = check_strand.loc[filter_array_600k]

In [None]:
# Show the filtered array info table as the cell output.
filtered_array_600k

In [None]:
# Tally the remaining "alleles_f" values after filtering.
Counter(filtered_array_600k["alleles_f"])

In [None]:
# Tally the remaining "allele_A" values after filtering.
Counter(filtered_array_600k["allele_A"])

In [None]:
# Tally the remaining "allele_B" values after filtering.
Counter(filtered_array_600k["allele_B"])

### *1.7.2 Filter the dataset - Unterseer_elite_2016*

*Create boolean based on Unterseer_elite_2016*

In [None]:
#615884 row number of the table before filtering

In [None]:
# Row mask for the elite table: first test whether each
# "variants/AD_2016_elite" value is one of the atypical-call marker IDs, then
# invert, so True marks rows to keep.
filter_boolean_unterseer_elite_2016 = ~(
    final_columns_unterseer_2016_elite["variants/AD_2016_elite"].isin(
        array_non_typical_genotype_calls_marker
    )
)

In [None]:
# Count rows to keep (True) versus rows to drop (False) in the elite table.
Counter(filter_boolean_unterseer_elite_2016)

In [None]:
#615884-6752=609132 row number of the table after filtering

In [None]:
# Select the elite-table rows flagged True by the row mask.
filtered_unterseer_2016_elite_right_pos = final_columns_unterseer_2016_elite.loc[
    filter_boolean_unterseer_elite_2016
]

In [None]:
#the filtered table
#filtered_unterseer_2016_elite_right_pos

### *1.7.3 Filter the dataset - Unterseer_landraces_2016*

*Create boolean based on Unterseer_landraces_2016*

In [None]:
#615884 row number of the table before filtering

In [None]:
# Row mask for the landraces table: first test whether each
# "variants/AD_2016_landraces" value is one of the atypical-call marker IDs,
# then invert, so True marks rows to keep.
filter_boolean_unterseer_landraces_2016 = ~(
    final_columns_unterseer_2016_landraces["variants/AD_2016_landraces"].isin(
        array_non_typical_genotype_calls_marker
    )
)

In [None]:
# Count rows to keep (True) versus rows to drop (False) in the landraces table.
Counter(filter_boolean_unterseer_landraces_2016)

In [None]:
#615884-6752=609132 row number of the table after filtering

In [None]:
# Select the landraces-table rows flagged True by the row mask.
filtered_unterseer_2016_landraces_right_pos = final_columns_unterseer_2016_landraces.loc[
    filter_boolean_unterseer_landraces_2016
]

In [None]:
#the filtered table
#filtered_unterseer_2016_landraces_right_pos

### *1.7.4 Filter the dataset - Mayer_2017_landraces*

*Create boolean based on Mayer_2017_landraces*

In [None]:
#616201 row number of the table before filtering

In [None]:
# Row mask for the Mayer 2017 table: first test whether each
# "index_2017_landraces" value is one of the atypical-call marker IDs, then
# invert, so True marks rows to keep.
filter_boolean_mayer_2017_landraces_suffix = ~(
    final_columns_mayer_2017_landraces["index_2017_landraces"].isin(
        array_non_typical_genotype_calls_marker
    )
)

In [None]:
# Count rows to keep (True) versus rows to drop (False) in the Mayer 2017 table.
Counter(filter_boolean_mayer_2017_landraces_suffix)

In [None]:
#616201-6759=609442 row number of the table after filtering

In [None]:
# Select the Mayer 2017 rows flagged True by the row mask.
filtered_mayer_2017_landraces_suffix = final_columns_mayer_2017_landraces.loc[
    filter_boolean_mayer_2017_landraces_suffix
]

In [None]:
#the filtered table
#filtered_mayer_2017_landraces_suffix

### *1.7.5 Filter the dataset - Mayer_2020_dhlines*

*Create boolean based on Mayer_2020_dhlines*

In [None]:
#616201 row number of the table before filtering

In [None]:
# Row mask for the Mayer 2020 table: first test whether each "marker_2020_dh"
# value is one of the atypical-call marker IDs, then invert, so True marks
# rows to keep.
filter_boolean_mayer_2020_dhlines_suffix = ~(
    final_columns_mayer_2020_dhlines["marker_2020_dh"].isin(
        array_non_typical_genotype_calls_marker
    )
)

In [None]:
# Count rows to keep (True) versus rows to drop (False) in the Mayer 2020 table.
Counter(filter_boolean_mayer_2020_dhlines_suffix)

In [None]:
#616201-6759=609442 row number of the table after filtering

In [None]:
# Select the Mayer 2020 rows flagged True by the row mask.
filtered_mayer_2020_dhlines_suffix = final_columns_mayer_2020_dhlines.loc[
    filter_boolean_mayer_2020_dhlines_suffix
]

In [None]:
# Show the filtered Mayer 2020 DH-lines table as the cell output.
filtered_mayer_2020_dhlines_suffix

### *1.7.6 Check if tables now only contain typical genotype calls*

**Unterseer_2016 elite**

In [None]:
#Counter(final_columns_unterseer_2016_elite["alleles_2016_elite"])

In [None]:
# Tally the "alleles_2016_elite" values that remain after filtering.
Counter(filtered_unterseer_2016_elite_right_pos["alleles_2016_elite"])

**Unterseer_2016 landraces**

In [None]:
#Counter(final_columns_unterseer_2016_landraces["alleles_2016_landraces"])

In [None]:
# Tally the "alleles_2016_landraces" values that remain after filtering.
Counter(filtered_unterseer_2016_landraces_right_pos["alleles_2016_landraces"])

**Mayer_2017 landraces**

In [None]:
# Unfiltered Mayer 2017 table without its index, chromosome, position and
# quality columns; only the remaining columns are inspected below.
mayer_2017_landraces_get_alleles_before_filter = final_columns_mayer_2017_landraces.drop(
    columns=[
        'index_2017_landraces',
        'chr_v4_2017_landraces',
        'pos_v4_2017_landraces',
        'quality_2017_landraces',
    ]
)

In [None]:
#mayer_2017_landraces_get_alleles_before_filter

In [None]:
# Flatten every remaining cell of the unfiltered table into one 1-D array.
mayer_2017_all_geno_values_before_filter = np.ravel(
    mayer_2017_landraces_get_alleles_before_filter.values
)

In [None]:
# Distinct values present in the unfiltered Mayer 2017 table.
unique_values_mayer_2017_before_filter = pd.unique(mayer_2017_all_geno_values_before_filter)

In [None]:
# Show the distinct values found before filtering.
unique_values_mayer_2017_before_filter

*Filtered*

In [None]:
# Filtered Mayer 2017 table without its index, chromosome, position and
# quality columns; only the remaining columns are inspected below.
mayer_2017_landraces_get_alleles = filtered_mayer_2017_landraces_suffix.drop(
    columns=[
        'index_2017_landraces',
        'chr_v4_2017_landraces',
        'pos_v4_2017_landraces',
        'quality_2017_landraces',
    ]
)

In [None]:
#mayer_2017_landraces_get_alleles

In [None]:
# Flatten every remaining cell of the filtered table into one 1-D array.
mayer_2017_all_geno_values = np.ravel(
    mayer_2017_landraces_get_alleles.values
)

In [None]:
# Distinct values present in the filtered Mayer 2017 table.
unique_values_mayer_2017 = pd.unique(mayer_2017_all_geno_values)

In [None]:
# Show the distinct values found after filtering.
unique_values_mayer_2017

**Mayer_2020 dhlines**

In [None]:
# Unfiltered Mayer 2020 table without its marker, chromosome, position and
# quality columns; only the remaining columns are inspected below.
mayer_2020_dhlines_get_alleles_before_filter = final_columns_mayer_2020_dhlines.drop(
    columns=[
        'marker_2020_dh',
        'chr_2020_dh',
        'pos_2020_dh',
        'quality_2020_dh',
    ]
)

In [None]:
# Show the unfiltered Mayer 2020 table with the four columns dropped.
mayer_2020_dhlines_get_alleles_before_filter

In [None]:
# Flatten every remaining cell of the unfiltered table into one 1-D array.
mayer_2020_all_geno_values_before_filter = np.ravel(
    mayer_2020_dhlines_get_alleles_before_filter.values
)

In [None]:
# Distinct values present in the unfiltered Mayer 2020 table.
unique_values_mayer_2020_before_filter = pd.unique(mayer_2020_all_geno_values_before_filter)

In [None]:
# Show the distinct values found before filtering.
unique_values_mayer_2020_before_filter

*Filtered*

In [None]:
# Filtered Mayer 2020 table without its marker, chromosome, position and
# quality columns; only the remaining columns are inspected below.
mayer_2020_dhlines_get_alleles = filtered_mayer_2020_dhlines_suffix.drop(
    columns=[
        'marker_2020_dh',
        'chr_2020_dh',
        'pos_2020_dh',
        'quality_2020_dh',
    ]
)

In [None]:
# Show the filtered Mayer 2020 table with the four columns dropped.
mayer_2020_dhlines_get_alleles

In [None]:
# Flatten every remaining cell of the filtered table into one 1-D array.
mayer_2020_all_geno_values = np.ravel(
    mayer_2020_dhlines_get_alleles.values
)

In [None]:
# Distinct values present in the filtered Mayer 2020 table.
unique_values_mayer_2020 = pd.unique(mayer_2020_all_geno_values)

In [None]:
# Show the distinct values found after filtering.
unique_values_mayer_2020

## **OUTPUT**

### **Write to file** (the `to_csv` calls below are commented out; uncomment them to write the output files)

In [None]:
#filtered_unterseer_2016_elite_right_pos.to_csv("filtered_unterseer_2016_elite_for_masters.csv", sep="\t", index = False)

In [None]:
#filtered_unterseer_2016_landraces_right_pos.to_csv("filtered_unterseer_2016_landraces_for_masters.csv", sep="\t", index = False)

In [None]:
#filtered_mayer_2017_landraces_suffix.to_csv("filtered_mayer_2017_landraces_for_masters.csv",sep="\t", index = False)

In [None]:
#filtered_mayer_2020_dhlines_suffix.to_csv("filtered_mayer_2020_dhlines_for_masters.csv",sep="\t", index = False)