### One Hot Encoding Gene Sequence

Here is a small program to one hot encode any gene sequence independently of length.
I tried a method from scratch (by-hand) and also one using a package and both gave me the same results.

In [22]:
import numpy as np
import pandas as pd

#Method from scratch

list1 = ["A", "B", "C","D"]
list2 = ["A", "E", "G","D"]
list3 = ["B", "G", "A","F"]

# all features
all_features = sorted(set(list1 + list2 + list3))

# Create function that one encodes the features
def one_hot_encoding(features, all_features):
    one_hot = []
    for feature in all_features:
        if feature in features:
            one_hot.append(1)
        else:
            one_hot.append(0)
    return one_hot

# One hot encode the features
one_hot_encoding(list1, all_features)

df = pd.DataFrame(columns=all_features)
df["samples"] = ["sample1", "sample2", "sample3"]
df = df.set_index("samples")
# Adding to dataframe in a for loop

for i, sample in enumerate([list1, list2, list3]):
    df.loc["sample{}".format(i+1)] = one_hot_encoding(sample, all_features)


# Setting samples as index

print(df)

         A  B  C  D  E  F  G
samples                     
sample1  1  1  1  1  0  0  0
sample2  1  0  0  1  1  0  1
sample3  1  1  0  0  0  1  1


In [1]:
import numpy as np

# Define the sequence of bases
seq = 'CAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAAGCCCT'

# Create a dictionary to map each base to an index
base_to_index = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

one_hot = np.zeros((len(seq), 4))

# Loop over the sequence and set the appropriate element to 1
for i, base in enumerate(seq):
    one_hot[i, base_to_index[base]] = 1

print(one_hot)

[[0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]]


This program one-hot encodes a gene sequence with four base pairs. I also tried with a package.

In [2]:
from sklearn.preprocessing import OneHotEncoder

#one-hot encoder
one_hot_encoder = OneHotEncoder(sparse=False)

seq = 'CAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAAGCCCT'
#Splitting into list
seq_list = np.array(list(map(str,seq)))

one_hot = one_hot_encoder.fit_transform(seq_list.reshape(-1, 1))
one_hot

array([[0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],


### EXAMPLE for Project

I think the first method serves better if we need to one encode a bunch of sequences like I think we will need to. We will only need to provide a list of features and it would one encode them like the following. We could get these features from a dataframe that has **ALL GENES in one column** and the **other column can contain a list of all the features** for that gene! Then we can extract all lists -> make a set -> **Master list** of features.

In [32]:
#Lets assume these are all the feautures we want to one-hot encode. The "Master List"

features = list(set(['CAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCT',
 'CCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACC',
 'ACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAA',
 'AAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTA',
 'CTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCC',
 'CCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAAC',
 'AACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAA',
 'TAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCT',
 'CCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACC',
 'ACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAA',
 'AAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTA',
 'CTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCC',
 'CCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAAC',
 'AACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAA',
 'TAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCT',
 'CCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACC',
 'ACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAA',
 'AAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTA',
 'CTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCC',
 'CCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAAC',
 'AACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAA',
 'TAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCT',
 'CCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACC',
 'ACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAA',
 'AAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTA',
 'CTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCC',
 'CCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAAC',
 'AACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAA',
 'TAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCT',
 'CCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACC',
 'ACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAA',
 'AAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTA',
 'CTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCC',
 'CCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAAC',
 'AACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAA',
 'TAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCT',
 'CCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACC',
 'ACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAA',
 'AAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTA',
 'CTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCC',
 'CCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAAC',
 'AACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAA',
 'TAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCT',
 'CCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACC',
 'ACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAA',
 'AAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTA',
 'CTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCC',
 'CCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAAC',
 'AACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAA',
 'TAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCT',
 'CCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACC',
 'ACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAA',
 'AAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTA',
 'CTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCC',
 'CCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAAC',
 'AACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAA',
 'TAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCT',
 'CCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACC',
 'ACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAA',
 'AAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTA',
 'CTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCC',
 'CCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAAC',
 'AACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTTAACCCTTA',
 'TAAACCCTAAACCCTAAACCCTAAACCCTTAACCCTTAACCCTTAACCCT',
 'CCTAAACCCTAAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACC',
 'ACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAA',
 'TAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTT',
 'CTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCC',
 'CCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAAC',
 'AACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTA',
 'TTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCT',
 'CCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACC',
 'ACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTAAACCCTAAA',
 'TAACCCTTAACCCTTAACCCTTAACCCTAAACCCTAAACCCTTAACCCTT',
 'CTTAACCCTTAACCCTAAACCCTAAACCCTTAACCCTTAACCCTTAACCC',
 'CCCTAAACCCTAAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAAC',
 'AACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTA',
 'TTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCT',
 'CCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACC',
 'ACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAA',
 'TAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTT',
 'CTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCC',
 'CCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTAAAC',
 'AACCCTTAACCCTTAACCCTTAACCCTTAACCCTAAACCCTAAACCCTAA',
 'TTAACCCTTAACCCTTAACCCTAAACCCTAAACCCTAAACCCTAAACCAT',
 'CCTTAACCCTAAACCCTAAACCCTAAACCCTAAACCATTTTATTCTCAAA',
 'ACCCTAAACCCTAAACCCTAAACCATTTTATTCTCAAATACCCCTATTCT',
 'AAACCCTAAACCATTTTATTCTCAAATACCCCTATTCTCAAATACCCTTT',
 'ATTTTATTCTCAAATACCCCTATTCTCAAATACCCTTTTATTCTCAAATA',
 'AATACCCCTATTCTCAAATACCCTTTTATTCTCAAATACCCCTATTCTCA',
 'CTCAAATACCCTTTTATTCTCAAATACCCCTATTCTCAAATACCCATTTT',
 'TTTATTCTCAAATACCCCTATTCTCAAATACCCATTTTCTCAAATACCCC',
 'TACCCCTATTCTCAAATACCCATTTTCTCAAATACCCCTATTCTCAAATA',
 'CAAATACCCATTTTCTCAAATACCCCTATTCTCAAATACCCCTATTCTCA',
 'TTCTCAAATACCCCTATTCTCAAATACCCCTATTCTCAAATATCCCTATT',
 'CCTATTCTCAAATACCCCTATTCTCAAATATCCCTATTCTCAAATACCCT',
 'TACCCCTATTCTCAAATATCCCTATTCTCAAATACCCTTTTATTCTCAAA',
 'CAAATATCCCTATTCTCAAATACCCTTTTATTCTCAAATACCCCTATTCT',
 'TTCTCAAATACCCTTTTATTCTCAAATACCCCTATTCTCAAATACCCCTA',
 'CTTTTATTCTCAAATACCCCTATTCTCAAATACCCCTATTCTCTAATACC',
 'AATACCCCTATTCTCAAATACCCCTATTCTCTAATACCCCTATTCTCAAA',
 'CTCAAATACCCCTATTCTCTAATACCCCTATTCTCAAATACCCTTTTATT',
 'TATTCTCTAATACCCCTATTCTCAAATACCCTTTTATTCTCTAATACCCC',
 'CCCCTATTCTCAAATACCCTTTTATTCTCTAATACCCCTATTCTCAAATA',
 'AATACCCTTTTATTCTCTAATACCCCTATTCTCAAATACCCCTATTCTCA',
 'TTCTCTAATACCCCTATTCTCAAATACCCCTATTCTCAAATACCCCTATT',
 'CCTATTCTCAAATACCCCTATTCTCAAATACCCCTATTCTCAAATACCCC',
 'TACCCCTATTCTCAAATACCCCTATTCTCAAATACCCCTATTCTCTAATA',
 'CAAATACCCCTATTCTCAAATACCCCTATTCTCTAATACCCCTATTCTCA',
 'TTCTCAAATACCCCTATTCTCTAATACCCCTATTCTCAAATACCCCTATT',
 'CCTATTCTCTAATACCCCTATTCTCAAATACCCCTATTCTCAAATACCCC',
 'TACCCCTATTCTCAAATACCCCTATTCTCAAATACCCCTATTCTCAAATA',
 'CAAATACCCCTATTCTCAAATACCCCTATTCTCAAATACCCTTTTATTCT',
 'TTCTCAAATACCCCTATTCTCAAATACCCTTTTATTCTCAAATACCCCTA',
 'CCTATTCTCAAATACCCTTTTATTCTCAAATACCCCTATTCTCAAATACC',
 'TACCCTTTTATTCTCAAATACCCCTATTCTCAAATACCCCTATTCTCAAA',
 'CTCAAATACCCCTATTCTCAAATACCCCTATTCTCAAATACCCTTTTATT',
 'TATTCTCAAATACCCCTATTCTCAAATACCCTTTTATTCTCAAATACCTA',
 'CCCCTATTCTCAAATACCCTTTTATTCTCAAATACCTATTCTCAAATACC',
 'AATACCCTTTTATTCTCAAATACCTATTCTCAAATACCCCTATTCTCTAA',
 'TTCTCAAATACCTATTCTCAAATACCCCTATTCTCTAATACCCCTATTCT',
 'TATTCTCAAATACCCCTATTCTCTAATACCCCTATTCTCAAATACTCCTA',
 'CCCCTATTCTCTAATACCCCTATTCTCAAATACTCCTATTCTCAAATACC',
 'AATACCCCTATTCTCAAATACTCCTATTCTCAAATACCCCTATTCTCAAA',
 'CTCAAATACTCCTATTCTCAAATACCCCTATTCTCAAATACCCTTTTATT',
 'TATTCTCAAATACCCCTATTCTCAAATACCCTTTTATTCTCAAATACCCC',
 'CCCCTATTCTCAAATACCCTTTTATTCTCAAATACCCCTATTCTCTAATA',
 'AATACCCTTTTATTCTCAAATACCCCTATTCTCTAATACCCCTATTCTCA',
 'TTCTCAAATACCCCTATTCTCTAATACCCCTATTCTCAAATACCCTTTTA',
 'CCTATTCTCTAATACCCCTATTCTCAAATACCCTTTTATTCTCAAATACC',
 'TACCCCTATTCTCAAATACCCTTTTATTCTCAAATACCCCTATTCTCAAA',
 'CAAATACCCTTTTATTCTCAAATACCCCTATTCTCAAATACCCCTATTCT',
 'TATTCTCAAATACCCCTATTCTCAAATACCCCTATTCTCTAATACCCCTA',
 'CCCCTATTCTCAAATACCCCTATTCTCTAATACCCCTATTCTCAAATACC',
 'AATACCCCTATTCTCTAATACCCCTATTCTCAAATACCCCTATTCTCAAA',
 'CTCTAATACCCCTATTCTCAAATACCCCTATTCTCAAATACCCTTTTATT',
 'TATTCTCAAATACCCCTATTCTCAAATACCCTTTTATTCTCAAATACCCC',
 'CCCCTATTCTCAAATACCCTTTTATTCTCAAATACCCCTATTCTCAAATA',
 'AATACCCTTTTATTCTCAAATACCCCTATTCTCAAATACCCCTATTCTCA',
 'TTCTCAAATACCCCTATTCTCAAATACCCCTATTCTCAAATACCCTTTTA',
 'CCTATTCTCAAATACCCCTATTCTCAAATACCCTTTTATTCTCAAATACC',
 'TACCCCTATTCTCAAATACCCTTTTATTCTCAAATACCTATTCTCAAATA',
 'CAAATACCCTTTTATTCTCAAATACCTATTCTCAAATACCCCTATTCTCT',
 'TATTCTCAAATACCTATTCTCAAATACCCCTATTCTCTAATACCCCTATT',
 'CCTATTCTCAAATACCCCTATTCTCTAATACCCCTATTCTCAAATACTCC',
 'TACCCCTATTCTCTAATACCCCTATTCTCAAATACTCCTATTCTCAAATA',
 'CTAATACCCCTATTCTCAAATACTCCTATTCTCAAATACCCCTATTCTCA',
 'TTCTCAAATACTCCTATTCTCAAATACCCCTATTCTCAAATACCCTTTTA',
 'CCTATTCTCAAATACCCCTATTCTCAAATACCCTTTTATTCTCAAATACC',
 'TACCCCTATTCTCAAATACCCTTTTATTCTCAAATACCTCTATTCTCTAA',
 'CAAATACCCTTTTATTCTCAAATACCTCTATTCTCTAATACCCATTCTCA',
 'TATTCTCAAATACCTCTATTCTCTAATACCCATTCTCAAATACCATTTTA',
 'CCTCTATTCTCTAATACCCATTCTCAAATACCATTTTATTCTCAAATACC',
 'AATACCCATTCTCAAATACCATTTTATTCTCAAATACCTCAATTCTCAAA',
 'CAAATACCATTTTATTCTCAAATACCTCAATTCTCAAATACTCCTATTCT',
 'TATTCTCAAATACCTCAATTCTCAAATACTCCTATTCTCAAATACCCCTA',
 'CCTCAATTCTCAAATACTCCTATTCTCAAATACCCCTAATCTCAAATACC',
 'AATACTCCTATTCTCAAATACCCCTAATCTCAAATACCCTTATTCTCAAA',
 'CTCAAATACCCCTAATCTCAAATACCCTTATTCTCAAATACCCCTATTTT',
 'TAATCTCAAATACCCTTATTCTCAAATACCCCTATTTTCAAATACCCCTA',
 'CCCTTATTCTCAAATACCCCTATTTTCAAATACCCCTATTCTCTAATACC',
 'AATACCCCTATTTTCAAATACCCCTATTCTCTAATACCCTTATTCTCAAA',
 'TTCAAATACCCCTATTCTCTAATACCCTTATTCTCAAATACCCCTATTCT']))

In [88]:
# Lets assume that there are 10 genes that have 10 random sequences from the "Master list" above

# We will create lists of 10 random sequences from the master list
import random
random.seed(1)
genes = list()

for i in range(10):
    genes.append(random.sample(features, 10))

# Initialize DataFrame
df = pd.DataFrame()

#Making indexes Gene_1 to Gene_10
df["Genes"] = ["Gene_" + str(i) for i in range(1,11)]
df = df.set_index("Genes")

# Features 

for i in range(10):
    df["Features"] = genes
print("You could imagine our data frame could look like this:")
display(df)

You could imagine our data frame could look like this:


Unnamed: 0_level_0,Features
Genes,Unnamed: 1_level_1
Gene_1,[TTCTCAAATACCCCTATTCTCAAATACCCCTATTCTCAAATATCC...
Gene_2,[CCTCAATTCTCAAATACTCCTATTCTCAAATACCCCTAATCTCAA...
Gene_3,[CAAATACCCTTTTATTCTCAAATACCTCTATTCTCTAATACCCAT...
Gene_4,[AATACCCCTATTCTCAAATACCCCTATTCTCTAATACCCCTATTC...
Gene_5,[AATACCCCTATTCTCAAATACCCCTATTCTCTAATACCCCTATTC...
Gene_6,[CCCCTATTCTCAAATACCCTTTTATTCTCAAATACCCCTATTCTC...
Gene_7,[CCTCTATTCTCTAATACCCATTCTCAAATACCATTTTATTCTCAA...
Gene_8,[TTCAAATACCCCTATTCTCTAATACCCTTATTCTCAAATACCCCT...
Gene_9,[CTCAAATACTCCTATTCTCAAATACCCCTATTCTCAAATACCCTT...
Gene_10,[AACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAAC...


You can imagine the above dataframe being what we can generate (of course a lot MORE features), we could take all those features and make a Master List called Features or all_Features. 

In [95]:
lists_of_features = df["Features"].values.flatten().tolist()

# Nested list comprehension to create a single flattened list
flattened_list = [item for sublist in lists_of_features for item in sublist]

#making a set of the flattened lists features
all_features = sorted(list(set(flattened_list)))


Then we proceed to one hot encode the features.

In [105]:
df_one_hot = pd.DataFrame()

# Create function that one encodes the features
def one_hot_encoding(features, all_features):
    one_hot = []
    for feature in all_features:
        if feature in features:
            one_hot.append(1)
        else:
            one_hot.append(0)
    return one_hot

#Making indexes Gene_1 to Gene_10
df_one_hot["Genes"] = ["Gene_" + str(i) for i in range(1,11)]
df_one_hot = df_one_hot.set_index("Genes")

# Create columns names
features_names = ["feature_" + str(i+1) for i in range(len(all_features))]

# Add 100 columns with the names from the list to the DataFrame
for col in features_names:
    df_one_hot[col] = ""

# Adding one hot encoded features to the data frame
for i in range(len(genes)):
    df_one_hot.loc["Gene_{}".format(i+1)] = one_hot_encoding(genes[i], all_features)

# Adding a target phase (random from 0 to 2*pi for this example)
df_one_hot.insert(0,"Phase_Target" , np.random.uniform(0, 2*np.pi, size=len(df_one_hot)))
df_one_hot

Unnamed: 0_level_0,Phase_Target,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_61,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70
Genes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Gene_1,1.515397,0,0,1,0,0,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
Gene_2,2.321466,0,0,0,0,0,1,0,0,1,...,0,0,1,0,0,0,1,0,0,1
Gene_3,1.105931,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
Gene_4,2.055039,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Gene_5,0.877363,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
Gene_6,1.899757,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Gene_7,3.713838,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
Gene_8,4.376705,0,1,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
Gene_9,5.130159,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
Gene_10,1.502693,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


As can be seen, even if we plan to one hot encode the features and not the basepairs, this method hot encodes them making it the right shape too!
We can use the one hot encode and multiple conventional ML models to predict the features or sequences that predict a value (Phase in this example). 

I am going to put everything together on a function.

In [198]:
# Writing function to do everything in one go
def create_one_hot_encoding(df):

    '''
    This function takes a dataframe with a column named "Features" and creates a one hot encoding of the features.
    
    Parameters
    ----------
    df : pandas.DataFrame
    Dataframe with a column named "Features" that contains a list of features and a column named "Genes" that contains the gene names.
    Dataframe should also contain column called "Target" that contains the target.


        
    Returns
    ------- 
    df_one_hot : pandas.DataFrame
    Dataframe with one hot encoded features.
    
    '''
  
    #Preprocessing step

    # Making a list of all the features
    lists_of_features = df["Features"].values.flatten().tolist()

    # Nested list comprehension to create a single flattened list
    flattened_list = [item for sublist in lists_of_features for item in sublist]

    #making a set of the flattened lists features (removing duplicates)
    all_features = list(set(flattened_list))

    #  Creating one hot encoding dataframe keeping the same Gene names as index and the Target
    df_one_hot = pd.DataFrame(index=df.index)

    # Create columns names
    features_names = ["feature_" + str(i+1) for i in range(len(all_features))]

    # Add 100 columns with the names from the list to the DataFrame
    for col in features_names:
        df_one_hot[col] = ""

    # Adding one hot encoded features to the data frame
    for i,gene in enumerate(df.index):
        df_one_hot.loc[gene] = one_hot_encoding(df["Features"].values[i], all_features)

    #Adding the target from df
    df_one_hot.insert(0,"Target" , df["Target"])

    return df_one_hot

Let's try with assuming a dataframe looks like:

In [213]:
all_features = list(set(flattened_list))
len(all_features)

70

In [202]:
#Dataframe nees to have a column named "Features" and a column named "Target"
display(df)
#Exporting the dataframe to a csv file
df.to_csv("test.csv")


Unnamed: 0_level_0,Target,Features
Genes,Unnamed: 1_level_1,Unnamed: 2_level_1
Gene_1,1.717405,[TTCTCAAATACCCCTATTCTCAAATACCCCTATTCTCAAATATCC...
Gene_2,4.481926,[CCTCAATTCTCAAATACTCCTATTCTCAAATACCCCTAATCTCAA...
Gene_3,3.121352,[CAAATACCCTTTTATTCTCAAATACCTCTATTCTCTAATACCCAT...
Gene_4,0.739333,[AATACCCCTATTCTCAAATACCCCTATTCTCTAATACCCCTATTC...
Gene_5,2.347276,[AATACCCCTATTCTCAAATACCCCTATTCTCTAATACCCCTATTC...
Gene_6,5.212245,[CCCCTATTCTCAAATACCCTTTTATTCTCAAATACCCCTATTCTC...
Gene_7,0.262397,[CCTCTATTCTCTAATACCCATTCTCAAATACCATTTTATTCTCAA...
Gene_8,1.883738,[TTCAAATACCCCTATTCTCTAATACCCTTATTCTCAAATACCCCT...
Gene_9,3.030171,[CTCAAATACTCCTATTCTCAAATACCCCTATTCTCAAATACCCTT...
Gene_10,2.389896,[AACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAACCCTTAAC...


In [201]:
# Calling the function
df_one_hot = create_one_hot_encoding(df)
df_one_hot


Unnamed: 0_level_0,Target,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_61,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70
Genes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Gene_1,1.717405,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
Gene_2,4.481926,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
Gene_3,3.121352,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
Gene_4,0.739333,0,1,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Gene_5,2.347276,0,0,0,0,0,1,1,1,0,...,0,0,0,0,0,1,0,0,0,0
Gene_6,5.212245,0,0,0,0,1,1,0,0,0,...,0,0,0,1,0,1,1,0,0,0
Gene_7,0.262397,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
Gene_8,1.883738,0,0,0,0,0,0,0,0,1,...,0,1,0,0,1,0,0,0,0,0
Gene_9,3.030171,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,1,0,1,0
Gene_10,2.389896,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
