In [1]:
# Import needed libraries
import pandas as pd
import ast

In [2]:
# Load Dataset.csv into a DataFrame, report how many rows it has,
# and preview the first five rows as the cell's output.
dataset = pd.read_csv("Dataset.csv")
print(f"Dataset Size: {dataset.shape[0]}")
dataset.head(5)

Dataset Size: 11175


Unnamed: 0,ID,Unaligned Sequence,A,R,N,D,C,E,Q,G,...,Total Gaps in Alignment,Average Gap Length,Sequence Length_y,Gap Count,Percentage Gaps,Mutations from Consensus,Experimental,Resolution,R Value,R Free
0,2VO6_A,ESVKEFLAKAKEDFLKKWENPAQNTAHLQFERIKTLGTGSFGRVML...,0.06006,0.045045,0.045045,0.051051,0.006006,0.078078,0.036036,0.06006,...,53679017,3.557247,5024,4691,93.371815,5024,x-ray diffraction,1.97,0.172,0.229
1,2VO6_I,TTYADFIASGRTGRRNAIHD,0.15,0.15,0.05,0.1,0.0,0.0,0.0,0.1,...,53679017,3.557247,5024,5004,99.601911,5024,x-ray diffraction,1.97,0.172,0.229
2,3DNK_A,GGPSVFLFPPKPKDTLMISRTPEVTCVVVDVSHEDPEVKFNWYVDG...,0.028708,0.028708,0.047847,0.047847,0.019139,0.07177,0.043062,0.043062,...,53679017,3.557247,5024,4815,95.839968,5024,x-ray diffraction,2.84,0.24,0.32
3,3DNK_B,GGPSVFLFPPKPKDTLMISRTPEVTCVVVDVSHEDPEVKFNWYVDG...,0.028571,0.028571,0.047619,0.047619,0.019048,0.071429,0.042857,0.042857,...,53679017,3.557247,5024,4814,95.820064,5024,x-ray diffraction,2.84,0.24,0.32
4,3GLT_A,GKLSLQDVAELIRARACQRVVVMVGAGISTPSGIPDFRSPGSGLYS...,0.062044,0.069343,0.021898,0.054745,0.018248,0.058394,0.032847,0.072993,...,53679017,3.557247,5024,4750,94.546178,5024,x-ray diffraction,2.1,0.195,0.228


In [5]:
# For every column, flag whether it contains at least one missing value
missing_by_column = dataset.isna().any()
print(f"Values Missing:\n{missing_by_column}")

Values Missing:
ID                                       False
Unaligned Sequence                       False
A                                        False
R                                        False
N                                        False
D                                        False
C                                        False
E                                        False
Q                                        False
G                                        False
H                                        False
I                                        False
L                                        False
K                                        False
M                                        False
F                                        False
P                                        False
S                                        False
T                                        False
W                                        False
Y                                        Fal

In [6]:
# Report how many entries are missing in each of the three columns
# counted below.
for column_name in ("Resolution", "R Value", "R Free"):
    print(f"Total Missing Values in {column_name}: ", dataset[column_name].isna().sum())

Total Missing Values in Resolution:  380
Total Missing Values in R Value:  783
Total Missing Values in R Free:  805


In [7]:
# Remove every row that is missing a value in any of the specified columns
# (the drop happens in place on `dataset`), then report the remaining row count.
dataset.dropna(
    subset=["Resolution", "R Value", "R Free"],
    inplace=True,
)
print(f"Dataset Size: {dataset.shape[0]}")

Dataset Size: 10370


In [9]:
# Group the columns by dtype family so we can see which ones need converting.
# pandas' dtype predicates are used instead of `dtype == int` / `dtype == float`
# because comparing against the Python builtins only matches the platform's
# default-width dtype (e.g. int64 columns can fail `== int` on Windows with
# NumPy < 2, and float32 columns never match `== float`).
f_array, i_array, o_array = [], [], []

for column in dataset.columns:
    column_dtype = dataset[column].dtype
    if pd.api.types.is_object_dtype(column_dtype):
        o_array.append(column)
    elif pd.api.types.is_float_dtype(column_dtype):
        f_array.append(column)
    elif pd.api.types.is_integer_dtype(column_dtype):
        i_array.append(column)

# Derive the counters from the lists so the two can never disagree.
float_count, int_count, object_count = len(f_array), len(i_array), len(o_array)

# "Columns Checked" falls short of "Columns" if any column has a dtype outside
# the three families above (e.g. bool or datetime).
print(f"Number of Columns: {len(dataset.columns)}\nNumber of Columns Checked: {object_count + int_count + float_count}\n")
print(f"Number of Object Columns: {object_count}")
print(f"Object Columns: {o_array}\n")
print(f"Number of Integer Columns: {int_count}")
print(f"Integer Columns: {i_array}\n")
print(f"Number of Float Columns: {float_count}")
print(f"Float Columns: {f_array}\n")

Number of Columns: 43
Number of Columns Checked: 43

Number of Object Columns: 7
Object Columns: ['ID', 'Unaligned Sequence', 'Hydrophobicity (Kyte-Doolittle Scale)', 'Aligned Sequence', 'Consensus Sequence', 'Percentage of Gaps Per Position', 'Experimental']

Number of Integer Columns: 5
Integer Columns: ['Sequence Length_x', 'Total Gaps in Alignment', 'Sequence Length_y', 'Gap Count', 'Mutations from Consensus']

Number of Float Columns: 31
Float Columns: ['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', 'Net Charge at pH 7.0 (Neutral)', 'Net Charge at pH 3.0 (Acidic)', 'Net Charge at pH 11.0 (Basic)', 'Isoelectric Point', 'Molecular Weight', 'Conservation Scores', 'Average Gap Length', 'Percentage Gaps', 'Resolution', 'R Value', 'R Free']



In [10]:
# We have to convert the columns of type object to the correct ones
def literal_eval(x):
    """Safely parse the text of a Python literal into the corresponding object.

    Thin wrapper around ``ast.literal_eval`` meant for ``Series.apply``: a
    value that cannot be parsed yields ``None`` instead of raising, so one bad
    cell does not abort the conversion of the whole column.

    Parameters
    ----------
    x : str
        Text expected to contain a Python literal (list, dict, number, ...).

    Returns
    -------
    object or None
        The evaluated literal, or ``None`` when ``x`` is not a valid literal.
    """
    try:
        return ast.literal_eval(x)
    # ast.literal_eval signals malformed input with more than ValueError:
    # SyntaxError for unparsable text (e.g. an unterminated list) and
    # TypeError for literals that cannot be built (e.g. an unhashable dict key).
    except (ValueError, SyntaxError, TypeError):
        return None

# Object columns that hold plain text and must stay strings. Every other
# object column is run through literal_eval (presumably they store stringified
# Python literals -- confirm against the CSV).
# NOTE: the sequence column is named "Unaligned Sequence"; listing it as
# "Unaligned" sent it through literal_eval, which turned every sequence into None.
string_columns = ["ID", "Unaligned Sequence", "Aligned Sequence", "Consensus Sequence", "Experimental"]

for name in o_array:
    if name in string_columns:
        dataset[name] = dataset[name].astype(str)
    else:
        dataset[name] = dataset[name].apply(literal_eval)

# Drop the rows whose literal could not be parsed (literal_eval returned None).
# Each column is checked by its own name; indexing with the previous loop's
# stale index always tested the last object column and never filtered anything.
for name in o_array:
    if name not in string_columns:
        dataset = dataset[dataset[name].notnull()]

# Show the resulting dtype of every column to confirm the conversion.
for column in dataset.columns:
    print(f"Column '{column}' has data type: {dataset[column].dtype}")

Column 'ID' has data type: object
Column 'Unaligned Sequence' has data type: object
Column 'A' has data type: float64
Column 'R' has data type: float64
Column 'N' has data type: float64
Column 'D' has data type: float64
Column 'C' has data type: float64
Column 'E' has data type: float64
Column 'Q' has data type: float64
Column 'G' has data type: float64
Column 'H' has data type: float64
Column 'I' has data type: float64
Column 'L' has data type: float64
Column 'K' has data type: float64
Column 'M' has data type: float64
Column 'F' has data type: float64
Column 'P' has data type: float64
Column 'S' has data type: float64
Column 'T' has data type: float64
Column 'W' has data type: float64
Column 'Y' has data type: float64
Column 'V' has data type: float64
Column 'Hydrophobicity (Kyte-Doolittle Scale)' has data type: object
Column 'Net Charge at pH 7.0 (Neutral)' has data type: float64
Column 'Net Charge at pH 3.0 (Acidic)' has data type: float64
Column 'Net Charge at pH 11.0 (Basic)' has