Import the needed libraries

In [1]:
import pandas as pd
import ast

Read in the CSV file and check the length

In [2]:
# Load the dataset from disk, report how many rows it has, and preview the
# first few rows as the cell's output.
csv_path = "CSV Data/Dataset.csv"
dataset = pd.read_csv(csv_path)
print(f"Dataset Size: {len(dataset)}")
dataset.head()

Dataset Size: 14038


Unnamed: 0,ID,Unaligned Sequence,A,R,N,D,C,E,Q,G,...,Total Gaps in Alignment,Average Gap Length,Sequence Length_y,Gap Count,Percentage Gaps,Mutations from Consensus,Experimental,Resolution,R Value,R Free
0,2ECW_A,GSSGSSGMASSVLEMIKEEVTCPICLELLKEPVSADCNHSFCRACI...,0.047059,0.035294,0.094118,0.023529,0.082353,0.082353,0.0,0.070588,...,79703822,2.616737,5900,5815,98.559322,5897,solution nmr,,,
1,2BHP_A,MTVEPFRNEPIETFQTEEARRAMREALRRVREEFGRHYPLYIGGEW...,0.121094,0.072266,0.027344,0.029297,0.0,0.109375,0.019531,0.087891,...,79703822,2.616737,5900,5388,91.322034,5896,x-ray diffraction,1.8,0.148,0.176
2,2BHP_B,MTVEPFRNEPIETFQTEEARRAMREALRRVREEFGRHYPLYIGGEW...,0.121807,0.072692,0.027505,0.02947,0.0,0.11002,0.019646,0.088409,...,79703822,2.616737,5900,5391,91.372881,5896,x-ray diffraction,1.8,0.148,0.176
3,2D8S_A,GSSGSSGTSITPSSQDICRICHCEGDDESPLITPCHCTGSLHFVHQ...,0.0125,0.025,0.0,0.05,0.1125,0.0625,0.05,0.0875,...,79703822,2.616737,5900,5820,98.644068,5896,solution nmr,,,
4,2DXB_A,VWDRTHHAKMATGIGDPQCFKGMAGKSKFNVGDRVRIKDLPDLFYT...,0.084746,0.059322,0.033898,0.076271,0.008475,0.09322,0.016949,0.067797,...,79703822,2.616737,5900,5782,98.0,5900,x-ray diffraction,2.25,0.171,0.198


Check which columns have missing values

In [3]:
# One boolean per column: True when the column holds at least one missing value.
columns_with_missing = dataset.isna().any()
print(f"Values Missing:\n{columns_with_missing}")

Values Missing:
ID                                       False
Unaligned Sequence                       False
A                                        False
R                                        False
N                                        False
D                                        False
C                                        False
E                                        False
Q                                        False
G                                        False
H                                        False
I                                        False
L                                        False
K                                        False
M                                        False
F                                        False
P                                        False
S                                        False
T                                        False
W                                        False
Y                                        Fal

Print the total number of values that are missing in the specified columns

In [4]:
# Count the missing entries in each quality column with a single isna() pass,
# then report them in the same order and wording as before.
missing_labels = {
    "Resolution": "Total Missing Values in Resolution: ",
    "R Value": "Total Missing Values in R Value: ",
    "R Free": "Total Missing Values in R Free: ",
}
missing_counts = dataset[list(missing_labels)].isna().sum()
for quality_column, label in missing_labels.items():
    print(label, missing_counts[quality_column])

Total Missing Values in Resolution:  1738
Total Missing Values in R Value:  2279
Total Missing Values in R Free:  2326


Remove the rows that have missing values in any of the specified columns

In [5]:
# Discard every row that lacks any of the three quality metrics, then report
# how many rows remain. The frame is modified in place.
required_columns = ["Resolution", "R Value", "R Free"]
dataset.dropna(subset=required_columns, inplace=True)
print(f"Dataset Size: {len(dataset)}")

Dataset Size: 11712


Check the column types and change them if needed

In [6]:
# Bucket every column by dtype family (object / float / integer).
#
# The pandas type-check helpers are used rather than `dtype == int` or
# `dtype == float`: those comparisons only match NumPy's default scalar types,
# so e.g. int64 columns are left unclassified on Windows with NumPy 1.x (where
# the default integer is 32-bit), and float32 / int32 columns are missed
# everywhere.
o_array, f_array, i_array = [], [], []

for column, dtype in dataset.dtypes.items():
    if pd.api.types.is_object_dtype(dtype):
        o_array.append(column)
    elif pd.api.types.is_float_dtype(dtype):
        f_array.append(column)
    elif pd.api.types.is_integer_dtype(dtype):
        i_array.append(column)

object_count, float_count, int_count = len(o_array), len(f_array), len(i_array)

# If "Number of Columns Checked" is lower than "Number of Columns", some column
# has a dtype outside these three families (bool, datetime, category, ...).
print(f"Number of Columns: {len(dataset.columns)}\nNumber of Columns Checked: {object_count + int_count + float_count}\n")
print(f"Number of Object Columns: {object_count}")
print(f"Object Columns: {o_array}\n")
print(f"Number of Integer Columns: {int_count}")
print(f"Integer Columns: {i_array}\n")
print(f"Number of Float Columns: {float_count}")
print(f"Float Columns: {f_array}\n")

Number of Columns: 43
Number of Columns Checked: 43

Number of Object Columns: 7
Object Columns: ['ID', 'Unaligned Sequence', 'Hydrophobicity (Kyte-Doolittle Scale)', 'Aligned Sequence', 'Consensus Sequence', 'Percentage of Gaps Per Position', 'Experimental']

Number of Integer Columns: 5
Integer Columns: ['Sequence Length_x', 'Total Gaps in Alignment', 'Sequence Length_y', 'Gap Count', 'Mutations from Consensus']

Number of Float Columns: 31
Float Columns: ['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', 'Net Charge at pH 7.0 (Neutral)', 'Net Charge at pH 3.0 (Acidic)', 'Net Charge at pH 11.0 (Basic)', 'Isoelectric Point', 'Molecular Weight', 'Conservation Scores', 'Average Gap Length', 'Percentage Gaps', 'Resolution', 'R Value', 'R Free']



Convert the columns of type object to their correct types

In [7]:
def literal_eval(x):
    """Parse a Python literal, returning None when the value cannot be parsed.

    Wraps ``ast.literal_eval`` so it can be passed to ``Series.apply`` without
    a single bad cell aborting the whole conversion.

    Args:
        x: The value to parse, normally a string such as ``"[0.1, 0.2]"``.

    Returns:
        The evaluated literal, or None if ``x`` is not a valid Python literal.
    """
    try:
        return ast.literal_eval(x)
    except (ValueError, TypeError, SyntaxError):
        # ValueError: not a literal (a bare name, or a non-string such as NaN).
        # SyntaxError: malformed text, e.g. an empty or truncated list string.
        # TypeError: structurally invalid literal, e.g. an unhashable dict key.
        return None

# Object columns that are genuinely text. Every other object column holds a
# Python literal serialised as a string and is parsed with literal_eval.
text_columns = ["ID", "Unaligned Sequence", "Aligned Sequence", "Consensus Sequence", "Experimental"]

for column in o_array:
    if column in text_columns:
        dataset[column] = dataset[column].astype(str)
    else:
        dataset[column] = dataset[column].apply(literal_eval)

# Drop the rows whose literal could not be parsed (literal_eval returned None).
# Each parsed column is checked through the loop's own variable; indexing with
# a leftover index from the conversion loop would only ever inspect the last
# object column and filter nothing.
for column in o_array:
    if column not in text_columns:
        dataset = dataset[dataset[column].notnull()]

for column in dataset.columns:
    print(f"Column '{column}' has data type: {dataset[column].dtype}")

Column 'ID' has data type: object
Column 'Unaligned Sequence' has data type: object
Column 'A' has data type: float64
Column 'R' has data type: float64
Column 'N' has data type: float64
Column 'D' has data type: float64
Column 'C' has data type: float64
Column 'E' has data type: float64
Column 'Q' has data type: float64
Column 'G' has data type: float64
Column 'H' has data type: float64
Column 'I' has data type: float64
Column 'L' has data type: float64
Column 'K' has data type: float64
Column 'M' has data type: float64
Column 'F' has data type: float64
Column 'P' has data type: float64
Column 'S' has data type: float64
Column 'T' has data type: float64
Column 'W' has data type: float64
Column 'Y' has data type: float64
Column 'V' has data type: float64
Column 'Hydrophobicity (Kyte-Doolittle Scale)' has data type: object
Column 'Net Charge at pH 7.0 (Neutral)' has data type: float64
Column 'Net Charge at pH 3.0 (Acidic)' has data type: float64
Column 'Net Charge at pH 11.0 (Basic)' has

Get all the unique values in the Experimental (method) column

In [8]:
# List each distinct value found in the Experimental column.
experimental_methods = dataset["Experimental"].unique()
print(experimental_methods)

['x-ray diffraction' 'x-ray diffraction; epr' 'electron crystallography'
 'neutron diffraction']


Get the distribution of the resolution

In [9]:
# Summary statistics (count, mean, std, quartiles, extremes) for Resolution.
resolution = dataset["Resolution"]
resolution.describe()

count    11712.000000
mean         2.196363
std          0.512911
min          0.780000
25%          1.840000
50%          2.100000
75%          2.500000
max          7.000000
Name: Resolution, dtype: float64