Import the needed libaries

In [13]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

Read in the CSV file and check the length

In [14]:
dataset = pd.read_csv("CSV Data/Dataset.csv")
print(f"Dataset Size: {len(dataset)}")
dataset.head()

Dataset Size: 14038


Unnamed: 0,ID,Unaligned Sequence,A,R,N,D,C,E,Q,G,...,Total Gaps in Alignment,Average Gap Length,Sequence Length_y,Gap Count,Percentage Gaps,Mutations from Consensus,Experimental,Resolution,R Value,R Free
0,2ECW_A,GSSGSSGMASSVLEMIKEEVTCPICLELLKEPVSADCNHSFCRACI...,0.047059,0.035294,0.094118,0.023529,0.082353,0.082353,0.0,0.070588,...,79703822,2.616737,5900,5815,98.559322,5897,solution nmr,,,
1,2BHP_A,MTVEPFRNEPIETFQTEEARRAMREALRRVREEFGRHYPLYIGGEW...,0.121094,0.072266,0.027344,0.029297,0.0,0.109375,0.019531,0.087891,...,79703822,2.616737,5900,5388,91.322034,5896,x-ray diffraction,1.8,0.148,0.176
2,2BHP_B,MTVEPFRNEPIETFQTEEARRAMREALRRVREEFGRHYPLYIGGEW...,0.121807,0.072692,0.027505,0.02947,0.0,0.11002,0.019646,0.088409,...,79703822,2.616737,5900,5391,91.372881,5896,x-ray diffraction,1.8,0.148,0.176
3,2D8S_A,GSSGSSGTSITPSSQDICRICHCEGDDESPLITPCHCTGSLHFVHQ...,0.0125,0.025,0.0,0.05,0.1125,0.0625,0.05,0.0875,...,79703822,2.616737,5900,5820,98.644068,5896,solution nmr,,,
4,2DXB_A,VWDRTHHAKMATGIGDPQCFKGMAGKSKFNVGDRVRIKDLPDLFYT...,0.084746,0.059322,0.033898,0.076271,0.008475,0.09322,0.016949,0.067797,...,79703822,2.616737,5900,5782,98.0,5900,x-ray diffraction,2.25,0.171,0.198


Check which columns have missing values

In [15]:
print(f"Values Missing:\n{dataset.isna().any()}")

Values Missing:
ID                                       False
Unaligned Sequence                       False
A                                        False
R                                        False
N                                        False
D                                        False
C                                        False
E                                        False
Q                                        False
G                                        False
H                                        False
I                                        False
L                                        False
K                                        False
M                                        False
F                                        False
P                                        False
S                                        False
T                                        False
W                                        False
Y                                        Fal

Print the total number of values that are missing in the specified columns

In [16]:
print("Total Missing Values in Resolution: ", dataset["Resolution"].isna().sum())
print("Total Missing Values in R Value: ", dataset["R Value"].isna().sum())
print("Total Missing Values in R Free: ", dataset["R Free"].isna().sum())

Total Missing Values in Resolution:  1738
Total Missing Values in R Value:  2279
Total Missing Values in R Free:  2326


Remove the rows where the missing values are in the specified columns

In [17]:
dataset.dropna(subset= ["Resolution", "R Value", "R Free"], inplace=True)
print(f"Dataset Size: {len(dataset)}")

Dataset Size: 11712


Get all the unique values in the experimental methods

In [18]:
print(dataset["Experimental"].unique())

['x-ray diffraction' 'x-ray diffraction; epr' 'electron crystallography'
 'neutron diffraction']


Get the distribution of the resolution

In [19]:
dataset["Resolution"].describe()

count    11712.000000
mean         2.196363
std          0.512911
min          0.780000
25%          1.840000
50%          2.100000
75%          2.500000
max          7.000000
Name: Resolution, dtype: float64

Encode the experimental method and normalize the numerical features/labels

In [20]:
le = LabelEncoder()
dataset["Experimental"] = le.fit_transform(dataset["Experimental"])

scaler = StandardScaler()
num_cols = dataset.select_dtypes(include=["int64", "float64"]).columns
dataset[num_cols] = scaler.fit_transform(dataset[num_cols])
dataset.head()

Unnamed: 0,ID,Unaligned Sequence,A,R,N,D,C,E,Q,G,...,Total Gaps in Alignment,Average Gap Length,Sequence Length_y,Gap Count,Percentage Gaps,Mutations from Consensus,Experimental,Resolution,R Value,R Free
1,2BHP_A,MTVEPFRNEPIETFQTEEARRAMREALRRVREEFGRHYPLYIGGEW...,1.070641,0.639525,-0.545333,-1.022433,-0.683303,1.175218,-0.731946,0.419809,...,0.0,0.0,0.0,-1.71592,-1.71592,-3.19644,0.017942,-0.772804,-1.752506,-1.878857
2,2BHP_B,MTVEPFRNEPIETFQTEEARRAMREALRRVREEFGRHYPLYIGGEW...,1.088647,0.653363,-0.538923,-1.015828,-0.683303,1.194407,-0.726918,0.434237,...,0.0,0.0,0.0,-1.696788,-1.696788,-3.19644,0.017942,-0.772804,-1.752506,-1.878857
4,2DXB_A,VWDRTHHAKMATGIGDPQCFKGMAGKSKFNVGDRVRIKDLPDLFYT...,0.153642,0.219002,-0.284639,0.774505,-0.285918,0.694351,-0.844728,-0.139826,...,0.0,0.0,0.0,0.796739,0.796739,0.201799,0.017942,0.104579,-1.028602,-1.287538
5,2DXB_B,SSIREEVHRHLGTVALMQPALHQQTHAPAPTEITHTLFRAYTRVPH...,0.007359,0.856607,-0.847882,-1.13647,0.242185,1.248655,0.42647,-0.195726,...,0.0,0.0,0.0,0.579911,0.579911,0.201799,0.017942,0.104579,-1.028602,-1.287538
6,2DXB_C,EVSDFEILEMAVRELAIEKGLFSAEDHRVWKDYVHTLGPLPAARLV...,-0.106895,0.860584,-0.522932,-0.185982,0.189095,-0.003758,-0.16295,-0.603091,...,0.0,0.0,0.0,0.17814,0.17814,0.201799,0.017942,0.104579,-1.028602,-1.287538


In [21]:
dataset.dtypes

ID                                        object
Unaligned Sequence                        object
A                                        float64
R                                        float64
N                                        float64
D                                        float64
C                                        float64
E                                        float64
Q                                        float64
G                                        float64
H                                        float64
I                                        float64
L                                        float64
K                                        float64
M                                        float64
F                                        float64
P                                        float64
S                                        float64
T                                        float64
W                                        float64
Y                   