In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
! pip install deepchem
! pip install propy3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Creating the model

In [9]:
#full dataset
import pandas as pd
data = pd.read_csv("/content/drive/Shareddrives/2:1 Caitlin & Kimai/Data/Enzyme Stability Prediction/train.csv")

In [10]:
from propy import PyPro

In [6]:
import numpy as np 
aacd = [] #Amino Acid Composition Descriptors(20)
ctdd = [] #Composition Transition Distribution descriptors
for seq in data['protein_sequence'].to_list():
    Obj = PyPro.GetProDes(seq)
    aacd.append(np.array(list(Obj.GetAAComp().values())))
    ctdd.append(np.array(list(Obj.GetCTD().values())))

label = data['tm'].values

In [11]:
#creating a dataframe
array1 = np.array(aacd)
df = pd.DataFrame(array1, columns = ["column {}".format(i) for i in range(1,21)])
df['label'] = label
df.head()

Unnamed: 0,column 1,column 2,column 3,column 4,column 5,column 6,column 7,column 8,column 9,column 10,...,column 12,column 13,column 14,column 15,column 16,column 17,column 18,column 19,column 20,label
0,13.196,7.331,1.466,3.812,0.293,8.798,1.76,11.144,0.88,4.106,...,4.692,2.346,3.812,5.279,3.226,4.106,1.173,0.88,10.85,75.7
1,9.79,10.49,2.098,3.497,0.0,18.182,7.692,6.294,1.399,4.545,...,6.643,0.699,2.098,2.797,4.895,4.196,1.049,1.049,4.545,50.5
2,10.06,6.237,3.018,5.433,1.811,6.439,5.03,13.078,2.213,3.219,...,7.847,1.207,4.225,4.024,6.64,6.036,0.604,3.219,6.036,40.5
3,7.547,3.774,3.396,7.17,1.887,10.943,3.396,6.038,2.642,3.774,...,6.415,0.755,4.528,6.038,6.038,7.17,1.132,1.509,5.283,47.2
4,5.927,4.342,4.48,5.376,0.965,5.376,3.722,5.789,2.757,4.893,...,4.686,2.136,2.205,8.822,10.2,8.27,1.103,3.239,8.546,49.5


In [12]:
#train test split
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
x_train, x_test, y_train, y_test = train_test_split(array1, label, test_size = 0.2, random_state = 0)

In [13]:
from sklearn.ensemble import RandomForestRegressor
#building the model
model = RandomForestRegressor(n_estimators = 73) # 50 - 9.11, 40 - 9.14, 30 - 9.15, 20 - 9.25, 10 - 9.44
#50 is the best
model.fit(x_train,y_train)

#predictions
y_pred = model.predict(x_test)

#evaluation
RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: {}".format(RMSE))
print("Training Model Score: {}".format(r2_score(y_train, model.predict(x_train))))
print("Testing Model Score: {}".format(r2_score(y_test,y_pred)))

RMSE: 9.281805685725653
Training Model Score: 0.8583562843516293
Testing Model Score: 0.5358003844309163


In [14]:
#saving the model
import pickle

with open("Randomforrest_nestimators73.pkl",'wb') as file1:
  pickle.dump(model, file1)

### working with test data

In [15]:
#read the test data
import pandas as pd
test_data = pd.read_csv("/content/drive/Shareddrives/2:1 Caitlin & Kimai/Data/Enzyme Stability Prediction/test.csv")

In [16]:
import numpy as np 
aacd_test = [] #Amino Acid Composition Descriptors(20)
ctdd_test = [] #Composition Transition Distribution descriptors
for seq in test_data['protein_sequence'].to_list():
    Obj = PyPro.GetProDes(seq)
    aacd_test.append(np.array(list(Obj.GetAAComp().values())))
    ctdd_test.append(np.array(list(Obj.GetCTD().values())))

In [17]:
array2 = np.array(aacd_test)
#creating a dataframe
df = pd.DataFrame(array2, columns = ["column {}".format(i) for i in range(1,21)])
df.head()

Unnamed: 0,column 1,column 2,column 3,column 4,column 5,column 6,column 7,column 8,column 9,column 10,column 11,column 12,column 13,column 14,column 15,column 16,column 17,column 18,column 19,column 20
0,9.955,1.357,8.597,6.787,1.81,3.62,5.882,8.597,0.0,2.715,4.525,10.86,0.0,4.525,7.692,8.145,3.62,2.715,2.715,5.882
1,9.955,1.357,8.597,6.787,1.81,3.167,5.882,8.597,0.0,2.715,4.525,11.312,0.0,4.525,7.692,8.145,3.62,2.715,2.715,5.882
2,10.0,1.364,8.636,6.818,1.818,3.182,5.909,8.636,0.0,2.727,4.545,10.909,0.0,4.545,7.727,8.182,3.636,2.727,2.727,5.909
3,9.955,1.357,8.597,6.787,2.262,3.167,5.882,8.597,0.0,2.715,4.977,10.407,0.0,4.525,7.692,8.145,3.62,2.715,2.715,5.882
4,9.955,1.357,8.597,6.787,1.81,3.167,5.882,8.597,0.0,2.715,4.977,10.407,0.0,4.977,7.692,8.145,3.62,2.715,2.715,5.882


In [18]:
#predictions
predictions = model.predict(array2)
predictions = ["{:.1f}".format(values) for values in predictions]
print(predictions)

['49.9', '51.5', '51.8', '50.9', '50.7', '50.7', '50.9', '50.7', '51.1', '51.3', '51.5', '49.9', '51.1', '51.8', '51.2', '51.1', '51.5', '51.0', '51.1', '51.1', '51.5', '50.1', '51.4', '49.7', '50.9', '50.9', '51.4', '49.7', '50.9', '51.0', '51.1', '50.9', '51.3', '50.9', '51.0', '51.0', '51.3', '49.9', '50.9', '51.1', '51.4', '49.7', '50.9', '50.8', '50.9', '51.1', '49.7', '51.0', '51.1', '50.9', '51.3', '50.9', '50.7', '51.0', '51.0', '51.3', '51.3', '49.9', '50.9', '51.4', '49.7', '50.9', '51.0', '51.1', '51.3', '50.9', '51.0', '51.0', '50.9', '51.1', '51.3', '49.7', '50.9', '51.0', '51.1', '51.0', '51.3', '49.9', '50.9', '50.9', '51.1', '50.9', '51.1', '50.9', '51.6', '50.9', '50.7', '51.0', '51.0', '50.7', '51.4', '50.9', '49.5', '50.9', '50.8', '50.9', '51.1', '51.1', '52.7', '53.2', '50.8', '50.8', '51.1', '49.5', '50.7', '51.4', '50.9', '50.7', '50.9', '51.1', '51.4', '49.7', '50.9', '50.9', '51.1', '51.4', '49.7', '50.9', '51.0', '51.1', '50.9', '51.3', '50.7', '51.0', '51.0',

In [19]:
test_data.head()

Unnamed: 0,seq_id,protein_sequence,pH,data_source
0,31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes
1,31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes
2,31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,Novozymes
3,31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes
4,31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes


In [20]:
predictions_data = test_data[['seq_id']]
predictions_data['tm'] = predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predictions_data['tm'] = predictions


In [21]:
predictions_data.head()

Unnamed: 0,seq_id,tm
0,31390,49.9
1,31391,51.5
2,31392,51.8
3,31393,50.9
4,31394,50.7


In [22]:
predictions_data.to_csv("/content/drive/Shareddrives/2:1 Caitlin & Kimai/Data/Enzyme Stability Prediction/Submission/submission1.csv", index = False)
print("CSV is saved")

CSV is saved
