In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch import Tensor
from torch.utils.data import TensorDataset,DataLoader

In [2]:
#import necessary libraries
from sklearn.metrics import r2_score
from scipy.stats import pearsonr,spearmanr
from sklearn.metrics import mean_squared_error,mean_absolute_error

#calculate the predicted scores and print it
def scores(actuall_labels, predicted_labels):
    print('Pearson Score: ',pearsonr(actuall_labels,predicted_labels))
    print('R2_Score: ',r2_score(actuall_labels,predicted_labels))
    print('Spearmanr Score: ',spearmanr(actuall_labels,predicted_labels))
    print('Mean Squared Error(MSE): ',mean_squared_error(actuall_labels,predicted_labels))
    print('Mean Absolute Error(MAE): ',mean_absolute_error(actuall_labels,predicted_labels))


In [3]:
train=pd.read_csv('../input/lcp-multi-feature/multi_train.csv',index_col=[0])
test=pd.read_csv('../input/lcp-multi-feature/multi_test.csv',index_col=[0])
val=pd.read_csv('../input/lcp-multi-feature/multi_trail.csv',index_col=[0])

In [4]:
train_x=train.drop(columns=['complexity'])
train_y=train['complexity']

In [5]:
test_x=test.drop(columns=['complexity'])
test_y=test['complexity']

In [6]:
val_x=val.drop(columns=['complexity'])
val_y=val['complexity']

In [7]:
trainx=np.array(train_x)
trainy=np.array(train_y)
testx=np.array(test_x)
testy=np.array(test_y)
valx=np.array(val_x)
valy=np.array(val_y)

In [8]:
from sklearn.ensemble import AdaBoostRegressor
abr = AdaBoostRegressor(random_state=0, n_estimators=100)
abr.fit(trainx, trainy)
pred1=abr.predict(testx)
val1=abr.predict(valx)
scores(testy,pred1)

Pearson Score:  (0.7857525506246894, 8.010006040798753e-40)
R2_Score:  0.6141590109126839
Spearmanr Score:  SpearmanrResult(correlation=0.7844574882661103, pvalue=1.2996833442457811e-39)
Mean Squared Error(MSE):  0.009313094936779034
Mean Absolute Error(MAE):  0.07639565038618813


In [9]:
from sklearn.ensemble import BaggingRegressor
br = BaggingRegressor(random_state=0, n_estimators=100)
br.fit(trainx, trainy)
pred2=br.predict(testx)
val2=br.predict(valx)
scores(testy,pred2)

Pearson Score:  (0.8002066447666721, 2.844455879285822e-42)
R2_Score:  0.6217578454219557
Spearmanr Score:  SpearmanrResult(correlation=0.7869642372532265, pvalue=5.0773369761426894e-40)
Mean Squared Error(MSE):  0.009129680864155179
Mean Absolute Error(MAE):  0.07657743483390425


In [10]:
from sklearn.ensemble import GradientBoostingRegressor
gbr =  GradientBoostingRegressor(random_state=19, n_estimators=100)
gbr.fit(trainx, trainy)
pred3=gbr.predict(testx)
val3=gbr.predict(valx)
scores(testy,pred3)

Pearson Score:  (0.8001754633706162, 2.880704797396699e-42)
R2_Score:  0.6333318216302948
Spearmanr Score:  SpearmanrResult(correlation=0.7886220281786126, pvalue=2.7079523630534867e-40)
Mean Squared Error(MSE):  0.008850318271084769
Mean Absolute Error(MAE):  0.07510577464393976


In [11]:
from sklearn import linear_model
lasso = linear_model.Lasso(alpha=0.0005)
lasso.fit(trainx, trainy)
pred4=lasso.predict(testx)
val4=lasso.predict(valx)
scores(testy,pred4)

Pearson Score:  (0.790162826863626, 1.5021176188217196e-40)
R2_Score:  0.6243194117701552
Spearmanr Score:  SpearmanrResult(correlation=0.7935289351942533, pvalue=4.0739717299350277e-41)
Mean Squared Error(MSE):  0.009067851998735593
Mean Absolute Error(MAE):  0.07676063447600108


In [12]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [80]:

class NN(nn.Module):
    def __init__(self,inp_size,l1_size,l2_size,l3_size):
        super(NN, self).__init__()
        self.layer1=nn.Linear(inp_size,l1_size)
        self.layer2=nn.Linear(l1_size,l2_size)
        self.layer3=nn.Linear(l2_size,l3_size)
        
        self.lin=nn.Linear(l3_size,1)
        self.dropout1=nn.Dropout(0.3)
        self.dropout2=nn.Dropout(0.2)
        self.relu=nn.ReLU()
        self.sigmoid=nn.Sigmoid()
        
        
    def forward(self,inp:Tensor)->Tensor:
        inp=self.layer1(inp)
        inp=self.sigmoid(inp)
        inp=self.dropout1(inp)
        inp=self.layer2(inp)
        inp=self.sigmoid(inp)
        inp=self.dropout2(inp)
        inp=self.layer3(inp)
        inp=self.sigmoid(inp)
        inp=self.lin(inp)
        inp=self.sigmoid(inp)
        return inp
        
        
    

In [110]:
NBATCH=32

In [111]:
train_losses = []
val_losses = []
def train_model(train,test,model,n_epochs):
    optimizer=torch.optim.Adam(model.parameters(),lr=0.0001)
    criterion=nn.MSELoss()
    train_loader = DataLoader(train, batch_size=NBATCH, shuffle=True, drop_last=True, pin_memory=True, num_workers=2)
    val_loader = DataLoader(test, batch_size=NBATCH, shuffle=False, drop_last=False, num_workers=2)
    trainSteps = len(train_loader.dataset) // NBATCH
    valSteps = len(val_loader.dataset) // NBATCH
    for epoch in range(1,n_epochs+1):
        totalTrainLoss = 0
        totalValLoss = 0
        model.train()
        for batch,(data,target) in enumerate(train_loader,1):
            optimizer.zero_grad()
            pred=model(data)
            pred=torch.flatten(pred)
            loss=criterion(pred,target)
            loss.backward()
            totalTrainLoss+=loss
            optimizer.step()
        
        model.eval()
        for batch,(data,target) in enumerate(val_loader,1):

            with torch.no_grad():
                pred=model(data)
                pred=torch.flatten(pred)
                loss=criterion(pred,target)
                
                totalValLoss+=loss
                
        avgTrainLoss = totalTrainLoss / trainSteps
        avgValLoss = totalValLoss / valSteps
        train_losses.append(avgTrainLoss.cpu().detach().numpy())
        val_losses.append(avgValLoss.cpu().detach().numpy())
        print("EPOCH: {}/{}".format(epoch, n_epochs))
        print("train loss: {:.5f}, val loss: {:.5f}".format(avgTrainLoss, avgValLoss))
        

In [16]:
trainx.shape

(1517, 1850)

In [112]:
model=NN(trainx.shape[1],32,32,16)

In [18]:
train_tx = torch.Tensor(trainx) 
train_ty = torch.Tensor(trainy)

train_t = TensorDataset(train_tx,train_ty) 

In [19]:
val_tx = torch.Tensor(valx) 
val_ty = torch.Tensor(valy)

val_t = TensorDataset(val_tx,val_ty)

In [113]:
train_model(train_t,val_t,model,100)

EPOCH: 1/100
train loss: 0.02472, val loss: 0.03587
EPOCH: 2/100
train loss: 0.02405, val loss: 0.03636
EPOCH: 3/100
train loss: 0.02380, val loss: 0.03665
EPOCH: 4/100
train loss: 0.02356, val loss: 0.03663
EPOCH: 5/100
train loss: 0.02301, val loss: 0.03654
EPOCH: 6/100
train loss: 0.02269, val loss: 0.03627
EPOCH: 7/100
train loss: 0.02232, val loss: 0.03585
EPOCH: 8/100
train loss: 0.02192, val loss: 0.03548
EPOCH: 9/100
train loss: 0.02135, val loss: 0.03439
EPOCH: 10/100
train loss: 0.02082, val loss: 0.03331
EPOCH: 11/100
train loss: 0.02004, val loss: 0.03204
EPOCH: 12/100
train loss: 0.01907, val loss: 0.03111
EPOCH: 13/100
train loss: 0.01852, val loss: 0.02950
EPOCH: 14/100
train loss: 0.01720, val loss: 0.02807
EPOCH: 15/100
train loss: 0.01653, val loss: 0.02659
EPOCH: 16/100
train loss: 0.01576, val loss: 0.02515
EPOCH: 17/100
train loss: 0.01473, val loss: 0.02456
EPOCH: 18/100
train loss: 0.01415, val loss: 0.02344
EPOCH: 19/100
train loss: 0.01341, val loss: 0.02185
EP

In [114]:
pred5=model(torch.Tensor(testx))
pred5=torch.flatten(pred5).cpu().detach().numpy()
val5=model(torch.Tensor(valx))
val5=torch.flatten(val5).cpu().detach().numpy()
scores(testy,pred5)

Pearson Score:  (0.7845757810187549, 1.2436479396840229e-39)
R2_Score:  0.6146995805324106
Spearmanr Score:  SpearmanrResult(correlation=0.7829550996405258, pvalue=2.2692518986871505e-39)
Mean Squared Error(MSE):  0.00930004713643941
Mean Absolute Error(MAE):  0.07588307942518034


In [115]:
####Stacking###
from sklearn.linear_model import LinearRegression
stacked_model1=LinearRegression()
stacked_val=np.column_stack((val1,val2,val3,val4,val5))
stacked_model1.fit(stacked_val,valy)

stacked_pred1=np.column_stack((pred1,pred2,pred3,pred4,pred5))
pred6=stacked_model1.predict(stacked_pred1)
scores(testy,pred6)

Pearson Score:  (0.8080137145195762, 1.110678434174155e-43)
R2_Score:  0.6102748077801308
Spearmanr Score:  SpearmanrResult(correlation=0.791840319568337, pvalue=7.863314314566383e-41)
Mean Squared Error(MSE):  0.009406848461029448
Mean Absolute Error(MAE):  0.07750095634115964


In [122]:
####Stacking###
from sklearn.neighbors import KNeighborsRegressor
stacked_model2=KNeighborsRegressor(n_neighbors=20)
stacked_val=np.column_stack((val1,val2,val3,val4,val5))
stacked_model2.fit(stacked_val,valy)

stacked_pred2=np.column_stack((pred1,pred2,pred3,pred4,pred5))
pred7=stacked_model2.predict(stacked_pred2)
scores(testy,pred7)

Pearson Score:  (0.8138433604581143, 8.935072140946257e-45)
R2_Score:  0.6269868313986258
Spearmanr Score:  SpearmanrResult(correlation=0.817654460564173, pvalue=1.638944985219823e-45)
Mean Squared Error(MSE):  0.009003468138703155
Mean Absolute Error(MAE):  0.0758057092541156


In [125]:
test_df=pd.read_csv('../input/lcp-test/lcp_multi_test.tsv',encoding='utf-8', delimiter='\t', quotechar='\t', keep_default_na=False)
my_submission = pd.DataFrame({'id': test_df['id'], 'label': pred7})
my_submission.to_csv('lcp_task2.csv', header=False,index=False)