In [153]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import sys
from sklearn import utils
from rgf.sklearn import RGFRegressor

#Data Tree, Not a Binary Tree 
class Tree(object):
    def __init__(self, value=None):
        self.left = None
        self.right = None
        self.middle=None
        self.data = None
        self.value = value

# Tree print architecture from https://vallentin.dev/2016/11/29/pretty-print-tree
def print_tree(node, file=None, prefix="", _last=True):
    print(prefix, "`- " if _last else "|-", node.data, " ", (node.value), sep="", file=file)
    prefix += "   " if _last else "|"
    arr=[]
    if node.left!=None:
        arr.append(node.left)
    if node.middle!=None:
        arr.append(node.middle)
    if node.right!=None:
        arr.append(node.right)
    count = len(arr)
    for i, child in enumerate(arr):
        _last = i == (count - 1)
        pprint_tree(child, file, prefix, _last)
        
def GreedyForrest(file, feat, ytraindata):
    #train test split 60/40 
    traind = pd.read_csv(file)
    traind=traind[4:10]
    testd  = pd.read_csv(file)
    testd=testd[0:4]
    #solution   = pd.read_csv('../input/submission-solution/submission_solution.csv')
    features = feat
    X_train       = traind[features]
    y_train       = traind[ytraindata]
    final_X_test  = testd[features]
    #y_true        = solution["NKT cells"]
    #preprocessing
    X_train      = X_train.fillna(X_train.mean())
    final_X_test = final_X_test.fillna(final_X_test.mean())
    #regression
    regressor = RGFRegressor(max_leaf=30, algorithm="RGF_Opt", test_interval=100, loss="LS")
    #fit
    regressor.fit(X_train, y_train)
    #predict
    y_pred = regressor.predict(final_X_test)
    return(y_pred)

# Lasso Regression from Geeks for Geeks. Relatively simple Algorithm, didn't want to take time to code from scratch, so just used this implementation and tuned the params as needed 
class LassoRegression() :
    def __init__( self, learning_rate, iterations, l1_penality ) :
        self.learning_rate = learning_rate
        self.iterations = iterations
        self.l1_penality = l1_penality

    # Function for model training
    def fit( self, X, Y ) :
        # no_of_training_examples, no_of_features
        self.m, self.n = X.shape  
        # weight initialization
        self.W = np.zeros( self.n )
        self.b = 0
        self.X = X
        self.Y = Y
        # gradient descent learning
        for i in range( self.iterations ) :
            self.update_weights()
        return self

# Helper function to update weights in gradient descent
    def update_weights( self ) :
        Y_pred = self.predict( self.X )
        # calculate gradients
        dW = np.zeros( self.n )
        for j in range( self.n ) :
            if self.W[j] > 0 :
                dW[j] = ( - ( 2 * ( self.X[:, j] ).dot( self.Y - Y_pred ) )
                        + self.l1_penality ) / self.m
            else :
                dW[j] = ( - ( 2 * ( self.X[:, j] ).dot( self.Y - Y_pred ) )
                        - self.l1_penality ) / self.m

        db = - 2 * np.sum( self.Y - Y_pred ) / self.m
        # update weights
        self.W = self.W - self.learning_rate * dW
        self.b = self.b - self.learning_rate * db
        return self
    
    # Hypothetical function h( x )
    def predict( self, X ) :
        return X.dot( self.W ) + self.b

        
def main():
    #Building a Tree with the 6 Immune Cell Types we have Deconvolutioon Data on 
    root = Tree()
    root.data = "Immune Cell"
    #root.value=1
    root.left = Tree()
    #root.left.value=1
    root.left.data = "Myeloid"
    root.right = Tree()
    #root.right.value=1
    root.right.data = "Lymphoid"
    root.left.left = Tree()
    root.left.left.data = "Monocytes"
    root.right.right = Tree()
    root.right.right.data = "NK Cell"
    root.right.left = Tree()
    #root.right.left.value=1
    root.right.left.data = "T Cell"
    root.right.middle = Tree()
    root.right.middle.data = "B Cell"
    root.right.left.left = Tree()
    root.right.left.left.data = "NKT Cell"
    root.right.left.middle = Tree()
    root.right.left.middle.data = "CD4-T Cell"
    root.right.left.right = Tree()
    root.right.left.right.data = "CD8-T Cell"
    
    #Fill with initial values Before Doing Tree Structured Regression
    #Using The Normalized Values over 10 different granularities (see cibersortx output and )
    #Let all other Values be 1
    df = pd.read_csv( "deconvolution_data.csv" )
    X= df.iloc[:][:].values
  

    mean=[]
    for i in range(6):
        mean.append(np.mean([row[i] for row in X]))
    norm= mean / sum(mean)
    
    root.left.left.value = norm[1]
    root.right.right.value = norm[5]
    root.right.middle.value = norm[4]
    root.right.left.left.value = norm[3]
    root.right.left.middle.value = norm[2]
    root.right.left.right.value = norm[0]
    print("Tree Before Proportions Regularization: ")
    print(" ")
    print_tree(root, file=None, prefix="", _last=False)
    
    # Now use Regression to Regularize Proportions of Cell Types near by in tree to b e similar. 
    #For example, The Types of T cells. Obvious that Before Regularization, the proportions are verywidespread
    # Important Because Results to Deconvolution Depend on the Granularity at Which you do the Deconvolution  
    #Using Both Greedy Forrest Algorithm And Classic Lasoo Regression Algorithm, Choosing Better Regression 
    
    # Importing dataset for Lasso Regression
    #Using CD4 to Predict NKT
    df = pd.read_csv( "deconvolution_data.csv" )
    vals=df.iloc[:].values  
    Xx = vals[:,2:3] #CD4
    Yy = df.iloc[:, 3].values #CD8
    X_train, X_test, Y_train, Y_test = train_test_split( Xx, Yy, test_size = 1/4, random_state = 0 )
    model = LassoRegression( iterations = 10000, learning_rate = 0.0005, l1_penality = 500 )
    model.fit( X_train, Y_train )
    Y_pred =  np.mean(model.predict( X_test ))
    
    #NKT
    file='deconvolution_data.csv'
    feat=['T cells CD8', 'T cells CD8']
    ytraindata="NKT cells"
    resultsNKT= np.mean(GreedyForrest(file, feat, ytraindata))
    print(" ")
    print("Regularized Greedy Forrest NKT: ", resultsNKT)
    print("Lasso Regression to Predict NKT: ", Y_pred)
    
    # Importing dataset for Lasso Regression
    #Using CD8 to Predict CD4
    df = pd.read_csv( "deconvolution_data.csv" )
    vals=df.iloc[:].values  
    Xx = vals[:,:1] #CD4
    Yy = df.iloc[:, 2].values #CD8
    X_train, X_test, Y_train, Y_test = train_test_split( Xx, Yy, test_size = 1/4, random_state = 0 )
    model = LassoRegression( iterations = 10000, learning_rate = 0.0005, l1_penality = 500 )
    model.fit( X_train, Y_train )
    Y_pred =  abs(np.mean(model.predict( X_test )))
    NKT=resultsNKT
    
    #CD4
    file='deconvolution_data.csv'
    feat=['NKT cells', 'T cells CD8']
    ytraindata="T cells CD4"
    resultsCD4= np.mean(GreedyForrest(file, feat, ytraindata))
    print(" ")
    print("Regularized Greedy Forrest CD4: ", resultsCD4)
    print("Lasso Regression to Predict CD4: ", Y_pred)
    CD4=resultsCD4
    
    # Importing dataset for Lasso Regression
    #Using CD4 to Predict CD8
    df = pd.read_csv( "deconvolution_data.csv" )
    vals=df.iloc[:].values  
    Xx = vals[:,2:3] #CD4
    Yy = df.iloc[:, 0].values #CD8
    X_train, X_test, Y_train, Y_test = train_test_split( Xx, Yy, test_size = 1/4, random_state = 0 )
    model = LassoRegression( iterations = 10000, learning_rate = 0.0005, l1_penality = 500 )
    model.fit( X_train, Y_train )
    Y_pred = np.mean(model.predict( X_test ))
    
    #CD8
    file='deconvolution_data.csv'
    feat=['NKT cells', 'T cells CD4']
    ytraindata="T cells CD8"
    resultsCD8= np.mean(GreedyForrest(file, feat, ytraindata))
    print(" ")
    print("Regularized Greedy Forrest CD8: ", resultsCD8)
    print("Lasso Regression to Predict CD8: ", Y_pred)
    CD8= resultsCD8
    
    
    #Update Tree 
    root.right.left.left.value = NKT
    root.right.left.middle.value = CD4
    root.right.left.right.value = CD8
    print(" ")
    print("Tree After Proportions Regularization: ")
    print(" ")
    print_tree(root, file=None, prefix="", _last=False)
    
if __name__ == '__main__':
    main()

Tree Before Proportions Regularization: 
 
|-Immune Cell None
||-Myeloid None
||`- Monocytes 0.40636631456385597
|`- Lymphoid None
|   |-T Cell None
|   ||-NKT Cell 0.0007721813986301658
|   ||-CD4-T Cell 0.33642326994971544
|   |`- CD8-T Cell 0.1200836638888051
|   |-B Cell 0.04920356953670102
|   `- NK Cell 0.08715100066229244
 
Regularized Greedy Forrest NKT:  0.0012806403
Lasso Regression to Predict NKT:  -0.006178635700216736
 
Regularized Greedy Forrest CD4:  0.33196513
Lasso Regression to Predict CD4:  0.38655008268648894
 
Regularized Greedy Forrest CD8:  0.15198771
Lasso Regression to Predict CD8:  0.10532909440076205
 
Tree After Proportions Regularization: 
 
|-Immune Cell None
||-Myeloid None
||`- Monocytes 0.40636631456385597
|`- Lymphoid None
|   |-T Cell None
|   ||-NKT Cell 0.0012806403
|   ||-CD4-T Cell 0.33196513
|   |`- CD8-T Cell 0.15198771
|   |-B Cell 0.04920356953670102
|   `- NK Cell 0.08715100066229244
