In [18]:
import json
from striprtf.striprtf import rtf_to_text
import pandas as pd
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.metrics import accuracy_score,r2_score,mean_absolute_error
from sklearn.linear_model import LinearRegression,LogisticRegression, Ridge, Lasso

## parsing the given json file

In [2]:
with open("algoparams_from_ui.json.rtf") as infile:
    content = infile.read()
    text = rtf_to_text(content)

    json_data = json.loads(text)

In [3]:
json_data

{'session_name': 'test',
 'session_description': 'test',
 'design_state_data': {'session_info': {'project_id': '1',
   'experiment_id': 'kkkk-11',
   'dataset': 'iris_modified.csv',
   'session_name': 'test',
   'session_description': 'test'},
  'target': {'prediction_type': 'Regression',
   'target': 'petal_width',
   'type': 'regression',
   'partitioning': True},
  'train': {'policy': 'Split the dataset',
   'time_variable': 'sepal_length',
   'sampling_method': 'No sampling(whole data)',
   'split': 'Randomly',
   'k_fold': False,
   'train_ratio': 0,
   'random_seed': 0},
  'metrics': {'optomize_model_hyperparameters_for': 'AUC',
   'optimize_threshold_for': 'F1 Score',
   'compute_lift_at': 0,
   'cost_matrix_gain_for_true_prediction_true_result': 1,
   'cost_matrix_gain_for_true_prediction_false_result': 0,
   'cost_matrix_gain_for_false_prediction_true_result': 0,
   'cost_matrix_gain_for_false_prediction_false_result': 0},
  'feature_handling': {'sepal_length': {'feature_name'

## reading given csv file

In [4]:
df = pd.read_csv(r'C:\Users\USER\Downloads\Screening Test - DS\Screening Test - DS\iris.csv')
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [6]:
df.isna().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

> ### 1.Read the target and type of regression to be run

In [7]:
target_feature = json_data['design_state_data']['target']['target']
prediction_type = json_data['design_state_data']['target']['prediction_type']

> ### 2) Read the features (which are column names in the csv) and figure out what missing imputation needs to be applied and apply that to the columns loaded in a dataframe

In [8]:
for feature,details in json_data['design_state_data']['feature_handling'].items():
    if not details['is_selected']:
        continue
    elif details['feature_variable_type'] == 'numerical':
        missing_values = details['feature_details']['missing_values']
        if missing_values == 'Impute':
            impute_with = details['feature_details']['impute_with']
            if impute_with == 'Average_of_values':
                df[feature].fillna(df[feature].mean(),inplace = True)
            elif impute_with == 'custom':
                impute_value = details['feature_details']['impute_value']
                df[feature].fillna(impute_value, inplace = True)
    elif details['feature_variable_type'] == 'text':
        dummies = pd.get_dummies(df[feature])
        df.drop(columns=[feature], inplace=True)
        df = df.join(dummies)

In [9]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,Iris-setosa,Iris-versicolor,Iris-virginica
0,5.1,3.5,1.4,0.2,1,0,0
1,4.9,3.0,1.4,0.2,1,0,0
2,4.7,3.2,1.3,0.2,1,0,0
3,4.6,3.1,1.5,0.2,1,0,0
4,5.0,3.6,1.4,0.2,1,0,0


> ### 3) Compute feature reduction based on input and encode the text features.

In [10]:
feature_reduction_method = json_data['design_state_data']['feature_reduction']['feature_reduction_method']
num_of_featurestokeep = json_data['design_state_data']['feature_reduction']['num_of_features_to_keep']
feature_reduction_method

'Tree-based'

In [11]:
df.dtypes

sepal_length       float64
sepal_width        float64
petal_length       float64
petal_width        float64
Iris-setosa          uint8
Iris-versicolor      uint8
Iris-virginica       uint8
dtype: object

In [12]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,Iris-setosa,Iris-versicolor,Iris-virginica
0,5.1,3.5,1.4,0.2,1,0,0
1,4.9,3.0,1.4,0.2,1,0,0
2,4.7,3.2,1.3,0.2,1,0,0
3,4.6,3.1,1.5,0.2,1,0,0
4,5.0,3.6,1.4,0.2,1,0,0


In [13]:
if feature_reduction_method == 'No reduction':
    pass
elif feature_reduction_method == 'correlation with target':
    corr_matrix = df.corr()
    relevant_features = corr_matrix[target_feature].sort_values(ascending = False)
    df = df[relevant_feature[:num_of_featurestokeep]]
elif feature_reduction_method == 'Tree-based':
    depth_of_trees = int(json_data['design_state_data']['feature_reduction']['depth_of_trees'])
    num_of_trees = int(json_data['design_state_data']['feature_reduction']['num_of_trees'])
    if prediction_type == 'Regression':
        tree_model = DecisionTreeRegressor(max_depth = depth_of_trees,random_state = 42)
    else:
        tree_model = DecisionTreeClassifier(max_depth = depth_of_trees,random_state = 42)
    tree_model.fit(df.drop(target_feature, axis=1), df[target_feature])
    importance = sorted(tree_model.feature_names_in_,key = lambda x: tree_model.\
                        feature_importances_[list(tree_model.feature_names_in_).index(x)])
    df = df[importance[:num_of_trees] + [target_feature]]
    
elif feature_reduction_method == 'Principal Component Analysis':
    pca = PCA(n_components = num_of_featurestokeep)
    df = pcs.fit_transform(df)

In [14]:
df.head()

Unnamed: 0,Iris-versicolor,petal_length,sepal_length,sepal_width,Iris-virginica,petal_width
0,0,1.4,5.1,3.5,0,0.2
1,0,1.4,4.9,3.0,0,0.2
2,0,1.3,4.7,3.2,0,0.2
3,0,1.5,4.6,3.1,0,0.2
4,0,1.4,5.0,3.6,0,0.2


> ### 4) Parse the Json and make the model objects (using sklean) 

### Train_test_split

In [15]:
x = df.drop(target_feature, axis = 1)
y = df[target_feature]
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.33,random_state = 42)

### Make the model objects


In [19]:
if prediction_type == 'Regression':
    models = models = [LinearRegression, RandomForestRegressor]
else:
    models = [RandomForestClassifier,LogisticRegression,\
              ]
    


## param_grid  as per the parameters provided in the algorithms in json file 


In [20]:
paramgrid ={
    'RandomForestRegressor' : {             #for randomm_forest_regressor                                            
        'n_estimators': range(10, 21),      
        'max_depth': range(20, 26),
        'min_samples_leaf': range(5, 11)},
    
    'GBTClassifier' : {
        "n_estimators": [67, 89],
        "learning_rate": [0.1, 0.5],
        "max_depth": [5, 7],
        "min_samples_split": [1, 2],
        "min_samples_leaf": [0.1, 0.5]},
    
    'GBTRegressor' : {
        "n_estimators": [67, 89],
        "learning_rate": [0.1, 0.5],
        "max_depth": [5, 7],
        "min_samples_split": [1, 2],
        "min_samples_leaf": [0.1, 0.5]},
    
    'LinearRegression' : {
        "fit_intercept": [True, False],
        "normalize": [True, False],
        "copy_X": [True, False],
        "n_jobs": [1, 2]},
    
    'LogisticRegression' : {
        "penalty": ["l1", "l2", "elasticnet"],
        "dual": [True, False],
        "tol": [1e-4, 1e-3, 1e-2],
        "C": [1.0, 2.0, 3.0],
        "fit_intercept": [True, False],
        "intercept_scaling": [1, 2],
        "class_weight": [None, "balanced"],
        "random_state": [None, 42],
        "solver": ["newton-cg", "lbfgs", "liblinear", "sag"],
        "max_iter": [100, 200],
        "multi_class": ["auto", "ovr", "multinomial"],
        "verbose": [0, 1],
        "warm_start": [True, False],
        "n_jobs": [1, 2]},
    
    'RidgeRegression' : {
        "alpha": [0.1, 0.5, 1.0],
        "fit_intercept": [True, False],
        "normalize": [True, False],
        "copy_X": [True, False],
        "tol": [1e-4, 1e-3, 1e-2],
        "solver": ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag"],
        "max_iter": [100, 200],
        "random_state": [None, 42],
        "tol": [1e-4, 1e-3, 1e-2],},
        'LassoRegression' : {
        "alpha": [0.1, 0.5, 1.0],
        "fit_intercept": [True, False],
        "normalize": [True, False],
        "precompute": [True, False],
        "copy_X": [True, False],
        "max_iter": [100, 200],
        "tol": [1e-4, 1e-3, 1e-2],
        "warm_start": [True, False],
        "positive": [True, False],
        "random_state": [None, 42],
        "selection": ["cyclic", "random"]}
}

## Creating the model, finding the best params using GSCV

In [23]:
for model,model_parameters in json_data['design_state_data']['algorithms'].items():
    if model_parameters['is_selected']:
        for i in models:
            if model.lower() in str(i).lower():
                gscv = GridSearchCV(estimator = i(), param_grid = paramgrid[model], 
                          cv = 6, n_jobs = 5, pre_dispatch='2*n_jobs', 
                          refit=True, return_train_score=True)
                gscv.fit(x_train, y_train)
                best_parameters = gscv.best_params_
                final_model = i(**best_parameters )
                print(f'our model with best paramneters is {final_model}')
                final_model.fit(x_train,y_train)
                y_preds = final_model.predict(x_test)
                if prediction_type == 'Regression':
                    R2score = r2_score(y_test,y_preds)
                    print(f'The r2score of {i} is {R2score}')
                else:
                    accuracy = accuracy_score(y_test,y_preds)
                    print(f'The accuracy of {i} is {accuracy}')

our model with best paramneters is RandomForestRegressor(max_depth=25, min_samples_leaf=5, n_estimators=20)
The r2score of <class 'sklearn.ensemble._forest.RandomForestRegressor'> is 0.9556946752624447


# The r2score of our model is 0.9540859614965108 for rand_for_regressor