In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVR, SVC
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, mean_squared_error

In [18]:
#1) Read the target and type of regression to be run

In [19]:
json_file_path = "/content/algo.json"
with open(json_file_path, "r") as json_file:
    json_content = json_file.read()

try:
    parsed_json = json.loads(json_content)
    print(parsed_json)
except json.JSONDecodeError as e:
    print("Error decoding JSON:", e)


target_info = parsed_json["design_state_data"]["target"]
target = target_info["target"]
regression_type = target_info["type"]
print("Target:", target)
print("Type of Regression:", regression_type)

{'session_name': 'test', 'session_description': 'test', 'design_state_data': {'session_info': {'project_id': '1', 'experiment_id': 'kkkk-11', 'dataset': 'iris_modified.csv', 'session_name': 'test', 'session_description': 'test'}, 'target': {'prediction_type': 'Regression', 'target': 'petal_width', 'type': 'regression', 'partitioning': True}, 'train': {'policy': 'Split the dataset', 'time_variable': 'sepal_length', 'sampling_method': 'No sampling(whole data)', 'split': 'Randomly', 'k_fold': False, 'train_ratio': 0, 'random_seed': 0}, 'metrics': {'optomize_model_hyperparameters_for': 'AUC', 'optimize_threshold_for': 'F1 Score', 'compute_lift_at': 0, 'cost_matrix_gain_for_true_prediction_true_result': 1, 'cost_matrix_gain_for_true_prediction_false_result': 0, 'cost_matrix_gain_for_false_prediction_true_result': 0, 'cost_matrix_gain_for_false_prediction_false_result': 0}, 'feature_handling': {'sepal_length': {'feature_name': 'sepal_length', 'is_selected': True, 'feature_variable_type': 'nu

In [20]:
# Q2. Read the features and figure out what missing imputation needs to be applied and apply that to the columns loaded in a dataframe.
feature_handling_info = parsed_json["design_state_data"]["feature_handling"]
feature_handling_info

{'sepal_length': {'feature_name': 'sepal_length',
  'is_selected': True,
  'feature_variable_type': 'numerical',
  'feature_details': {'numerical_handling': 'Keep as regular numerical feature',
   'rescaling': 'No rescaling',
   'make_derived_feats': False,
   'missing_values': 'Impute',
   'impute_with': 'Average of values',
   'impute_value': 0}},
 'sepal_width': {'feature_name': 'sepal_width',
  'is_selected': True,
  'feature_variable_type': 'numerical',
  'feature_details': {'numerical_handling': 'Keep as regular numerical feature',
   'rescaling': 'No rescaling',
   'make_derived_feats': False,
   'missing_values': 'Impute',
   'impute_with': 'custom',
   'impute_value': -1}},
 'petal_length': {'feature_name': 'petal_length',
  'is_selected': True,
  'feature_variable_type': 'numerical',
  'feature_details': {'numerical_handling': 'Keep as regular numerical feature',
   'rescaling': 'No rescaling',
   'make_derived_feats': False,
   'missing_values': 'Impute',
   'impute_with': '

In [21]:
df=pd.read_csv("/content/iris.csv")
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [22]:
#Q2. Compute feature reduction based on input. See the screenshot below where there can be No_Reduction, Corr with Target, Tree-based, PCA.
#Please make sure you write code so that all options can work. If we rerun your code with a different Json it should work if we switch No
# Reduction to say PCA.

for feature_name, feature_details in feature_handling_info.items():
    if feature_name in df.columns:
        feature_details = feature_details["feature_details"]

        if "missing_values" in feature_details and feature_details["missing_values"] == "Impute":
            impute_value = feature_details["impute_with"]

            if impute_value == "Average of values":
                df[feature_name].fillna(df[feature_name].mean(), inplace=True)
            elif impute_value == "custom":
                custom_impute_value = feature_details["impute_value"]
                df[feature_name].fillna(custom_impute_value, inplace=True)
print(df)

     sepal_length  sepal_width  petal_length  petal_width         species
0             5.1          3.5           1.4          0.2     Iris-setosa
1             4.9          3.0           1.4          0.2     Iris-setosa
2             4.7          3.2           1.3          0.2     Iris-setosa
3             4.6          3.1           1.5          0.2     Iris-setosa
4             5.0          3.6           1.4          0.2     Iris-setosa
..            ...          ...           ...          ...             ...
145           6.7          3.0           5.2          2.3  Iris-virginica
146           6.3          2.5           5.0          1.9  Iris-virginica
147           6.5          3.0           5.2          2.0  Iris-virginica
148           6.2          3.4           5.4          2.3  Iris-virginica
149           5.9          3.0           5.1          1.8  Iris-virginica

[150 rows x 5 columns]


In [49]:
feature_reduction_info = parsed_json.get("feature_reduction", {})

feature_reduction_method = feature_reduction_info.get("feature_reduction_method", "No Reduction")
parsed_json = json.loads(json_content)
print("Parsed JSON:", parsed_json)

target_info = parsed_json["design_state_data"]["target"]
print("Target Info:", target_info)
target_feature = target_info["target"]
print("Extracted Target Feature:", target_feature)

from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['encoded_target'] = label_encoder.fit_transform(df[target_feature])
#df['target_feature'] = label_encoder.fit_transform(df[target_feature])
numeric_df = df.drop(columns=[target_feature] if target_feature else [])
if feature_reduction_method == "No Reduction":
    pass

elif feature_reduction_method == "Correlation with Target":
    num_of_features_to_keep = parsed_json["feature_reduction"]["num_of_features_to_keep"]
    correlations = numeric_df.corrwith(df[target_feature])
    selected_features = correlations.abs().sort_values(ascending=False).index[:num_of_features_to_keep]
    numeric_df = numeric_df[selected_features]

elif feature_reduction_method == "Tree-based":
    num_of_features_to_keep = parsed_json["feature_reduction"]["num_of_features_to_keep"]
    model = ExtraTreesClassifier(n_estimators=100, random_state=0)
    model.fit(numeric_df, df[target_feature])
    sfm = SelectFromModel(model, prefit=True)
    selected_features = numeric_df.columns[sfm.get_support()]
    numeric_df = numeric_df[selected_features]

elif feature_reduction_method == "PCA":
    num_of_components = parsed_json["feature_reduction"]["num_of_features_to_keep"]
    pca = PCA(n_components=num_of_components)
    reduced_features = pca.fit_transform(numeric_df)
    numeric_df = pd.DataFrame(reduced_features, columns=[f"PCA_Component{i}" for i in range(1, num_of_components + 1)])

else:
    print("Unknown feature reduction method:", feature_reduction_method)
for column in numeric_df.select_dtypes(include=['object']).columns:
    numeric_df[column] = label_encoder.fit_transform(numeric_df[column])

print(numeric_df)


Parsed JSON: {'session_name': 'test', 'session_description': 'test', 'design_state_data': {'session_info': {'project_id': '1', 'experiment_id': 'kkkk-11', 'dataset': 'iris_modified.csv', 'session_name': 'test', 'session_description': 'test'}, 'target': {'prediction_type': 'Regression', 'target': 'petal_width', 'type': 'regression', 'partitioning': True}, 'train': {'policy': 'Split the dataset', 'time_variable': 'sepal_length', 'sampling_method': 'No sampling(whole data)', 'split': 'Randomly', 'k_fold': False, 'train_ratio': 0, 'random_seed': 0}, 'metrics': {'optomize_model_hyperparameters_for': 'AUC', 'optimize_threshold_for': 'F1 Score', 'compute_lift_at': 0, 'cost_matrix_gain_for_true_prediction_true_result': 1, 'cost_matrix_gain_for_true_prediction_false_result': 0, 'cost_matrix_gain_for_false_prediction_true_result': 0, 'cost_matrix_gain_for_false_prediction_false_result': 0}, 'feature_handling': {'sepal_length': {'feature_name': 'sepal_length', 'is_selected': True, 'feature_variab

In [50]:
#Q4. Parse the Json and make the model objects (using sklean) that can handle what is required in the “prediction_type” specified in the JSON
# (See #1 where “prediction_type” is specified). Keep in mind not to pick models that don’t apply for the prediction_type specified

prediction_type = parsed_json["design_state_data"]["target"]["prediction_type"]
model_objects = []
if prediction_type == "Regression":
    model_objects = [
        LinearRegression(),
        RandomForestRegressor(),
        SVR(),
        MLPRegressor(),
        KNeighborsRegressor(),
        DecisionTreeRegressor()
    ]
elif prediction_type == "Classification":
    model_objects = [
        LogisticRegression(),
        RandomForestClassifier(),
        SVC(),
        MLPClassifier(),
        KNeighborsClassifier(),
        DecisionTreeClassifier()
    ]
else:
    print("Unknown prediction type:", prediction_type)

In [51]:
model_objects

[LinearRegression(),
 RandomForestRegressor(),
 SVR(),
 MLPRegressor(),
 KNeighborsRegressor(),
 DecisionTreeRegressor()]

In [52]:
selected_features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
X = df[selected_features].values
y = df['petal_width'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [53]:
#Q5. Run the fit and predict on each model – keep in mind that you need to do hyper parameter tuning i.e., use GridSearchCV
for model in model_objects:
    model_name = model.__class__.__name__
    print(f"Running {model_name}...")

    param_grid = param_grids.get(model_name, {})

    if prediction_type == "Regression":
        scoring = 'neg_mean_squared_error'
    elif prediction_type == "Classification":
        scoring = 'accuracy'
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring=scoring)
    grid_search.fit(numeric_df, df['encoded_target'])
    best_model = grid_search.best_estimator_
    predictions = best_model.predict(numeric_df)
    if prediction_type == "Regression":
        mse = mean_squared_error(df[target_feature], predictions)
        print(f"Mean Squared Error for {model_name}: {mse}")
    elif prediction_type == "Classification":
        accuracy = accuracy_score(df['target_feature'], predictions)
        print(f"Accuracy for {model_name}: {accuracy}")
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print("\n")

Running LinearRegression...
Mean Squared Error for LinearRegression: 92.32999999999998
Best parameters for LinearRegression: {}


Running RandomForestRegressor...
Mean Squared Error for RandomForestRegressor: 92.1552755
Best parameters for RandomForestRegressor: {'max_depth': None, 'n_estimators': 200}


Running SVR...
Mean Squared Error for SVR: 92.36199178442267
Best parameters for SVR: {'C': 0.1, 'kernel': 'linear'}


Running MLPRegressor...




Mean Squared Error for MLPRegressor: 92.33196602938627
Best parameters for MLPRegressor: {'alpha': 0.001, 'hidden_layer_sizes': (50,)}


Running KNeighborsRegressor...
Mean Squared Error for KNeighborsRegressor: 92.32999999999998
Best parameters for KNeighborsRegressor: {'n_neighbors': 3, 'weights': 'distance'}


Running DecisionTreeRegressor...
Mean Squared Error for DecisionTreeRegressor: 92.32999999999998
Best parameters for DecisionTreeRegressor: {'max_depth': 20}


