In [1]:
import json
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the run configuration from algoparams_from_ui.json into `data`.
# The encoding is pinned because open() otherwise falls back to the platform's
# locale encoding, which is not UTF-8 on every system and can mis-decode JSON.
with open('algoparams_from_ui.json', encoding='utf-8') as f:
    data = json.load(f)


In [3]:
# Read iris.csv from the working directory into the `iris` frame.
iris = pd.read_csv(filepath_or_buffer='iris.csv')

In [30]:
# Display the target section of the configuration (the cell's last expression is shown).
target_config = data['design_state_data']['target']
target_config

{'prediction_type': 'Regression',
 'target': 'petal_width',
 'type': 'regression',
 'partitioning': True}

In [31]:
# Pull the feature-handling section and the prediction type out of the configuration.
design_state = data['design_state_data']
features = design_state['feature_handling']
prediction_type = design_state['target']['prediction_type']

In [32]:
# Name of the column to predict, read from the target section of the configuration.
# Later cells reference `target`, so it has to be defined rather than commented out.
target = data['design_state_data']['target']['target']

# Load the working frame through a relative path so the notebook runs on any machine.
# NOTE(review): assumes the iris.csv in the working directory is the same file the
# previous machine-specific absolute path pointed to -- confirm.
df = pd.read_csv('iris.csv')

In [33]:
# Preview the first five rows of the working frame.
df.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [34]:
# Apply missing value imputation to the dataframe.
# The configured value is used as a SimpleImputer strategy only when it is one of
# the names listed below; any other value leaves `df` untouched.
# NOTE(review): this config entry is also bound to `features` earlier in the
# notebook, so it may not be a plain strategy name -- confirm where the strategy
# is stored in the JSON.
missing_imputation = data['design_state_data']['feature_handling']
supported_strategies = ('mean', 'median', 'most_frequent')
if isinstance(missing_imputation, str) and missing_imputation in supported_strategies:
    imputer = SimpleImputer(strategy=missing_imputation)
else:
    imputer = None

if imputer is not None:
    # 'mean' and 'median' are only defined for numbers, so text columns such as
    # `species` are skipped for them; 'most_frequent' can fill every column.
    if missing_imputation == 'most_frequent':
        impute_columns = list(df.columns)
    else:
        impute_columns = list(df.select_dtypes(include='number').columns)
    if impute_columns:
        imputed = pd.DataFrame(
            imputer.fit_transform(df[impute_columns]),
            columns=impute_columns,
            index=df.index,
        )
        if missing_imputation == 'most_frequent':
            # Mixed-type input comes back as objects; restore the original dtypes.
            imputed = imputed.astype(df[impute_columns].dtypes.to_dict())
        df[impute_columns] = imputed

In [35]:
# Choose the model input columns according to the configured feature reduction.
# NOTE(review): assumes the config value is one of the method names tested below;
# any other value keeps every candidate column -- confirm against the JSON schema.
feature_reduction = data['design_state_data']['feature_reduction']
# Resolved here so the cell does not rely on `target` lingering in the kernel.
target = data['design_state_data']['target']['target']

# Candidate predictors are the numeric columns other than the target itself.
# Columns pandas auto-named "Unnamed: N" (a row counter saved with the CSV) and
# components written by a previous run of the PCA branch are left out as well.
candidate_features = [
    col for col in df.select_dtypes(include='number').columns
    if col != target and not str(col).startswith(('Unnamed', 'PCA'))
]

if feature_reduction == 'corr':
    # Keep predictors whose absolute correlation with the target exceeds 0.5.
    corr_target = df[candidate_features].corrwith(df[target]).sort_values(ascending=False)
    selected_features = corr_target[corr_target.abs() > 0.5].index.tolist()
elif feature_reduction == 'tree':
    # Fit on the predictors only; fitting on a frame that contains the target
    # would let the target explain itself. The seed keeps re-runs reproducible.
    model = RandomForestRegressor(random_state=0)
    model.fit(df[candidate_features], df[target])
    importances = model.feature_importances_
    selected_features = [
        col for col, importance in zip(candidate_features, importances)
        if importance > 0.01
    ]
elif feature_reduction == 'pca':
    # At most three components, and never more than there are predictors.
    pca = PCA(n_components=min(3, len(candidate_features)))
    df_pca = pca.fit_transform(df[candidate_features])
    selected_features = ['PCA'+str(i) for i in range(pca.n_components_)]
    # Store the components on `df` so the selected names are real columns.
    df[selected_features] = df_pca
else:
    selected_features = candidate_features


In [38]:
# Create the model objects based on the prediction type specified.
# NOTE(review): the target section displayed earlier in this notebook shows
# prediction_type == 'Regression', which is not one of the keys below, so `models`
# ends up empty for that config -- confirm which config entry names the algorithm.
prediction_type = data['design_state_data']['target']['prediction_type']
model_factories = {
    'linear': LinearRegression,
    'ridge': Ridge,
    'lasso': Lasso,
    'svm': SVR,
    'ensemble': RandomForestRegressor,
}
# Match case-insensitively; the config mixes capitalised and lower-case values.
model_key = str(prediction_type).strip().lower()
models = [model_factories[model_key]()] if model_key in model_factories else []
if not models:
    # Say so explicitly: with an empty list the training loop silently does nothing.
    print(f"No model matches prediction_type={prediction_type!r}; "
          f"expected one of {sorted(model_factories)}.")

In [39]:
# Run fit and predict on each model with hyper parameter tuning using GridSearchCV.
# Nothing is read or computed when `models` is empty, so the cell stays a no-op then.
if models:
    # Inputs that do not depend on the model are prepared once, before the loop.
    param_grid = data['design_state_data']['hyperparameters']
    # Train on `df`, the frame the imputation and feature-reduction cells operate
    # on, instead of the untouched second copy held in `iris`.
    X = df[selected_features]
    y = df[data['design_state_data']['target']['target']]
    # NOTE(review): the scaler is fit on all rows before cross-validation, so every
    # fold's validation rows influence the scaling; wrap the scaler and model in a
    # Pipeline if unbiased CV scores are needed.
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    for model in models:
        grid_search = GridSearchCV(model, param_grid, cv=5)
        grid_search.fit(X, y)
        # Predictions of the best estimator on the rows it was trained on.
        print(grid_search.best_estimator_.predict(X))