In [1]:
# Install a global "ignore" filter so that no Python warning is shown for the
# rest of the kernel session (presumably to keep the long training outputs
# readable).
# NOTE(review): this also hides warnings that may be relevant to the results,
# e.g. from the estimators trained below — confirm this is intended.
import warnings
warnings.filterwarnings('ignore')

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib as mpl
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from irt import IRTModel
from sklearn import svm
from sklearn.linear_model import SGDRegressor, LinearRegression, BayesianRidge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from beta_irt.visualization.plots import newline
from beta_irt.visualization.plots import plot_parameters
from irt import beta_irt
from sklearn.decomposition import PCA
from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes
from mpl_toolkits.axes_grid1.inset_locator import mark_inset
from matplotlib import gridspec
from sklearn.preprocessing import StandardScaler
import edward as ed
import glob
import time, sys
from IPython.display import clear_output

In [3]:
def update_progress(progress, load = 'Progress', bar_length = 20):
    """Print a single-line textual progress bar.

    The bar is written with a trailing carriage return instead of a newline,
    so consecutive calls overwrite each other on the same output line.

    Parameters
    ----------
    progress : real number
        Completed fraction. Values below 0 (and NaN) are shown as 0 %, values
        of 1 or more as 100 %. Any real number is accepted (``int``, ``float``,
        ``bool``, ``fractions.Fraction``, NumPy scalars, ...); anything that is
        not a real number is shown as 0 %.
    load : str, optional
        Label printed in front of the bar.
    bar_length : int, optional
        Width of the bar in characters (default 20).

    Returns
    -------
    None
        The bar is only printed.
    """
    # Local import keeps the function self-contained within its cell.
    import numbers

    # Accept every real-valued scalar, not only the built-in int/float types;
    # the strict isinstance checks used before silently reset e.g. NumPy
    # integer or float32 scalars to 0 %.
    if isinstance(progress, numbers.Real):
        progress = float(progress)
    else:
        progress = 0.0

    # NaN is the only value that differs from itself; without this guard
    # int(round(nan)) below would raise ValueError.
    if progress != progress or progress < 0:
        progress = 0.0
    elif progress >= 1:
        progress = 1.0

    block = int(round(bar_length * progress))
    bar = "#" * block + "-" * (bar_length - block)

    text = load + ": [{0}] {1:.1f}%".format(bar, progress * 100)
    print(text, end = '\r')

In [4]:
##############################CREATING ALL REGRESSORS##############################

# `models` holds the estimator instances and `names` the matching display
# labels; both lists are filled in lock-step so that position i of one always
# describes position i of the other.

# Linear baselines, labelled with their class name.
models = [LinearRegression(), BayesianRidge()]
names = [type(estimator).__name__ for estimator in models]

# Support vector regressors: every kernel combined with every penalty C.
C = [1, 2, 3, 4, 5]
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
for krn in kernel:
    for c in C:
        models += [svm.SVR(kernel=krn, C=c)]
        names += ['SVR_{}_{}'.format(krn[:3], c)]

# k-nearest-neighbour regressors for k = 2 .. 15.
for k in range(2, 16):
    models += [KNeighborsRegressor(n_neighbors=k)]
    names += ['KNR_' + str(k)]

# Decision trees with maximum depth 2 .. 15.
max_depth = range(2, 16)
for md in max_depth:
    models += [DecisionTreeRegressor(max_depth=md)]
    names += ['DT_' + str(md)]

# Random forests: odd depths 3 .. 11 combined with 20/30/40/50 trees.
n_estimator = range(20, 60, 10)
for md in range(3, 12, 2):
    for n in n_estimator:
        models += [RandomForestRegressor(n_estimators=n, max_depth=md)]
        names += ['RF_e' + str(n) + '_md' + str(md)]

# AdaBoost: every loss function combined with the same ensemble sizes.
loss = ['linear', 'square', 'exponential']
for l in loss:
    for n in n_estimator:
        models += [AdaBoostRegressor(n_estimators=n, loss=l)]
        names += ['AdaB_loss' + l[:3] + '_n' + str(n)]

# Single-hidden-layer perceptrons: every activation with every layer width.
archs = [(width,) for width in (5, 10, 15, 20, 30, 40, 50, 60)]
funcs = ['relu', 'logistic']
for f in funcs:
    for a in archs:
        models += [MLPRegressor(hidden_layer_sizes=a, activation=f)]
        names += ['MLP_hls' + str(a[0]) + '_f' + f[:3]]

# Three extra labels with no estimator in `models` behind them.
names.extend(['Average', 'Optimal', 'Worst'])

# Parameters
rd = 42

In [5]:
##############################READING ALL DATASETS##############################

# For every CSV in `selected`: clean it, split and standardise it, fit all
# regressors through IRTModel, average `rep` response matrices and store the
# result as one CSV in `output`.
# Relies on `models`, `names` and `rd` defined in the regressor cell.

selected = './data/SELECTED/'
dbs = glob.glob(selected + '*.csv')

# Folders
path = './beta_irt/results/'
output = './Results_IRT/IRT_data/'

# Number of irtMatrix runs that are averaged per data set.
rep = 40

for d, db in enumerate(dbs):
    print('\n------------------------------------------------------------\n')

    # Data set name = file name without directory and extension. os.path is
    # used instead of manual splitting on '/' and '.', so the path separator
    # of the platform is honoured and dots inside the file name are kept.
    name = os.path.splitext(os.path.basename(db))[0]
    folder = name + '/'

    print('Data set ' + str(d + 1) + ' >>>> ' + name)

    # Creating folders. makedirs also creates missing parent directories and
    # does not pass the data set name through a shell (the previous
    # os.system('mkdir ...') broke on names containing spaces or shell
    # metacharacters and failed silently when a parent folder was missing).
    os.makedirs(path + folder, exist_ok=True)
    os.makedirs(output, exist_ok=True)

    # Read file: '?' marks a missing value; incomplete and duplicated rows
    # are discarded.
    df = pd.read_csv(db, na_values=['?']).dropna().drop_duplicates()

    # Variable selection: the last column is the target, everything before it
    # is a feature. With at most two columns the first column is the single
    # feature, reshaped to the 2-D layout the scaler and estimators expect.
    y = df.iloc[:, -1].values
    if df.shape[1] > 2:
        X = df.iloc[:, :-1].values
    else:
        X = df.iloc[:, 0].values.reshape(-1, 1)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rd)

    # Standard scale: both scalers are fitted on the training part only and
    # then applied to the test part.
    sc_X = StandardScaler()
    X_train = sc_X.fit_transform(X_train)
    X_test = sc_X.transform(X_test)

    # StandardScaler works on 2-D input, so the targets are reshaped to a
    # column for scaling and flattened back to 1-D afterwards.
    sc_y = StandardScaler()
    y_train = sc_y.fit_transform(y_train.reshape(-1, 1)).ravel()
    y_test = sc_y.transform(y_test.reshape(-1, 1)).ravel()

    print('> Training models')
    # Generate abilities/parameters for BIRT and other info.
    Irt = IRTModel(models=models)
    Irt.fit(X_train=X_train, y_train=y_train)

    #-------------------------------------Generate-BIRT-------------------------------------#

    # One row per test instance; one column per model plus the three extra
    # columns labelled 'Average', 'Optimal' and 'Worst' in `names`.
    responses = np.zeros((len(X_test), len(models) + 3))

    # NOTE(review): irtMatrix receives identical arguments (including rd) on
    # every repetition; whether the runs differ depends on IRTModel internals
    # that are not visible here — confirm that averaging is meaningful.
    for itr in range(rep):
        # Generate IRT matrix
        Irt.irtMatrix(X_test=X_test, y_test=y_test, normalize=True, base_models=True, name=name, rd=rd)
        responses += Irt.irt_matrix

    responses /= rep

    # Output file stem: data set name, test-set size and seed. It does not
    # depend on the repetition, so it is built once after the loop.
    name_ = name + '_s' + str(len(y_test)) + '_f0_sd' + str(rd)

    # RESPONSES
    irt_df = pd.DataFrame(data=responses, columns=names)
    irt_df.to_csv(output + name_ + '.csv', index=False)

    #-------------------------------------Clean-Files-------------------------------------#

    # Remove the CSV files left in the package folder and in this data set's
    # results folder (presumably intermediates written by irtMatrix — verify).
    # glob.escape keeps special characters in the data set name literal.
    leftovers = glob.glob('./beta_irt/*.csv')
    leftovers += glob.glob(os.path.join(glob.escape(path + folder), '*.csv'))
    for leftover in leftovers:
        if os.path.isfile(leftover):
            os.remove(leftover)
    


------------------------------------------------------------

Data set 1 >>>> disclosurez
> Training models

------------------------------------------------------------

Data set 2 >>>> poly5100
> Training models

------------------------------------------------------------

Data set 3 >>>> sin3101
> Training models

------------------------------------------------------------

Data set 4 >>>> bike_sharing_day
> Training models

------------------------------------------------------------

Data set 5 >>>> energy
> Training models

------------------------------------------------------------

Data set 6 >>>> HappinessRank2015
> Training models

------------------------------------------------------------

Data set 7 >>>> poly3101
> Training models

------------------------------------------------------------

Data set 8 >>>> bodyfat
> Training models

------------------------------------------------------------

Data set 9 >>>> sin1100
> Training models

------------------------------