In [62]:
import numpy as np # used for scientific computing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd # used for data analysis and manipulation
import matplotlib.pyplot as plt # used for visualization and plotting
import matplotlib.cm as cm
import scipy as sp
import pprint
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.datasets import load_iris
from sklearn import metrics
import dtreeviz.trees
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from scipy import stats
from pprint import pprint



check for min coefficients
split to train/test by applicant, not randomally
validity of data (preprocessing)

In [63]:
df = pd.read_csv('inner_data.csv')
column_headers = list(df.columns.values)
data = df.to_numpy()
len(column_headers)
# data

84

In [64]:
def split_data_by_feature(df, feature):
    data = df.to_numpy()
    data = data[:, 1:]
    data = data[:, 1:]
    X = np.delete(data, feature, axis=1)
    y = data[:, feature]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

def split_data_by_applicant(df, feature):
    applicants = df.Applicant.unique()
    applicants_train, applicants_test = train_test_split(applicants, test_size=0.2, random_state=42)
    data_train = df[df.Applicant.isin(applicants_train)].copy()
    data_test = df[df.Applicant.isin(applicants_test)].copy()
    data_train.drop(columns=['Question','Applicant'], inplace=True)
    data_test.drop(columns=['Question','Applicant'], inplace=True)

    data_train=data_train.to_numpy()
    data_test=data_test.to_numpy()

    X_train, X_test, y_train, y_test = np.delete(data_train, feature, axis=1),np.delete(data_test, feature, axis=1), data_train[:, feature],data_test[:, feature]
    return X_train, X_test, y_train, y_test


def linear_to_poly_data(X_train, X_test, degree):
    # Create polynomial features
    poly_features = PolynomialFeatures(degree=degree)
    X_train_poly = poly_features.fit_transform(X_train)
    X_test_poly = poly_features.transform(X_test)
    poly_names = poly_features.get_feature_names_out()
    return X_train_poly, X_test_poly, poly_names
    

def provide_best_features(X_train_poly, X_test_poly, y_train, y_test, num_feat, thresh=0.1):
    model = LinearRegression()
    model.fit(X_train_poly, y_train)
    y_pred = model.predict(X_test_poly)
    
    mse = mean_squared_error(y_test, y_pred)
    print("Mean Squared Error:", "{:.4f}".format(mse))
    
    pred_comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
    pred_comparison_df["True/False"] = np.abs(pred_comparison_df["Actual"] - pred_comparison_df["Predicted"]) < thresh        
    print("Accuracy: ","{:.2f}".format(100*pred_comparison_df["True/False"].value_counts(normalize=True)[0]),"%")
    
    feature_coefficients = np.abs(model.coef_)
    top5_indices = np.argsort(feature_coefficients)[-num_feat:]
    return top5_indices.tolist(), mse, pred_comparison_df["True/False"].value_counts(normalize=True)[0]

In [83]:
def run_model(df, degree, thresh, num_feat, split_func):
    best_features = []
    best_features_to_predict = {}
    for feature in range(df.shape[1]-2):
        print("FEATURE TO PREDICT: ", feature)
        X_train, X_test, y_train, y_test = split_func(df, feature)
        X_train_poly, X_test_poly, poly_names = linear_to_poly_data(X_train, X_test, degree)
        model_eval = provide_best_features(X_train_poly, X_test_poly, y_train, y_test, num_feat)
        top_features = poly_names[model_eval[0]].tolist()
        best_features.append(model_eval[0])

        print("predict only with TOP 5 features:")
        X_train_only_best, X_test_only_best = X_train_poly[:,best_features[feature]], X_test_poly[:, best_features[feature]]
        limited_model_eval = provide_best_features(X_train_only_best, X_test_only_best, y_train, y_test, num_feat)
        
        print("\n")

    pprint(best_features_to_predict)
    return best_features_to_predict

def run_model(df, degree, num_feat, split_func):
    best_features = []
    best_features_to_predict = {}
    for feature in range(df.shape[1]-2):
        print("FEATURE TO PREDICT: ", feature)
        X_train, X_test, y_train, y_test = split_func(df, feature)
        X_train_poly, X_test_poly, poly_names = linear_to_poly_data(X_train, X_test, degree)
        model_eval = provide_best_features(X_train_poly, X_test_poly, y_train, y_test, num_feat)
        top_features = poly_names[model_eval[0]].tolist()
        best_features.append(model_eval[0])

        print("predict with only TOP ",num_feat," features:")
        X_train_only_best, X_test_only_best = X_train_poly[:,best_features[feature]], X_test_poly[:, best_features[feature]]
        limited_model_eval = provide_best_features(X_train_only_best, X_test_only_best, y_train, y_test, num_feat)
        best_features_to_predict[feature] = {'feature name':df.columns[feature],'predictors names':top_features,'feature numbers':model_eval[0], 'mse':model_eval[1],'accuracy':model_eval[2],'limited model mse': limited_model_eval[1],'limited model accuracy': limited_model_eval[2], 'ratio':limited_model_eval[2]/model_eval[2]}
        print("\n")

    pprint(best_features_to_predict)
    return best_features_to_predict

def fiter_best_predictions(model, thresh):
    filtered_features = {}
    for feature in range(len(model)):
        current = model[feature]
        if current["limited model accuracy"]/current["accuracy"] >= thresh and current["accuracy"]>0.9:
            filtered_features[feature] = current
    return filtered_features





In [82]:
# run the program
degree = 1
thresh = 0.90
num_feat = 2
best_feat = run_model(df,degree, thresh,num_feat, split_data_by_feature)

FEATURE TO PREDICT:  0
Mean Squared Error: 0.0063
Accuracy:  81.82 %
predict only with TOP  2  features:
Mean Squared Error: 0.0214
Accuracy:  50.51 %


FEATURE TO PREDICT:  1
Mean Squared Error: 0.0079
Accuracy:  74.75 %
predict only with TOP  2  features:
Mean Squared Error: 0.0220
Accuracy:  58.59 %


FEATURE TO PREDICT:  2
Mean Squared Error: 0.0042
Accuracy:  89.90 %
predict only with TOP  2  features:
Mean Squared Error: 0.0082
Accuracy:  73.74 %


FEATURE TO PREDICT:  3
Mean Squared Error: 0.0042
Accuracy:  88.89 %
predict only with TOP  2  features:
Mean Squared Error: 0.0084
Accuracy:  73.74 %


FEATURE TO PREDICT:  4
Mean Squared Error: 0.0150
Accuracy:  69.70 %
predict only with TOP  2  features:
Mean Squared Error: 0.0258
Accuracy:  57.58 %


FEATURE TO PREDICT:  5
Mean Squared Error: 0.0100
Accuracy:  71.72 %
predict only with TOP  2  features:
Mean Squared Error: 0.0312
Accuracy:  59.60 %


FEATURE TO PREDICT:  6
Mean Squared Error: 0.0077
Accuracy:  81.82 %
predict only 

In [73]:
# run the program
degree = 1
thresh = 0.90
num_feat = 2
best_feat = run_model(df,degree, thresh,num_feat, split_data_by_feature)

FEATURE TO PREDICT:  0
Mean Squared Error: 0.0063
Accuracy:  81.82 %
predict only with TOP 5 features:
Mean Squared Error: 0.0214
Accuracy:  50.51 %


FEATURE TO PREDICT:  1
Mean Squared Error: 0.0079
Accuracy:  74.75 %
predict only with TOP 5 features:
Mean Squared Error: 0.0220
Accuracy:  58.59 %


FEATURE TO PREDICT:  2
Mean Squared Error: 0.0042
Accuracy:  89.90 %
predict only with TOP 5 features:
Mean Squared Error: 0.0082
Accuracy:  73.74 %


FEATURE TO PREDICT:  3
Mean Squared Error: 0.0042
Accuracy:  88.89 %
predict only with TOP 5 features:
Mean Squared Error: 0.0084
Accuracy:  73.74 %


FEATURE TO PREDICT:  4
Mean Squared Error: 0.0150
Accuracy:  69.70 %
predict only with TOP 5 features:
Mean Squared Error: 0.0258
Accuracy:  57.58 %


FEATURE TO PREDICT:  5
Mean Squared Error: 0.0100
Accuracy:  71.72 %
predict only with TOP 5 features:
Mean Squared Error: 0.0312
Accuracy:  59.60 %


FEATURE TO PREDICT:  6
Mean Squared Error: 0.0077
Accuracy:  81.82 %
predict only with TOP 5 f

In [74]:
len(best_feat)

13

In [75]:
# run the program
degree = 1
thresh = 0.90
num_feat = 2
best_feat = run_model(df,degree, thresh,num_feat, split_data_by_applicant)

FEATURE TO PREDICT:  0
Mean Squared Error: 0.0057
Accuracy:  87.00 %
predict only with TOP 5 features:


Mean Squared Error: 0.0235
Accuracy:  52.00 %


FEATURE TO PREDICT:  1
Mean Squared Error: 0.0102
Accuracy:  63.00 %
predict only with TOP 5 features:
Mean Squared Error: 0.0138
Accuracy:  71.00 %


FEATURE TO PREDICT:  2
Mean Squared Error: 0.0045
Accuracy:  87.00 %
predict only with TOP 5 features:
Mean Squared Error: 0.0073
Accuracy:  81.00 %


FEATURE TO PREDICT:  3
Mean Squared Error: 0.0052
Accuracy:  88.00 %
predict only with TOP 5 features:
Mean Squared Error: 0.0158
Accuracy:  60.00 %


FEATURE TO PREDICT:  4
Mean Squared Error: 0.0125
Accuracy:  64.00 %
predict only with TOP 5 features:
Mean Squared Error: 0.0321
Accuracy:  56.00 %


FEATURE TO PREDICT:  5
Mean Squared Error: 0.0098
Accuracy:  70.00 %
predict only with TOP 5 features:
Mean Squared Error: 0.0339
Accuracy:  60.00 %


FEATURE TO PREDICT:  6
Mean Squared Error: 0.0079
Accuracy:  71.00 %
predict only with TOP 5 features:
Mean Squared Error: 0.0139
Accuracy:  63.00 %


FEATURE TO PREDICT:  7
Mean Squared Error: 0.0

In [76]:
complex_traits = [
        "seller",
        "committed",
        "consistent",
        "persistent",
        "considerate",
        "service-oriented",
        "action-oriented",
        "conscientious"
    ]

In [77]:
len(best_feat)

14