In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory
input_file_paths = (
    os.path.join(root, fname)
    for root, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
def _read_features(csv_path):
    """Read one semicolon-delimited feature CSV into a DataFrame."""
    return pd.read_csv(csv_path, delimiter=';')


# One DataFrame per psychological characteristic.
# Note that the duty file is read from a different input dataset than the others.
adversity = _read_features('/kaggle/input/predict-dimensions/adversity_with_features.csv')
negativity = _read_features('/kaggle/input/predict-dimensions/negativity_with_features.csv')
sociality = _read_features('/kaggle/input/predict-dimensions/sociality_with_features.csv')
deception = _read_features('/kaggle/input/predict-dimensions/deception_with_features.csv')
positivity = _read_features('/kaggle/input/predict-dimensions/positivity_with_features.csv')
intellect = _read_features('/kaggle/input/predict-dimensions/intellect_with_features.csv')
mating = _read_features('/kaggle/input/predict-dimensions/mating_with_features.csv')
duty = _read_features('/kaggle/input/duty-prediction/duty_with_features.csv')


In [None]:
# Run the script separately for each psychological characteristic.

In [3]:
from sklearn.model_selection import train_test_split

# Split the chosen dataset into the prediction target and its predictor columns.

#### Adapt this snippet for each psychological characteristic.
#### E.g. for duty change it to y=duty.Duty X=duty.drop('Duty', axis=1)
X = adversity.drop('Adversity', axis=1)
y = adversity.Adversity


# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

column_dtypes = X_train_full.dtypes

# Categorical predictors: object-typed columns with fewer than 12 unique values
# (the cardinality threshold is convenient but arbitrary)
categorical_cols = [
    cname
    for cname, dtype in column_dtypes.items()
    if dtype == "object" and X_train_full[cname].nunique() < 12
]

# Numerical predictors: columns stored as int64 or float64
numerical_cols = [
    cname
    for cname, dtype in column_dtypes.items()
    if dtype in ['int64', 'float64']
]

# Restrict both subsets to the selected columns (categorical first, then numerical)
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()



In [None]:
# Names of the predictor features, kept in a fixed order
features = [
    'setting',
    'frequency',
    'initiator',
    'help',
    'other_gender',
    'role',
    'hierarchy',
    'geo_distance',
    'years_known',
    'age_difference',
    'same_gender',
    'other_age',
    'rel_quality',
    'depth_acquaintance',
    'contact_freq',
    'shared_interests',
    'formality_level',
]

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Make copies to avoid changing the un-encoded frames
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Encode every categorical feature column as integer codes.
# OrdinalEncoder is scikit-learn's encoder for feature columns (LabelEncoder is
# documented as being for the target y). It is fitted on the training data only;
# a category that appears only in the validation data is mapped to -1 instead of
# raising ValueError, which is what LabelEncoder.transform does for unseen labels.
if categorical_cols:  # nothing to encode when no categorical column was selected
    ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    label_X_train[categorical_cols] = ordinal_encoder.fit_transform(X_train[categorical_cols])
    label_X_valid[categorical_cols] = ordinal_encoder.transform(X_valid[categorical_cols])

In [None]:
from sklearn.impute import SimpleImputer

# Fill in missing values with a default SimpleImputer: fitted on the encoded
# training frame, then applied unchanged to the encoded validation frame
my_imputer = SimpleImputer()
train_filled = my_imputer.fit_transform(label_X_train)
valid_filled = my_imputer.transform(label_X_valid)

# Imputation removed the column names, so rebuild DataFrames that carry them again
label_imputed_X_train = pd.DataFrame(train_filled, columns=label_X_train.columns)
label_imputed_X_valid = pd.DataFrame(valid_filled, columns=label_X_valid.columns)

In [None]:
# From here on, X_train / X_valid refer to the encoded and imputed frames
X_train, X_valid = label_imputed_X_train, label_imputed_X_valid

In [None]:
from pprint import pprint
from sklearn.ensemble import RandomForestRegressor

# Baseline forest with default hyperparameters and a fixed seed
rf = RandomForestRegressor(random_state=42)

# Show the hyperparameters this baseline forest is using
print('Parameters currently in use:\n')
baseline_params = rf.get_params()
pprint(baseline_params)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split.
# 1.0 means "use all features", which is what 'auto' meant for forest regressors;
# the 'auto' string was deprecated in scikit-learn 1.1 and removed in 1.3,
# whereas the float form is accepted by old and new versions alike.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree (None lets the trees grow without a depth limit)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint(random_grid)


In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune. It is seeded because the search's own
# random_state only controls which parameter combinations are sampled, not the
# randomness inside each forest; without a seed here the selected parameters and
# the resulting scores change from run to run.
rf = RandomForestRegressor(random_state=42)
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100, cv=3,
                               verbose=2, random_state=42, n_jobs=-1)
# Fit the random search model
rf_random.fit(X_train, y_train)

In [None]:
# Display the hyperparameter combination selected by the randomized search
rf_random.best_params_

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# Evaluate the best estimator found by the search on the validation subset
model = rf_random.best_estimator_
predictions = model.predict(X_valid)

# scikit-learn metrics take their arguments in (y_true, y_pred) order
mae = mean_absolute_error(y_valid, predictions)
# Root mean squared error. Taking the square root explicitly avoids the
# `squared=False` argument, which was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_valid, predictions))
# Legacy name kept for backward compatibility: this value was always the RMSE, not the MSE
mse = rmse

print(f"MAE:  {mae}")
print(f"RMSE: {rmse}")
