In [10]:
# Revisiting my small exciplex data set, using the k-Nearest Neighbors approach 

import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn import neighbors
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_validate
import pandas as pd
from sklearn.utils import shuffle

In [24]:
def procData(X, y): 
    n_neighbors = 5
    knn = neighbors.KNeighborsRegressor(n_neighbors, weights='uniform')
    scaler = MinMaxScaler(feature_range=(0, 1))
    X = scaler.fit_transform(X)
    seed = 42
    X, y = shuffle(X, y, random_state=seed)
    scoring = {
               'abs_error': 'neg_mean_absolute_error',
               'squared_error': 'neg_mean_squared_error'}
    scores = cross_validate(knn, X, y, cv=5, scoring=scoring, return_train_score=True)
    return abs(scores['test_abs_error'].mean()), math.sqrt(abs(scores['test_squared_error'].mean()))

In [25]:
# read data into pandas dataframe 
fname = 'wnDimers.dat'
datTable = pd.read_table(fname, header=None, delim_whitespace=True, comment='#')

In [26]:
# First, try out some simple features: complex overlap and mass 
massOverlap = datTable.iloc[:,[3,4]]
emission = datTable.iloc[:,13]

In [28]:
mae, rmse = procData(massOverlap, emission)

  return self.partial_fit(X, y)


In [29]:
print(mae) 

4383.56019


In [30]:
# Okay, that was better than the SVR model! Can we do even better considering other features? 

overlapHomoLumo = datTable.iloc[:,[4,9,10]]

In [31]:
procData(overlapHomoLumo, emission)

  return self.partial_fit(X, y)


(4681.4489980000017, 6187.705482077638)

In [32]:
# No, that doesn't go so well. Just HOMO-LUMO gap? 
homoLumo = datTable.iloc[:,[9,10]]

In [33]:
procData(homoLumo, emission)

(4744.0117720000007, 6074.713804016372)

In [34]:
# Nope! This is kind of weird, because for simple linear regression, HOMO-LUMO gap 
# is the best predictor we have. 

# suppose we throw all of the features at it? 
procData(datTable.iloc[:,1:12], emission)

  return self.partial_fit(X, y)


(4448.2386439999991, 6065.961034182164)