In [0]:
cd drive/My\ Drive/first_impressions_data

/content/drive/My Drive/first_impressions_data


In [0]:
cd transcriptions/

/content/drive/My Drive/first_impressions_data/transcriptions


# Imports

In [0]:
import pandas as pd
import numpy as np
import nltk
import random
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
from sklearn.svm import SVC
from statistics import mode
from nltk.tokenize import word_tokenize
import re
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.svm import SVR

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


# Loading in the CSV and values


In [0]:
training_df = pd.read_csv('training_df.csv')
validation_df = pd.read_csv('validation_df.csv')
test_df = pd.read_csv('test_df.csv')

In [0]:
training_data = training_df[['transcript', 'interview', 'openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']].values
validation_data = validation_df[['transcript', 'interview', 'openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']].values
test_data = test_df[['transcript', 'interview', 'openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']].values

In [0]:
training_iocean = training_data[:, 1:]
validation_iocean = validation_data[:, 1:]
test_iocean = test_data[:, 1:]

# Cleaning text

In [0]:
del_index = []
for i in range(len(training_data)):
  if not isinstance(training_data[i, 0], str):
    del_index.append(i)

training_data = np.delete(training_data, del_index, axis = 0)
print(len(del_index))

del_index = []
for i in range(len(test_data)):
  if not isinstance(test_data[i, 0], str):
    del_index.append(i)

test_data = np.delete(test_data, del_index, axis = 0)
print(len(del_index))

8
3


In [0]:
print(len(training_data))
print(len(validation_data))
print(len(test_data))

print(training_data[0])
print(validation_data[0])
print(test_data[0])

complete_data = np.concatenate((training_data, validation_data), axis = 0)
complete_data = np.concatenate((complete_data, test_data), axis = 0)

print(complete_data.shape)

5992
2000
1997
['He\'s cutting it and then turn around and see the end result, but I\'m glad he didn\'t do that because I probably would\'ve lost my mind. As it was getting cut, I was just excited. I saw the snippets of hair falling to the floor and I was like, "Yes!"'
 0.5046728971962616 0.4888888888888889 0.6019417475728155
 0.5233644859813085 0.6263736263736264 0.5520833333333334]
["What is your favorite [sport 00:30] and why? And another one is do you ... I'll do the second question afterwards. So sports do I play. I played ... Now in my time, before I did YouTube, before I ended up-"
 0.616822429906542 0.5555555555555556 0.6407766990291262
 0.6448598130841123 0.6153846153846153 0.59375]
["... Going nuts from another room, run in there to check, there's no [inaudible 00:00:37], but it was like the [scissors 00:00:35] aren't there. Now maybe I'm just not sleeping enough that I moved the scissors somewhere, but I swear the-"
 0.6261682242990654 0.8222222222222221 0.6699029126213591
 

In [0]:
all_words = []
documents = []

from nltk.corpus import stopwords
import re

stop_words = list(set(stopwords.words('english')))
stop_words.append('inaudible')
stop_words.append('du')

#  j is adject, r is adverb, and v is verb
#allowed_word_types = ["J","R","V"]
allowed_word_types = ["J", "V","R", "N"]

k = 0

for p in complete_data:
    
    # create a list of tuples where the first element of each tuple is a review
    # the second element is the label
    
    # remove punctuations

    p[0] = p[0].lower()

    cleaned = re.sub(r'[^(a-zA-Z)\s]','', p[0])
    
    # tokenize 
    tokenized = word_tokenize(cleaned)
    
    # remove stopwords 
    stopped = [w for w in tokenized if not w in stop_words]
    
    # parts of speech tagging for each word 
    part_of_speech = nltk.pos_tag(stopped)
    
    # make a list of  all adjectives identified by the allowed word types list above
    for w in part_of_speech:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())

# Save top 5000 words as features

In [0]:
BOW = nltk.FreqDist(all_words)
word_features = list(BOW.keys())[:5000]
word_features[0], word_features[-1]

('hes', 'leaker')

In [0]:
save_word_features = open("word_features5k.pickle","wb")
pickle.dump(word_features, save_word_features)
save_word_features.close()

# Form features

In [0]:
def find_features(document):
    words = word_tokenize(document)
    features = np.zeros(5000)
    for i in range(5000):
        if word_features[i] in words:
          features[i] = 1

    return features

# Creating features for each review
featuresets = [(find_features(rev)) for (rev, i, o, c, e, a, n) in complete_data]

featuresets = np.array(featuresets)
 
print(featuresets.shape)

(9989, 5000)


# Split into sets

In [0]:
training_set = featuresets[:5992]
validation_set = featuresets[5992:7992]
testing_set = featuresets[7992:]
print( 'training_set :', len(training_set), '\ntesting_set :', len(testing_set))

training_set : 5992 
testing_set : 1997


In [0]:
training_labels = complete_data[:5992, 1:7]
validation_labels = complete_data[5992:7992, 1:7]
testing_labels = complete_data[7992:, 1:7]


print(len(training_labels))
print(len(validation_labels))
print(len(testing_labels))

5992
2000
1997


# Shuffle

In [0]:
def unison_shuffled_copies(a, b):
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p]

In [0]:
training_set, training_labels = unison_shuffled_copies(training_set, training_labels)
validation_set, validation_labels = unison_shuffled_copies(validation_set, validation_labels)
testing_set, testing_labels = unison_shuffled_copies(testing_set, testing_labels)

# Interview

## Fine tuning for Random forest regressor

In [0]:
# Checking for max_depth

best_depth = 3
max_validation_score = 0
for i in range(3, 11):
  reg = RandomForestRegressor(max_depth = i)
  reg.fit(training_set, training_labels[:, 0])

  training_preds = reg.predict(training_set)
  validation_preds = reg.predict(validation_set)

  validation_score = 1-mean_absolute_error(validation_labels[:, 0], validation_preds)

  print("MAX DEPTH " + str(i))
  print("Training score " + str(1-mean_absolute_error(training_labels[:, 0], training_preds)))
  print("Validation score " + str(validation_score))
  print("----------------------------------------------")

  if(validation_score > max_validation_score):
    best_depth = i
    max_validation_score = validation_score

MAX DEPTH 3
Training score 0.8802385229661867
Validation score 0.8833631670682779
----------------------------------------------
MAX DEPTH 4
Training score 0.8806510969455896
Validation score 0.8834569781590187
----------------------------------------------
MAX DEPTH 5
Training score 0.8811427010714719
Validation score 0.8834578295574579
----------------------------------------------
MAX DEPTH 6
Training score 0.8816164708594518
Validation score 0.8835505818944033
----------------------------------------------
MAX DEPTH 7
Training score 0.882231587701089
Validation score 0.8835875259675396
----------------------------------------------
MAX DEPTH 8
Training score 0.882621489584021
Validation score 0.883586853635701
----------------------------------------------
MAX DEPTH 9
Training score 0.8833442898803436
Validation score 0.8835670157917902
----------------------------------------------
MAX DEPTH 10
Training score 0.883998235403455
Validation score 0.88374933134061
--------------------

In [0]:
#checking for best criterion
criterion = ['mse', 'mae']
best_criterion = 'mse'
max_validation_score = 0

for c in criterion:
  reg = RandomForestRegressor(max_depth = best_depth, criterion = c)
  reg.fit(training_set, training_labels[:, 0])

  training_preds = reg.predict(training_set)
  validation_preds = reg.predict(validation_set)

  validation_score = 1-mean_absolute_error(validation_labels[:, 0], validation_preds)

  print("CRITERION: " + c)
  print("Training score " + str(1-mean_absolute_error(training_labels[:, 0], training_preds)))
  print("Validation score " + str(validation_score))
  print("----------------------------------------------")

  if(validation_score > max_validation_score):
    best_criterion = c
    max_validation_score = validation_score

CRITERION: mse
Training score 0.8840966724174258
Validation score 0.8837280966987708
----------------------------------------------


KeyboardInterrupt: ignored

## Test score Random Forest Regressor

In [0]:
reg = RandomForestRegressor(max_depth = best_depth, criterion = best_criterion)
reg.fit(np.concatenate((training_set, validation_set), axis = 0), np.concatenate((training_labels[:, 0], validation_labels[:, 0])))

testing_preds = reg.predict(testing_set)

testing_score = 1-mean_absolute_error(testing_labels[:, 0], testing_preds)

print("Test score " + str(testing_score))
print("----------------------------------------------")

Test score 0.8823983966549148
----------------------------------------------


## Fine tuning SVR

In [0]:
best_kernel = 'rbf'
kernels = ['rbf', 'linear']
max_validation_score = 0
for k in kernels:
  reg = SVR(kernel = k)
  reg.fit(training_set, training_labels[:, 0])

  training_preds = reg.predict(training_set)
  validation_preds = reg.predict(validation_set)

  validation_score = 1-mean_absolute_error(validation_labels[:, 0], validation_preds)

  print("Training score " + str(1-mean_absolute_error(training_labels[:, 0], training_preds)))
  print("Validation score " + str(validation_score))
  print("----------------------------------------------")

  if(validation_score > max_validation_score):
    best_kernel = k
    max_validation_score = validation_score

Training score 0.924658678302989
Validation score 0.8844939174494675
----------------------------------------------
Training score 0.9118289303245998
Validation score 0.8398228863323203
----------------------------------------------


In [0]:
best_C = 1
regu = [0.01, 0.1, 1, 10, 100]
max_validation_score = 0
for c in regu:
  reg = SVR(kernel = best_kernel, C = c)
  reg.fit(training_set, training_labels[:, 0])

  training_preds = reg.predict(training_set)
  validation_preds = reg.predict(validation_set)

  validation_score = 1-mean_absolute_error(validation_labels[:, 0], validation_preds)

  print("Training score " + str(1-mean_absolute_error(training_labels[:, 0], training_preds)))
  print("Validation score " + str(validation_score))
  print("----------------------------------------------")

  if(validation_score > max_validation_score):
    best_C = c
    max_validation_score = validation_score

Training score 0.8870670416359114
Validation score 0.8861320523167111
----------------------------------------------
Training score 0.9078512899780821
Validation score 0.8865842759031455
----------------------------------------------
Training score 0.924658678302989
Validation score 0.8844939174494675
----------------------------------------------
Training score 0.9247812323523613
Validation score 0.8823787257781182
----------------------------------------------
Training score 0.9247733977047906
Validation score 0.8823669543653669
----------------------------------------------


## Test score SVR

In [0]:
reg = SVR(kernel = best_kernel, C = best_C)
reg.fit(np.concatenate((training_set, validation_set), axis = 0), np.concatenate((training_labels[:, 0], validation_labels[:, 0])))

testing_preds = reg.predict(testing_set)

testing_score = 1-mean_absolute_error(testing_labels[:, 0], testing_preds)

print("Test score " + str(testing_score))
print("----------------------------------------------")

Test score 0.8862301635280097
----------------------------------------------


# Openness

## Fine tuning for Random forest regressor

In [0]:
# Checking for max_depth

best_depth = 3
max_validation_score = 0
for i in range(3, 11):
  reg = RandomForestRegressor(max_depth = i)
  reg.fit(training_set, training_labels[:, 1])

  training_preds = reg.predict(training_set)
  validation_preds = reg.predict(validation_set)

  validation_score = 1-mean_absolute_error(validation_labels[:, 1], validation_preds)

  print("MAX DEPTH " + str(i))
  print("Training score " + str(1-mean_absolute_error(training_labels[:, 1], training_preds)))
  print("Validation score " + str(validation_score))
  print("----------------------------------------------")

  if(validation_score > max_validation_score):
    best_depth = i
    max_validation_score = validation_score

MAX DEPTH 3
Training score 0.8838694853590445
Validation score 0.8852397146933799
----------------------------------------------
MAX DEPTH 4
Training score 0.8843574682655615
Validation score 0.8853475261854776
----------------------------------------------
MAX DEPTH 5
Training score 0.8849839242577318
Validation score 0.8854609610086985
----------------------------------------------
MAX DEPTH 6
Training score 0.8854618785001244
Validation score 0.8854774689718037
----------------------------------------------
MAX DEPTH 7
Training score 0.8861351802153844
Validation score 0.8854594997096183
----------------------------------------------
MAX DEPTH 8
Training score 0.8867246805379829
Validation score 0.8856104445028696
----------------------------------------------
MAX DEPTH 9
Training score 0.8874785467618422
Validation score 0.8855439891500844
----------------------------------------------
MAX DEPTH 10
Training score 0.8882648395862135
Validation score 0.8858409280851925
--------------

In [0]:
#checking for best criterion
criterion = ['mse', 'mae']
best_criterion = 'mse'
max_validation_score = 0

for c in criterion:
  reg = RandomForestRegressor(max_depth = best_depth, criterion = c)
  reg.fit(training_set, training_labels[:, 0])

  training_preds = reg.predict(training_set)
  validation_preds = reg.predict(validation_set)

  validation_score = 1-mean_absolute_error(validation_labels[:, 0], validation_preds)

  print("CRITERION: " + c)
  print("Training score " + str(1-mean_absolute_error(training_labels[:, 0], training_preds)))
  print("Validation score " + str(validation_score))
  print("----------------------------------------------")

  if(validation_score > max_validation_score):
    best_criterion = c
    max_validation_score = validation_score

## Test score Random Forest Regressor

In [0]:
reg = RandomForestRegressor(max_depth = best_depth, criterion = best_criterion)
reg.fit(np.concatenate((training_set, validation_set), axis = 0), np.concatenate((training_labels[:, 1], validation_labels[:, 1])))

testing_preds = reg.predict(testing_set)

testing_score = 1-mean_absolute_error(testing_labels[:, 1], testing_preds)

print("Test score " + str(testing_score))
print("----------------------------------------------")

Test score 0.8850245242242204
----------------------------------------------


## Fine tuning SVR

In [0]:
best_kernel = 'rbf'
kernels = ['rbf', 'linear']
max_validation_score = 0
for k in kernels:
  reg = SVR(kernel = k)
  reg.fit(training_set, training_labels[:, 1])

  training_preds = reg.predict(training_set)
  validation_preds = reg.predict(validation_set)

  validation_score = 1-mean_absolute_error(validation_labels[:, 1], validation_preds)

  print("Training score " + str(1-mean_absolute_error(training_labels[:, 1], training_preds)))
  print("Validation score " + str(validation_score))
  print("----------------------------------------------")

  if(validation_score > max_validation_score):
    best_kernel = k
    max_validation_score = validation_score

Training score 0.9257711409397875
Validation score 0.8841101996112208
----------------------------------------------
Training score 0.9128971308217052
Validation score 0.843294717386444
----------------------------------------------


In [0]:
best_C = 1
regu = [0.01, 0.1, 1, 10, 100]
max_validation_score = 0
for c in regu:
  reg = SVR(kernel = best_kernel, C = c)
  reg.fit(training_set, training_labels[:, 1])

  training_preds = reg.predict(training_set)
  validation_preds = reg.predict(validation_set)

  validation_score = 1-mean_absolute_error(validation_labels[:, 1], validation_preds)

  print("Training score " + str(1-mean_absolute_error(training_labels[:, 1], training_preds)))
  print("Validation score " + str(validation_score))
  print("----------------------------------------------")

  if(validation_score > max_validation_score):
    best_C = c
    max_validation_score = validation_score

Training score 0.8893890498073392
Validation score 0.8873929322384199
----------------------------------------------
Training score 0.9101786921293789
Validation score 0.8869745138392506
----------------------------------------------
Training score 0.9257711409397875
Validation score 0.8841101996112208
----------------------------------------------
Training score 0.9257041995296463
Validation score 0.8825725341141313
----------------------------------------------
Training score 0.9255796583903911
Validation score 0.88187531730268
----------------------------------------------


## Test score SVR

In [0]:
reg = SVR(kernel = best_kernel, C = best_C)
reg.fit(np.concatenate((training_set, validation_set), axis = 0), np.concatenate((training_labels[:, 1], validation_labels[:, 1])))

testing_preds = reg.predict(testing_set)

testing_score = 1-mean_absolute_error(testing_labels[:, 1], testing_preds)

print("Test score " + str(testing_score))
print("----------------------------------------------")

Test score 0.8873554295999133
----------------------------------------------


# Conc

## Fine tuning for Random forest regressor

In [0]:
# Checking for max_depth

best_depth = 3
max_validation_score = 0
for i in range(3, 11):
  reg = RandomForestRegressor(max_depth = i)
  reg.fit(training_set, training_labels[:, 2])

  training_preds = reg.predict(training_set)
  validation_preds = reg.predict(validation_set)

  validation_score = 1-mean_absolute_error(validation_labels[:, 2], validation_preds)

  print("MAX DEPTH " + str(i))
  print("Training score " + str(1-mean_absolute_error(training_labels[:, 2], training_preds)))
  print("Validation score " + str(validation_score))
  print("----------------------------------------------")

  if(validation_score > max_validation_score):
    best_depth = i
    max_validation_score = validation_score

MAX DEPTH 3
Training score 0.8751743443103762
Validation score 0.8745248863186674
----------------------------------------------
MAX DEPTH 4
Training score 0.8757346375115849
Validation score 0.8746955205752939
----------------------------------------------
MAX DEPTH 5
Training score 0.8762446673612514
Validation score 0.8746874092362185
----------------------------------------------
MAX DEPTH 6
Training score 0.8767772057227831
Validation score 0.8749083338315637
----------------------------------------------
MAX DEPTH 7
Training score 0.8773137175711131
Validation score 0.8750650484590539
----------------------------------------------
MAX DEPTH 8
Training score 0.8777931149834707
Validation score 0.8749870344429895
----------------------------------------------
MAX DEPTH 9
Training score 0.8784250933442558
Validation score 0.8750860611173932
----------------------------------------------
MAX DEPTH 10
Training score 0.879072976047128
Validation score 0.875041172198895
----------------

In [0]:
#checking for best criterion
criterion = ['mse', 'mae']
best_criterion = 'mse'
max_validation_score = 0

for c in criterion:
  reg = RandomForestRegressor(max_depth = best_depth, criterion = c)
  reg.fit(training_set, training_labels[:, 0])

  training_preds = reg.predict(training_set)
  validation_preds = reg.predict(validation_set)

  validation_score = 1-mean_absolute_error(validation_labels[:, 0], validation_preds)

  print("CRITERION: " + c)
  print("Training score " + str(1-mean_absolute_error(training_labels[:, 0], training_preds)))
  print("Validation score " + str(validation_score))
  print("----------------------------------------------")

  if(validation_score > max_validation_score):
    best_criterion = c
    max_validation_score = validation_score

## Test score Random Forest Regressor

In [0]:
reg = RandomForestRegressor(max_depth = best_depth, criterion = best_criterion)
reg.fit(np.concatenate((training_set, validation_set), axis = 0), np.concatenate((training_labels[:, 2], validation_labels[:, 2])))

testing_preds = reg.predict(testing_set)

testing_score = 1-mean_absolute_error(testing_labels[:, 2], testing_preds)

print("Test score " + str(testing_score))
print("----------------------------------------------")

Test score 0.8760602901596064
----------------------------------------------


## Fine tuning SVR

In [0]:
best_kernel = 'rbf'
kernels = ['rbf', 'linear']
max_validation_score = 0
for k in kernels:
  reg = SVR(kernel = k)
  reg.fit(training_set, training_labels[:, 2])

  training_preds = reg.predict(training_set)
  validation_preds = reg.predict(validation_set)

  validation_score = 1-mean_absolute_error(validation_labels[:, 2], validation_preds)

  print("Training score " + str(1-mean_absolute_error(training_labels[:, 2], training_preds)))
  print("Validation score " + str(validation_score))
  print("----------------------------------------------")

  if(validation_score > max_validation_score):
    best_kernel = k
    max_validation_score = validation_score

In [0]:
best_C = 1
regu = [0.01, 0.1, 1, 10, 100]
max_validation_score = 0
for c in regu:
  reg = SVR(kernel = best_kernel, C = c)
  reg.fit(training_set, training_labels[:, 2])

  training_preds = reg.predict(training_set)
  validation_preds = reg.predict(validation_set)

  validation_score = 1-mean_absolute_error(validation_labels[:, 2], validation_preds)

  print("Training score " + str(1-mean_absolute_error(training_labels[:, 2], training_preds)))
  print("Validation score " + str(validation_score))
  print("----------------------------------------------")

  if(validation_score > max_validation_score):
    best_C = c
    max_validation_score = validation_score

Training score 0.8818966964918611
Validation score 0.8776189057406053
----------------------------------------------
Training score 0.9056307187670537
Validation score 0.87873235029469
----------------------------------------------
Training score 0.9238430335726666
Validation score 0.8784393214518995
----------------------------------------------
Training score 0.9237523621889471
Validation score 0.8765956074277137
----------------------------------------------
Training score 0.9237523621889463
Validation score 0.8765956074277144
----------------------------------------------


## Test score SVR

In [0]:
reg = SVR(kernel = best_kernel, C = best_C)
reg.fit(np.concatenate((training_set, validation_set), axis = 0), np.concatenate((training_labels[:, 2], validation_labels[:, 2])))

testing_preds = reg.predict(testing_set)

testing_score = 1-mean_absolute_error(testing_labels[:, 2], testing_preds)

print("Test score " + str(testing_score))
print("----------------------------------------------")

Test score 0.8805763859258383
----------------------------------------------


# Extra

## Fine tuning for Random forest regressor

In [0]:
# Checking for max_depth

best_depth = 3
max_validation_score = 0
for i in range(3, 11):
  reg = RandomForestRegressor(max_depth = i)
  reg.fit(training_set, training_labels[:, 3])

  training_preds = reg.predict(training_set)
  validation_preds = reg.predict(validation_set)

  validation_score = 1-mean_absolute_error(validation_labels[:, 3], validation_preds)

  print("MAX DEPTH " + str(i))
  print("Training score " + str(1-mean_absolute_error(training_labels[:, 3], training_preds)))
  print("Validation score " + str(validation_score))
  print("----------------------------------------------")

  if(validation_score > max_validation_score):
    best_depth = i
    max_validation_score = validation_score

MAX DEPTH 3
Training score 0.8783010879731794
Validation score 0.8814384764364366
----------------------------------------------
MAX DEPTH 4
Training score 0.879030816481202
Validation score 0.8815508582784013
----------------------------------------------
MAX DEPTH 5
Training score 0.8796327323457737
Validation score 0.8817491745773992
----------------------------------------------
MAX DEPTH 6
Training score 0.8804358384029846
Validation score 0.881814587808113
----------------------------------------------
MAX DEPTH 7
Training score 0.8812833842214898
Validation score 0.8818173391460928
----------------------------------------------
MAX DEPTH 8
Training score 0.8821367172629486
Validation score 0.8820559110368774
----------------------------------------------
MAX DEPTH 9
Training score 0.8830264046048426
Validation score 0.8821417621078999
----------------------------------------------
MAX DEPTH 10
Training score 0.8839042786567791
Validation score 0.8820310078218387
----------------

In [0]:
#checking for best criterion
criterion = ['mse', 'mae']
best_criterion = 'mse'
max_validation_score = 0

for c in criterion:
  reg = RandomForestRegressor(max_depth = best_depth, criterion = c)
  reg.fit(training_set, training_labels[:, 0])

  training_preds = reg.predict(training_set)
  validation_preds = reg.predict(validation_set)

  validation_score = 1-mean_absolute_error(validation_labels[:, 0], validation_preds)

  print("CRITERION: " + c)
  print("Training score " + str(1-mean_absolute_error(training_labels[:, 0], training_preds)))
  print("Validation score " + str(validation_score))
  print("----------------------------------------------")

  if(validation_score > max_validation_score):
    best_criterion = c
    max_validation_score = validation_score

## Test score Random Forest Regressor

In [0]:
reg = RandomForestRegressor(max_depth = best_depth, criterion = best_criterion)
reg.fit(np.concatenate((training_set, validation_set), axis = 0), np.concatenate((training_labels[:, 3], validation_labels[:, 3])))

testing_preds = reg.predict(testing_set)

testing_score = 1-mean_absolute_error(testing_labels[:, 3], testing_preds)

print("Test score " + str(testing_score))
print("----------------------------------------------")

Test score 0.8805168483170316
----------------------------------------------


## Fine tuning SVR

In [0]:
best_kernel = 'rbf'
kernels = ['rbf', 'linear']
max_validation_score = 0
for k in kernels:
  reg = SVR(kernel = k)
  reg.fit(training_set, training_labels[:, 3])

  training_preds = reg.predict(training_set)
  validation_preds = reg.predict(validation_set)

  validation_score = 1-mean_absolute_error(validation_labels[:, 3], validation_preds)

  print("Training score " + str(1-mean_absolute_error(training_labels[:, 3], training_preds)))
  print("Validation score " + str(validation_score))
  print("----------------------------------------------")

  if(validation_score > max_validation_score):
    best_kernel = k
    max_validation_score = validation_score

In [0]:
best_C = 1
regu = [0.01, 0.1, 1, 10, 100]
max_validation_score = 0
for c in regu:
  reg = SVR(kernel = best_kernel, C = c)
  reg.fit(training_set, training_labels[:, 3])

  training_preds = reg.predict(training_set)
  validation_preds = reg.predict(validation_set)

  validation_score = 1-mean_absolute_error(validation_labels[:, 3], validation_preds)

  print("Training score " + str(1-mean_absolute_error(training_labels[:, 3], training_preds)))
  print("Validation score " + str(validation_score))
  print("----------------------------------------------")

  if(validation_score > max_validation_score):
    best_C = c
    max_validation_score = validation_score

Training score 0.8834759093679211
Validation score 0.8836444201236061
----------------------------------------------
Training score 0.9059247483314015
Validation score 0.8837652506430237
----------------------------------------------
Training score 0.9239878102181113
Validation score 0.8811863043753343
----------------------------------------------
Training score 0.924324712405039
Validation score 0.8792888597360552
----------------------------------------------
Training score 0.9243025959556647
Validation score 0.8792602742780753
----------------------------------------------


## Test score SVR

In [0]:
reg = SVR(kernel = best_kernel, C = best_C)
reg.fit(np.concatenate((training_set, validation_set), axis = 0), np.concatenate((training_labels[:, 3], validation_labels[:, 3])))

testing_preds = reg.predict(testing_set)

testing_score = 1-mean_absolute_error(testing_labels[:, 3], testing_preds)

print("Test score " + str(testing_score))
print("----------------------------------------------")

Test score 0.8831100070766744
----------------------------------------------


# Agree

## Fine tuning for Random forest regressor

In [0]:
# Checking for max_depth

best_depth = 3
max_validation_score = 0
for i in range(3, 11):
  reg = RandomForestRegressor(max_depth = i)
  reg.fit(training_set, training_labels[:, 4])

  training_preds = reg.predict(training_set)
  validation_preds = reg.predict(validation_set)

  validation_score = 1-mean_absolute_error(validation_labels[:, 4], validation_preds)

  print("MAX DEPTH " + str(i))
  print("Training score " + str(1-mean_absolute_error(training_labels[:, 4], training_preds)))
  print("Validation score " + str(validation_score))
  print("----------------------------------------------")

  if(validation_score > max_validation_score):
    best_depth = i
    max_validation_score = validation_score

MAX DEPTH 3
Training score 0.8925517204137972
Validation score 0.8991315423899242
----------------------------------------------
MAX DEPTH 4
Training score 0.8928825278234371
Validation score 0.8991407732773586
----------------------------------------------
MAX DEPTH 5
Training score 0.8932549541787956
Validation score 0.8991494543405618
----------------------------------------------
MAX DEPTH 6
Training score 0.8935960255217656
Validation score 0.8991893152449005
----------------------------------------------
MAX DEPTH 7
Training score 0.8939641448279094
Validation score 0.8991139592573967
----------------------------------------------
MAX DEPTH 8
Training score 0.8944602800482364
Validation score 0.8991228978919182
----------------------------------------------
MAX DEPTH 9
Training score 0.8949070128540133
Validation score 0.8990931742302308
----------------------------------------------
MAX DEPTH 10
Training score 0.895156489855345
Validation score 0.8991515945265055
---------------

In [0]:
#checking for best criterion
criterion = ['mse', 'mae']
best_criterion = 'mse'
max_validation_score = 0

for c in criterion:
  reg = RandomForestRegressor(max_depth = best_depth, criterion = c)
  reg.fit(training_set, training_labels[:, 0])

  training_preds = reg.predict(training_set)
  validation_preds = reg.predict(validation_set)

  validation_score = 1-mean_absolute_error(validation_labels[:, 0], validation_preds)

  print("CRITERION: " + c)
  print("Training score " + str(1-mean_absolute_error(training_labels[:, 0], training_preds)))
  print("Validation score " + str(validation_score))
  print("----------------------------------------------")

  if(validation_score > max_validation_score):
    best_criterion = c
    max_validation_score = validation_score

## Test score Random Forest Regressor

In [0]:
reg = RandomForestRegressor(max_depth = best_depth, criterion = best_criterion)
reg.fit(np.concatenate((training_set, validation_set), axis = 0), np.concatenate((training_labels[:, 4], validation_labels[:, 4])))

testing_preds = reg.predict(testing_set)

testing_score = 1-mean_absolute_error(testing_labels[:, 4], testing_preds)

print("Test score " + str(testing_score))
print("----------------------------------------------")

Test score 0.8940906560917239
----------------------------------------------


## Fine tuning SVR

In [0]:
best_kernel = 'rbf'
kernels = ['rbf', 'linear']
max_validation_score = 0
for k in kernels:
  reg = SVR(kernel = k)
  reg.fit(training_set, training_labels[:, 4])

  training_preds = reg.predict(training_set)
  validation_preds = reg.predict(validation_set)

  validation_score = 1-mean_absolute_error(validation_labels[:, 4], validation_preds)

  print("Training score " + str(1-mean_absolute_error(training_labels[:, 4], training_preds)))
  print("Validation score " + str(validation_score))
  print("----------------------------------------------")

  if(validation_score > max_validation_score):
    best_kernel = k
    max_validation_score = validation_score

In [28]:
best_C = 1
regu = [0.01, 0.1, 1, 10, 100]
max_validation_score = 0
for c in regu:
  reg = SVR(kernel = best_kernel, C = c)
  reg.fit(training_set, training_labels[:, 4])

  training_preds = reg.predict(training_set)
  validation_preds = reg.predict(validation_set)

  validation_score = 1-mean_absolute_error(validation_labels[:, 4], validation_preds)

  print("Training score " + str(1-mean_absolute_error(training_labels[:, 4], training_preds)))
  print("Validation score " + str(validation_score))
  print("----------------------------------------------")

  if(validation_score > max_validation_score):
    best_C = c
    max_validation_score = validation_score

Training score 0.8975713085479131
Validation score 0.9010766507381235
----------------------------------------------
Training score 0.9148860122561876
Validation score 0.9002332279533054
----------------------------------------------
Training score 0.9272940522751374
Validation score 0.8969719743040769
----------------------------------------------
Training score 0.9271861351519504
Validation score 0.8955871932403036
----------------------------------------------
Training score 0.8975713085479131
Validation score 0.9010766507381235
----------------------------------------------
Training score 0.9148860122561876
Validation score 0.9002332279533054
----------------------------------------------
Training score 0.9272940522751374
Validation score 0.8969719743040769
----------------------------------------------
Training score 0.9271861351519504
Validation score 0.8955871932403036
----------------------------------------------
Training score 0.9271743141796546
Validation score 0.89526348580

## Test score SVR

In [29]:
reg = SVR(kernel = best_kernel, C = best_C)
reg.fit(np.concatenate((training_set, validation_set), axis = 0), np.concatenate((training_labels[:, 4], validation_labels[:, 4])))

testing_preds = reg.predict(testing_set)

testing_score = 1-mean_absolute_error(testing_labels[:, 4], testing_preds)

print("Test score " + str(testing_score))
print("----------------------------------------------")

Test score 0.8969296713459083
----------------------------------------------
Test score 0.8969296713459083
----------------------------------------------


# Neuro

## Fine tuning for Random forest regressor

In [30]:
# Checking for max_depth

best_depth = 3
max_validation_score = 0
for i in range(3, 11):
  reg = RandomForestRegressor(max_depth = i)
  reg.fit(training_set, training_labels[:, 5])

  training_preds = reg.predict(training_set)
  validation_preds = reg.predict(validation_set)

  validation_score = 1-mean_absolute_error(validation_labels[:, 5], validation_preds)

  print("MAX DEPTH " + str(i))
  print("Training score " + str(1-mean_absolute_error(training_labels[:, 5], training_preds)))
  print("Validation score " + str(validation_score))
  print("----------------------------------------------")

  if(validation_score > max_validation_score):
    best_depth = i
    max_validation_score = validation_score

MAX DEPTH 3
Training score 0.8772249096470793
Validation score 0.8792383661302469
----------------------------------------------
MAX DEPTH 4
Training score 0.8777372378510517
Validation score 0.8794020695215408
----------------------------------------------
MAX DEPTH 5
Training score 0.8783204974267368
Validation score 0.8793888080598596
----------------------------------------------
MAX DEPTH 6
Training score 0.8789481300985269
Validation score 0.8793967419883749
----------------------------------------------
MAX DEPTH 7
Training score 0.8796507854755511
Validation score 0.8794043195649925
----------------------------------------------
MAX DEPTH 8
Training score 0.8802920018863691
Validation score 0.8795289276256125
----------------------------------------------
MAX DEPTH 9
Training score 0.8809874418203436
Validation score 0.8795592411988615
----------------------------------------------
MAX DEPTH 10
Training score 0.8818648816821696
Validation score 0.8795558422447348
--------------

In [0]:
#checking for best criterion
criterion = ['mse', 'mae']
best_criterion = 'mse'
max_validation_score = 0

for c in criterion:
  reg = RandomForestRegressor(max_depth = best_depth, criterion = c)
  reg.fit(training_set, training_labels[:, 0])

  training_preds = reg.predict(training_set)
  validation_preds = reg.predict(validation_set)

  validation_score = 1-mean_absolute_error(validation_labels[:, 0], validation_preds)

  print("CRITERION: " + c)
  print("Training score " + str(1-mean_absolute_error(training_labels[:, 0], training_preds)))
  print("Validation score " + str(validation_score))
  print("----------------------------------------------")

  if(validation_score > max_validation_score):
    best_criterion = c
    max_validation_score = validation_score

## Test score Random Forest Regressor

In [31]:
reg = RandomForestRegressor(max_depth = best_depth, criterion = best_criterion)
reg.fit(np.concatenate((training_set, validation_set), axis = 0), np.concatenate((training_labels[:, 5], validation_labels[:, 5])))

testing_preds = reg.predict(testing_set)

testing_score = 1-mean_absolute_error(testing_labels[:, 5], testing_preds)

print("Test score " + str(testing_score))
print("----------------------------------------------")

Test score 0.8785027439178016
----------------------------------------------


## Fine tuning SVR

In [0]:
best_kernel = 'rbf'
kernels = ['rbf', 'linear']
max_validation_score = 0
for k in kernels:
  reg = SVR(kernel = k)
  reg.fit(training_set, training_labels[:, 5])

  training_preds = reg.predict(training_set)
  validation_preds = reg.predict(validation_set)

  validation_score = 1-mean_absolute_error(validation_labels[:, 5], validation_preds)

  print("Training score " + str(1-mean_absolute_error(training_labels[:, 5], training_preds)))
  print("Validation score " + str(validation_score))
  print("----------------------------------------------")

  if(validation_score > max_validation_score):
    best_kernel = k
    max_validation_score = validation_score

In [32]:
best_C = 1
regu = [0.01, 0.1, 1, 10, 100]
max_validation_score = 0
for c in regu:
  reg = SVR(kernel = best_kernel, C = c)
  reg.fit(training_set, training_labels[:, 5])

  training_preds = reg.predict(training_set)
  validation_preds = reg.predict(validation_set)

  validation_score = 1-mean_absolute_error(validation_labels[:, 5], validation_preds)

  print("Training score " + str(1-mean_absolute_error(training_labels[:, 5], training_preds)))
  print("Validation score " + str(validation_score))
  print("----------------------------------------------")

  if(validation_score > max_validation_score):
    best_C = c
    max_validation_score = validation_score

Training score 0.8847502235895642
Validation score 0.8829206930106878
----------------------------------------------
Training score 0.9062611738089204
Validation score 0.8832059981079918
----------------------------------------------
Training score 0.9241835637517477
Validation score 0.8808185501883423
----------------------------------------------
Training score 0.924357385685793
Validation score 0.87837782171519
----------------------------------------------
Training score 0.9242242059128133
Validation score 0.8776862349756704
----------------------------------------------


## Test score SVR

In [33]:
reg = SVR(kernel = best_kernel, C = best_C)
reg.fit(np.concatenate((training_set, validation_set), axis = 0), np.concatenate((training_labels[:, 5], validation_labels[:, 5])))

testing_preds = reg.predict(testing_set)

testing_score = 1-mean_absolute_error(testing_labels[:, 5], testing_preds)

print("Test score " + str(testing_score))
print("----------------------------------------------")

Test score 0.8815757200688404
----------------------------------------------
