In [1]:
import os
import pickle
import optuna
import numpy as np
import pandas as pd
from functions import regression, readSet, saveSet
from tqdm.notebook import tqdm
tqdm.pandas()

In [2]:
# Dimensions that are always included; materialised as a list so it can be
# concatenated with further picks later on.
pickedBase = list(readSet('./data/Dimensions-Picked-Base.txt'))

# Open the pickle through a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('./data/Dimensions-Correlated-Groups.pickle', 'rb') as groupsFile:
    groups = pickle.load(groupsFile)

# Candidate lists, one per group key stored in the pickle.
baseGroup = groups['base']
lexicalGroup = groups['lexical']
syntacticGroup = groups['syntactic']

In [3]:
# Name of the dataset sub-directory under ./data that this run works on.
dataset = 'AirlineTweets'
dims = pd.read_csv(f'./data/{dataset}/Dimensions.csv')
prediction = pd.read_csv(f'./data/{dataset}/Word2Vec-Prediction.csv')

# 1 where the predicted label equals the ground truth, 0 otherwise.
# A vectorised column comparison yields the same 0/1 integer Series as a
# row-wise apply, without running a Python lambda for every row.
predictCorrect = (prediction['truth'] == prediction['predicted']).astype('int64')

  0%|          | 0/11541 [00:00<?, ?it/s]

In [4]:
def objective(trial):
    """Score one candidate set of dimensions for the Optuna study.

    One entry is drawn from each of the module-level candidate lists
    (``baseGroup``, ``lexicalGroup``, ``syntacticGroup``) and appended to
    ``pickedBase``. ``predictCorrect`` is then passed to the project helper
    ``regression`` together with those columns of ``dims``.

    Args:
        trial: the Optuna trial used to draw the three categorical picks.

    Returns:
        A pair ``(prsquared, count)``: ``prsquared`` is read from the
        regression result and ``count`` is the number of its p-values
        that are below 0.1.
    """
    picks = [
        trial.suggest_categorical(paramName, choices)
        for paramName, choices in (
            ('basePick', baseGroup),
            ('lexicalPick', lexicalGroup),
            ('syntacticPick', syntacticGroup),
        )
    ]

    fit = regression(predictCorrect, dims[pickedBase + picks])
    # NaN p-values compare as False, so they are not counted as significant.
    significant = pd.Series(fit.pvalues) < 0.1
    return fit.prsquared, np.sum(significant)

In [5]:
# Two objectives, both maximised. The sampler is given explicitly with a fixed
# seed so that a fresh kernel reproduces the same sequence of trials;
# NSGAIISampler is what Optuna selects by default for multi-objective studies.
study = optuna.create_study(
    directions=['maximize', 'maximize'],
    sampler=optuna.samplers.NSGAIISampler(seed=42),
)

[32m[I 2021-04-29 21:44:50,294][0m A new study created in memory with name: no-name-baa67e57-ea15-47b6-bbdc-4cbdd1e23c61[0m


In [6]:
# Number of objective evaluations to run in this study.
nTrials = 100
study.optimize(objective, n_trials=nTrials)

  return np.sum(np.log(self.cdf(q*np.dot(X,params))))
[32m[I 2021-04-29 21:44:56,553][0m Trial 38 finished with values: [0.10937155560474687, 16.0] and parameters: {'basePick': 'nrLongWords', 'lexicalPick': 'nrLongWords/nrWords', 'syntacticPick': 'nrLetters/nrSentences'}. [0m
[32m[I 2021-04-29 21:44:56,695][0m Trial 39 finished with values: [0.11640954636243717, 12.0] and parameters: {'basePick': 'nrMonoSyllables', 'lexicalPick': 'nrMonoSyllables/nrWords', 'syntacticPick': 'nrSyllables/nrSentences'}. [0m
[32m[I 2021-04-29 21:44:56,849][0m Trial 40 finished with values: [0.1142225121817777, 13.0] and parameters: {'basePick': 'nrSyllables', 'lexicalPick': 'nrPolySyllables/nrWords', 'syntacticPick': 'nrMonoSyllables/nrSentences'}. [0m
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))
[32m[I 2021-04-29 21:44:57,000][0m Trial 41 finished with values: [0.1055619616225274, 15.0] and parameters: {'basePick': 'nrBiSyllables', 'lexicalPick': 'nrMonoSyllab

In [7]:
# Persist the finished study next to the dataset it was computed on. The context
# manager guarantees the file is flushed and closed once the dump completes.
with open(f'./data/{dataset}/Dimensions-Optuna.pickle', mode='wb') as studyFile:
    pickle.dump(study, studyFile)

In [8]:
# One row per trial in study.best_trials: its parameters and both objective values.
trials = study.best_trials
bests = pd.DataFrame({
    'params': [t.params for t in trials],
    # first objective value
    'r2': [t.values[0] for t in trials],
    # second objective value
    'coefs': [t.values[1] for t in trials],
})

In [9]:
# idxmax() returns an index *label*, so the matching row is fetched with .loc
# (label-based) rather than .iloc (position-based); the two only coincide
# while the frame keeps its default 0..n-1 index.
bestR2 = bests.loc[bests['r2'].idxmax()]
bestCoefs = bests.loc[bests['coefs'].idxmax()]

# Report the row with the highest 'r2' first, then the row with the highest 'coefs'.
for best in (bestR2, bestCoefs):
    print(best['params'])
    print(best['r2'])
    print(best['coefs'])

# Rows whose 'r2' exceeds 0.14; not referenced by any other visible cell.
b2 = bests[bests['r2'] > 0.14]

{'basePick': 'nrMonoSyllables', 'lexicalPick': 'nrLongWords/nrWords', 'syntacticPick': 'nrWords/nrSentences'}
0.11844354409885649
12.0
{'basePick': 'nrLongWords', 'lexicalPick': 'nrLetters/nrWords', 'syntacticPick': 'nrMonoSyllables/nrSentences'}
0.11243723469127231
16.0


In [10]:
# Base dimensions followed by the parameter values of the bestCoefs row.
pickedDimensions = [*pickedBase, *bestCoefs['params'].values()]

In [11]:
# Hand the final dimension list to the project helper `saveSet`
# (presumably it persists the list at the given path - confirm in functions.py).
finalDimensionsPath = './data/Dimensions-Picked-Final.txt'
saveSet(finalDimensionsPath, pickedDimensions)

In [12]:
# Parameter importances with respect to the first objective value.
fig = optuna.visualization.plot_param_importances(
    study,
    target=lambda trial: trial.values[0],
    target_name="R2",
)
fig.show()

In [13]:
# Parameter importances with respect to the second objective value.
fig = optuna.visualization.plot_param_importances(
    study,
    target=lambda trial: trial.values[1],
    target_name="Coefs",
)
fig.show()