In [1]:
import os
import pickle
import optuna
import numpy as np
import pandas as pd
from functions import regression, readSet, saveSet, corrMatrix
from tqdm.notebook import tqdm
# Hook tqdm into pandas so the `progress_*` variants of apply/map are
# available and can render a notebook progress bar.
tqdm.pandas()

In [2]:
# Dimensions that every candidate set in this notebook starts from.
pickedBase = list(readSet('./data/Dimensions-Picked-Base.txt'))

# NOTE: unpickling can execute arbitrary code - only load files produced by
# this project. The context manager closes the file handle deterministically
# (the previous bare `open(...)` left it to the garbage collector).
with open('./data/Dimensions-Correlated-Groups.pickle', 'rb') as groups_file:
    groups = pickle.load(groups_file)

# One list of candidate dimensions per category
# (presumably groups of mutually correlated dimensions, per the file name).
baseGroup = groups['base']
lexicalGroup = groups['lexical']
syntacticGroup = groups['syntactic']

In [18]:
# Name of the dataset folder under ./data that this run analyses.
dataset = 'AirlineTweets'
dims = pd.read_csv(f'./data/{dataset}/Dimensions.csv')
prediction = pd.read_csv(f'./data/{dataset}/Word2Vec-Prediction.csv')

# Binary target: 1 where the predicted label equals the true label, else 0.
# A single column-wise comparison replaces the per-row Python lambda; the
# result is the same int64 Series with the same index, computed without
# iterating over every row.
predictCorrect = (prediction['truth'] == prediction['predicted']).astype('int64')

  0%|          | 0/50000 [00:00<?, ?it/s]

In [12]:
def objective(trial):
    """Evaluate one lexical/syntactic dimension pair for the Optuna study.

    The trial chooses one entry of ``lexicalGroup`` ('lexicalPick') and one
    entry of ``syntacticGroup`` ('syntacticPick'). Both are appended to
    ``pickedBase`` and the matching columns of ``dims`` are passed, together
    with ``predictCorrect``, to ``regression``.

    Returns:
        A 2-tuple of objective values: the fit's ``prsquared`` and the number
        of entries in ``pvalues`` that are below 0.1.

    NOTE(review): two earlier experiments are currently disabled - tuning a
    'basePick' from ``baseGroup``, and scoring by the summed absolute
    coefficients of the significant terms.
    """
    picks = {
        name: trial.suggest_categorical(name, choices)
        for name, choices in (
            ('lexicalPick', lexicalGroup),
            ('syntacticPick', syntacticGroup),
        )
    }
    columns = [*pickedBase, picks['lexicalPick'], picks['syntacticPick']]

    fit = regression(predictCorrect, dims[columns])
    # 1 for every term whose p-value is under the 0.1 threshold, 0 otherwise.
    significant = pd.Series(fit.pvalues).lt(0.1).astype('int64')
    return fit.prsquared, np.sum(significant)

In [13]:
# Multi-objective study: both values returned by `objective` are maximised.
# NSGA-II is the sampler Optuna already picks by default for multi-objective
# studies; it is created explicitly here only to pin a seed, so that
# Restart & Run All proposes the same sequence of trials every time.
study = optuna.create_study(
    directions=['maximize', 'maximize'],
    sampler=optuna.samplers.NSGAIISampler(seed=42),
)

[32m[I 2021-05-03 17:58:10,302][0m A new study created in memory with name: no-name-fe47224a-b626-42be-93f9-5cea9ddcb69e[0m


In [14]:
# Run the search: 100 calls to `objective`, one regression fit per call.
study.optimize(func=objective, n_trials=100)

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))
[32m[I 2021-05-03 17:58:11,352][0m Trial 0 finished with values: [0.10217180301487538, 13.0] and parameters: {'lexicalPick': 'nrSyllables/nrWords', 'syntacticPick': 'nrLetters/nrSentences'}. [0m
[32m[I 2021-05-03 17:58:11,477][0m Trial 1 finished with values: [0.10520933719332826, 13.0] and parameters: {'lexicalPick': 'nrSyllables/nrWords', 'syntacticPick': 'nrWords/nrSentences'}. [0m
[32m[I 2021-05-03 17:58:11,611][0m Trial 2 finished with values: [0.09830598587137629, 12.0] and parameters: {'lexicalPick': 'nrPolySyllables/nrWords', 'syntacticPick': 'nrLongWords/nrSentences'}. [0m
[32m[I 2021-05-03 17:58:11,783][0m Trial 3 finished with values: [0.10086350983823766, 13.0] and parameters: {'lexicalPick': 'nrLongWords/nrWords', 'syntacticPick': 'nrBiSyllables/nrSentences'}. [0m
[32m[I 2021-05-03 17:58:11,933][0m Trial 4 finished with values: [0.10534903883122981, 12.0] and parameters: {'lexical

In [7]:
# Persist the study next to the dataset. The context manager guarantees the
# file is flushed and closed as soon as the dump finishes (the previous bare
# `open(...)` left the handle open until garbage collection).
with open(f'./data/{dataset}/Dimensions-Optuna.pickle', mode='wb') as study_file:
    pickle.dump(study, study_file)

In [4]:
# Optional: uncomment to reload a previously saved study instead of
# re-running the optimisation.
#study = pickle.load(open(f'./data/{dataset}/Dimensions-Optuna.pickle', mode='rb'))

In [27]:
trials = study.trials

# Trials that did not finish (failed or interrupted) have `values` set to
# None; indexing into that would raise a TypeError, so they are skipped.
finishedTrials = [t for t in trials if t.values is not None]

# One row per finished trial:
#   params - the sampled {'lexicalPick': ..., 'syntacticPick': ...} dict
#   r2     - first objective value returned by `objective`
#   coefs  - second objective value returned by `objective`
bests = pd.DataFrame({
    'params': [t.params for t in finishedTrials],
    'r2': [t.values[0] for t in finishedTrials],
    'coefs': [t.values[1] for t in finishedTrials],
})

In [29]:
# idxmax() returns an index *label*, so the row has to be fetched with the
# label-based .loc. Positional .iloc only gave the same row because `bests`
# happens to carry a default 0..n-1 index.
bestR2 = bests.loc[bests['r2'].idxmax()]
bestCoef = bests.loc[bests['coefs'].idxmax()]

# Show the trial with the highest r2 first, then the one with the highest
# 'coefs' value.
for winner in (bestR2, bestCoef):
    print(winner['params'])
    print(winner['r2'])
    print(winner['coefs'])

{'lexicalPick': 'nrBiSyllables/nrWords', 'syntacticPick': 'nrMonoSyllables/nrSentences'}
0.10756773753788584
12.0
{'lexicalPick': 'nrLetters/nrWords', 'syntacticPick': 'nrLongWords/nrSentences'}
0.10146419167446685
14.0


In [30]:
# Keep only the trials whose 'coefs' value is at least that of `bestCoef`,
# then take the one with the largest r2 among them.
coefs = bests[bests['coefs'].ge(bestCoef['coefs'])]
best = coefs.sort_values(by='r2', ascending=False).iloc[0]

# Final dimension list: the base picks followed by the winning trial's
# parameter values, in the order they are stored in its params dict.
pickedDimensions = [*pickedBase, *best['params'].values()]

for field in ('params', 'r2', 'coefs'):
    print(best[field])

{'lexicalPick': 'nrLetters/nrWords', 'syntacticPick': 'nrWords/nrSentences'}
0.10607852770807502
14.0


In [31]:
# Write the final dimension list (`pickedDimensions`) to disk.
# NOTE(review): `saveSet` comes from the project's `functions` module; whether
# it preserves element order or de-duplicates is not visible here - confirm.
saveSet('./data/Dimensions-Picked-Final.txt', pickedDimensions)

In [32]:
# Parameter importances with respect to the first objective value (r2).
fig = optuna.visualization.plot_param_importances(
    study,
    target=lambda trial: trial.values[0],
    target_name="R2",
)
fig.show()

In [33]:
# Parameter importances with respect to the second objective value
# (the count returned by `objective`).
fig = optuna.visualization.plot_param_importances(
    study,
    target=lambda trial: trial.values[1],
    target_name="Coefs",
)
fig.show()