In [None]:
## install dependencies
!pip install fastai==1.0.61
!pip install spacy==2.3.5
!pip install bert-score
# !git clone https://github.com/google-research/bleurt.git
# %cd bleurt
# !pip install .
# %cd ..

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fastai==1.0.61
  Downloading fastai-1.0.61-py3-none-any.whl (239 kB)
[K     |████████████████████████████████| 239 kB 31.3 MB/s 
Collecting nvidia-ml-py3
  Downloading nvidia-ml-py3-7.352.0.tar.gz (19 kB)
Collecting bottleneck
  Downloading Bottleneck-1.3.5-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (355 kB)
[K     |████████████████████████████████| 355 kB 63.5 MB/s 
Building wheels for collected packages: nvidia-ml-py3
  Building wheel for nvidia-ml-py3 (setup.py) ... [?25l[?25hdone
  Created wheel for nvidia-ml-py3: filename=nvidia_ml_py3-7.352.0-py3-none-any.whl size=19191 sha256=4398c7d1aa1584b211b4e54c6d0da9b3357b9ef8727277a85c7bdd07d7eb779b
  Stored in directory: /root/.cache/pip/wheels/df/99/da/c34f202dc8fd1dffd35e0ecf1a7d7f8374ca05fbcbaf974b83
Successfully built nvidia-ml-py3
Installing collected packages: nvidia

In [None]:
## import libraries
from fastai import *
from fastai.text import * 
import pandas as pd
import re
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import bert_score
import copy
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import gensim.downloader as api
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import heapq

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def data_processing(dataset_file_name):
  """
  load the project-idea spreadsheet and return a cleaned dataframe

  Args:
    dataset_file_name: path of the excel file; it must contain a 'Project Ideas' column

  Returns:
    dataframe with two columns:
      'Project Ideas': lower-cased ideas that contain at least 3 words
      'Project_Ideas_without_stopwords': the same ideas with english stop words removed
        (can be used as an alternative training column for the model)
  """
  # a set gives constant-time membership checks while filtering every word of every idea
  stop = set(stopwords.words('english'))

  # read data into dataframe
  reference_data = pd.read_excel(dataset_file_name)

  # remove only the rows whose idea is missing; an empty cell in some other column
  # must not throw away an otherwise valid idea
  reference_data = reference_data.dropna(subset=['Project Ideas'])

  # convert to lower case (on a new frame, so the frame read from disk is never mutated in place)
  reference_data = reference_data.assign(**{'Project Ideas': reference_data['Project Ideas'].str.lower()})

  # drop duplicates
  reference_data = reference_data.drop_duplicates()

  # count words by splitting on whitespace so that repeated spaces do not inflate the count,
  # then keep the ideas with at least 3 words
  word_counts = reference_data['Project Ideas'].str.split().str.len()
  cleaned_df = pd.DataFrame(reference_data.loc[word_counts >= 3, 'Project Ideas'])

  # punctuation is kept on purpose: stripping it removes necessary punctuation too

  # create a new column with removing stop words (can be used as an alternative for the model instead of project ideas)
  cleaned_df['Project_Ideas_without_stopwords'] = cleaned_df['Project Ideas'].apply(
      lambda idea: ' '.join(word for word in idea.split() if word not in stop))
  return cleaned_df

In [None]:
def create_train_val(dataframe, save_df=False, valid_pct=0.3, seed=None):
  """
  randomly split the cleaned ideas into a training and a validation dataframe

  Args:
    dataframe: dataframe that contains a 'Project Ideas' column
    save_df: when True, pickle both splits to 'data/ml_train_df.pkl' and 'data/ml_valid_df.pkl'
    valid_pct: fraction of the rows used for validation (one extra row is always added to the validation split)
    seed: optional integer seed for a reproducible shuffle; None uses numpy's global random state

  Returns:
    (train_df, valid_df): two single-column ('Project Ideas') dataframes

  Raises:
    ValueError: if valid_pct is not within [0, 1]
  """
  if not 0 <= valid_pct <= 1:
    raise ValueError("valid_pct must be between 0 and 1")

  # create a random permutation of the data; a dedicated generator is used when a seed
  # is given so that the global numpy random state is left untouched
  if seed is None:
    order = np.random.permutation(len(dataframe))
  else:
    order = np.random.RandomState(seed).permutation(len(dataframe))
  randomized_df = dataframe.iloc[order]

  # define the cut according to the validation percentage
  cut = int(valid_pct * len(randomized_df)) + 1

  # create train and valid dataframes with the column you want the model to train with
  ideas = randomized_df['Project Ideas']
  train_df, valid_df = pd.DataFrame(ideas.iloc[cut:]), pd.DataFrame(ideas.iloc[:cut])
  print("Lenght of training df:", len(train_df))
  print("Lenght of validation df:", len(valid_df))

  # optionally save train and valid dfs
  if save_df:
    import os
    # to_pickle does not create missing folders, so make sure the target folder exists
    os.makedirs('data', exist_ok=True)
    train_df.to_pickle('data/ml_train_df.pkl')
    valid_df.to_pickle('data/ml_valid_df.pkl')

  return train_df, valid_df

In [None]:
def train_model(train, valid, column_name='Project Ideas', train_lrs=[0.0003, 0.0003, 0.0003, 0.0003], model_arch=AWD_LSTM, dropout=0.5, weight_decay=0.000009, final_lr=0.0005, epochs=20):
  """
  build a language-model learner on top of a pretrained architecture and fine-tune it

  the learner is warmed up with four single-cycle stages (one learning rate from
  train_lrs per stage, with the frozen boundary moved between stages) and is then
  trained for `epochs` epochs with `final_lr`

  Args:
    train: training dataframe
    valid: validation dataframe
    column_name: name of the text column the model is trained on
    train_lrs(list): the four learning rates of the warm-up stages, in stage order
    model_arch: pretrained language model used for transfer learning (valid values- AWD_LSTM, Transformer, TransformerXL)
      for more info on archs visit "https://fastai1.fast.ai/text.models.html#Transformer"
    dropout: passed to the learner as drop_mult, to prevent over-fitting
    weight_decay: weight decay applied during the final training only
    final_lr: learning rate of the final training
    epochs: number of epochs of the final training

  Returns:
    model: the trained learner (its encoder is also saved as 'ml_ft_enc')

  """

  # wrap the two dataframes into a language-model data bunch
  data_lm = TextLMDataBunch.from_df('./', train, valid, text_cols=column_name)

  # learner built on the requested architecture
  model = language_model_learner(data_lm, model_arch, drop_mult=dropout)

  # every warm-up stage = (message to print, layer adjustment to apply before fitting)
  warmup_stages = (
      ("Runing one cycle with lr=", lambda: None),
      ("Runing one cycle after unfreezing last two layers with lr=", lambda: model.freeze_to(-2)),
      ("Runing one cycle after unfreezing next layer with lr=", lambda: model.freeze_to(-3)),
      ("Runing one cycle after unfreezing whole thing with lr=", lambda: model.unfreeze()),
  )
  for stage_no, (announcement, adjust_layers) in enumerate(warmup_stages):
    print(announcement, train_lrs[stage_no])
    adjust_layers()
    model.fit_one_cycle(1, train_lrs[stage_no])

  # final training for epochs
  print("Final training for epochs=", epochs, " and lr=", final_lr)
  model.fit(epochs, lr=final_lr, wd=weight_decay)

  # keep the fine-tuned encoder on disk
  model.save_encoder('ml_ft_enc')
  return model

In [None]:
def generate(model, number_of_ideas=10):
  """
  generate new ideas using the trained model

  Args:
    model: trained learner; only its predict(text, n_words=..., temperature=...) method is used
    number_of_ideas: maximum number of ideas to return; it is also the maximum number of predict calls

  Returns:
    list of cleaned idea strings; it holds fewer than number_of_ideas entries (possibly none)
    when the predictions do not yield enough usable ideas
  """
  prompt = "xxbos xxfld 1"
  all_ideas = []
  if number_of_ideas <= 0:
    return all_ideas

  for _ in range(number_of_ideas):
    ideas_unclean = model.predict(prompt, n_words=20, temperature=0.8)
    # the generated text follows the prompt; skip the prediction if the prompt is not echoed back
    pieces = ideas_unclean.split(prompt + " ")
    if len(pieces) < 2:
      continue
    # each "xxbos" marker starts a new idea
    for idea in pieces[1].split("xxbos"):
      # drop a non-alphanumeric character together with the whitespace that follows it
      idea = re.sub(r'[^a-zA-Z0-9]\s+', '', idea.strip()).strip()
      # keep only ideas with more than one word
      if len(re.findall(r'\w+', idea)) > 1:
        all_ideas.append(idea)
        if len(all_ideas) == number_of_ideas:
          return all_ideas

  # not enough usable ideas were produced: return what was collected instead of None
  return all_ideas

In [None]:
# end-to-end run: clean the dataset, split it into train/validation and fine-tune the language model
ref_df = data_processing('final_dataset.xlsx')
# ref_df
# create_train_val is called without save_df, so the splits are not pickled
train_df, valid_df = create_train_val(ref_df)
# train_df
# train_model is called with its default column, architecture and hyper-parameters
model = train_model(train_df, valid_df)

Lenght of training df: 2874
Lenght of validation df: 1233


  return np.array(a, dtype=dtype, **kwargs)


Downloading https://s3.amazonaws.com/fast-ai-modelzoo/wt103-fwd.tgz


Runing one cycle with lr= 0.0003


epoch,train_loss,valid_loss,accuracy,time
0,7.510973,6.889823,0.06369,00:04


Runing one cycle after unfreezing last two layers with lr= 0.0003


epoch,train_loss,valid_loss,accuracy,time
0,7.128461,6.045261,0.102902,00:00


Runing one cycle after unfreezing next layer with lr= 0.0003


epoch,train_loss,valid_loss,accuracy,time
0,6.28219,5.301835,0.227083,00:01


Runing one cycle after unfreezing whole thing with lr= 0.0003


epoch,train_loss,valid_loss,accuracy,time
0,5.694114,4.935066,0.26369,00:01


Final training for epochs= 20  and lr= 0.0005


epoch,train_loss,valid_loss,accuracy,time
0,5.280511,4.698303,0.283184,00:01
1,5.16606,4.585348,0.29256,00:01
2,5.070783,4.456278,0.294048,00:01
3,4.97913,4.351498,0.304018,00:01
4,4.890827,4.286015,0.301488,00:01
5,4.80828,4.229205,0.303869,00:01
6,4.729751,4.159819,0.31689,00:01
7,4.652939,4.128695,0.313765,00:01
8,4.580229,4.099998,0.312202,00:01
9,4.512414,4.061121,0.313914,00:01


In [None]:
# sample ideas from the trained model (generate defaults to number_of_ideas=10) and display them
generated_ideas = generate(model)
generated_ideas

['of the machine learning to identify objects in explicit videos',
 'projecting calibration of a machine learning algorithms to predict',
 'under time varying economic impact',
 'automatic music generation and classification',
 'a method for world classification',
 'and other risk prediction',
 'question meter deep learning algorithms for predicting the outcome of baseball games',
 'and other neural networks',
 'using personalized architectures to enhance online review learning',
 'heart attack prediction in marketing']

## Metrics

In [None]:
# Use a CoLA classifier to judge the grammatical (linguistic) acceptability of each generated sentence.
# 1 means acceptable and 0 means unacceptable. It is not the best metric for our case: it returns 1 for a sentence that is grammatically correct
# even when the sentence does not make sense as a project idea, so it gives many false positives.
def get_cola_scores(generated_ideas):
  """
  label every generated idea with the "textattack/bert-base-uncased-CoLA" classifier

  Args:
    generated_ideas: iterable of idea strings

  Returns:
    dict mapping each idea to the index of its highest logit
    (the notebook treats 1 as acceptable and 0 as unacceptable)
  """
  # imported here so the function does not rely on torch leaking in through a star import
  import torch

  results = {}
  tokenizer = AutoTokenizer.from_pretrained("textattack/bert-base-uncased-CoLA")
  model = AutoModelForSequenceClassification.from_pretrained("textattack/bert-base-uncased-CoLA")
  # pure inference: no gradients are needed
  with torch.no_grad():
    for idea in generated_ideas:
      inputs = tokenizer(idea, return_tensors="pt")
      outputs = model(**inputs)
      # softmax is monotonic, so the argmax of the logits is the predicted class
      results[idea] = outputs.logits.argmax(dim=-1).item()
  return results

In [None]:
# label every generated idea with the CoLA classifier; the result maps idea -> predicted class index
cola_results = get_cola_scores(generated_ideas)
# print(cola_results)

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/476 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

{'of the machine learning to identify objects in explicit videos': 0, 'projecting calibration of a machine learning algorithms to predict': 0, 'under time varying economic impact': 0, 'automatic music generation and classification': 1, 'a method for world classification': 1, 'and other risk prediction': 0, 'question meter deep learning algorithms for predicting the outcome of baseball games': 1, 'and other neural networks': 1, 'using personalized architectures to enhance online review learning': 1, 'heart attack prediction in marketing': 1}


In [None]:
# keep only the ideas whose CoLA label is 1
grammatically_correct_ideas = [idea for idea, label in cola_results.items() if label == 1]
grammatically_correct_ideas

['automatic music generation and classification',
 'a method for world classification',
 'question meter deep learning algorithms for predicting the outcome of baseball games',
 'and other neural networks',
 'using personalized architectures to enhance online review learning',
 'heart attack prediction in marketing']

In [None]:
# MAIN IDEA-
# The lower the values of R, P and F, the more dissimilar the generated idea is from the original ideas. We take the max of each value over all original ideas:
# a high max means that the generated idea is very similar to at least one of the original ideas; a low max means that it is dissimilar from all of them.
# We set a threshold of 0.39: if the max R value of a generated idea is lower than the threshold, the idea is considered unique, otherwise it is not.

def get_bert_score(ref_df, generated_ideas, col_name='Project Ideas', threshold=0.39):
  """
    compare every generated idea with the reference ideas (original corpus of data) using bert score;
    bert_score.score returns a tuple of tensors in the order (P, R, F); only the R (recall) values are used here;
    an idea whose highest recall against any reference is < threshold is 'Unique', otherwise it is 'Not Unique'

    Args:
      ref_df: dataframe that holds the reference ideas
      generated_ideas: iterable of generated idea strings
      col_name: column of ref_df that holds the reference ideas
      threshold: recall below this value marks an idea as unique

    Returns:
      dict mapping each generated idea to 'Unique' or 'Not Unique'

    PS: this is slow; every generated idea is scored against the whole reference corpus
  """
  results = {}
  ref_ideas_list = list(ref_df[col_name].values)
  for gen in generated_ideas:
    if not ref_ideas_list:
      # nothing to be similar to
      results[gen] = 'Unique'
      continue
    # score the idea against all references in a single batched call instead of one call per pair;
    # NOTE(review): with idf=True the idf weights are now derived from the whole reference list rather than
    # from a single reference per call - confirm this is the intended weighting
    # for more info visit- "https://github.com/Tiiiger/bert_score"
    _, recall, _ = bert_score.score([gen] * len(ref_ideas_list), ref_ideas_list, model_type='microsoft/deberta-large-mnli', idf=True, lang='en', rescale_with_baseline=True, use_fast_tokenizer=True)
    max_bert_score_gen = recall.max().item()
    if max_bert_score_gen<threshold:
      results[gen] = 'Unique'
    else:
      results[gen] = 'Not Unique'
  return results

In [None]:
def get_bleu_score(sentence, remaining_sentences):
    '''
    word-level BLEU-2, BLEU-3 and BLEU-4 between one sentence and each of the remaining sentences

    sentence - string used as the (single) reference of every comparison
    remaining_sentences - iterable of strings; each one is scored as a hypothesis against sentence

    returns three numpy arrays (bleu2, bleu3, bleu4) with one score per remaining sentence
    '''
    bleu2_lst = []
    bleu3_lst = []
    bleu4_lst = []
    smoothing = SmoothingFunction().method4
    weights = [(1./2., 1./2.),
               (1./3., 1./3., 1./3.),
               (1./4., 1./4., 1./4., 1./4.)]
    # sentence_bleu works on token lists; a raw string would be compared character by character
    reference_tokens = [sentence.split()]
    for other in remaining_sentences:
        # smoothing keeps short sentences without any higher-order n-gram match from collapsing to 0
        bleu = sentence_bleu(reference_tokens, other.split(), weights=weights,
                             smoothing_function=smoothing)
        bleu2_lst.append(bleu[0])
        bleu3_lst.append(bleu[1])
        bleu4_lst.append(bleu[2])
    return np.asarray(bleu2_lst), np.asarray(bleu3_lst), np.asarray(bleu4_lst)

In [None]:
def calculate_generic_Bleu(training_ideas, generated_ideas, threshold = 0.39, col_name='Project Ideas'):
    '''
    flag the generated ideas that overlap too much with any training idea

    training_ideas - iterable of idea strings, or a dataframe whose col_name column holds them
    generated_ideas - iterable of generated idea strings
    threshold - an idea is "Not Unique enough" when any bleu2/bleu3/bleu4 score against a training idea exceeds it
    col_name - column to read the ideas from when training_ideas is a dataframe

    returns a dict mapping each generated idea to "Unique" or "Not Unique enough"
    '''
    # iterating over a dataframe yields its column names, not its rows, so pull the idea strings out explicitly
    if isinstance(training_ideas, pd.DataFrame):
        training_ideas = training_ideas[col_name].tolist()
    else:
        # materialise once so that one-shot iterables can be reused for every generated idea
        training_ideas = list(training_ideas)

    result = {}

    for generated_idea in generated_ideas:

        bleu2, bleu3, bleu4 = get_bleu_score(generated_idea, training_ideas)

        if(np.any(bleu2 > threshold) or np.any(bleu3 > threshold) or np.any(bleu4 > threshold)):
            result[generated_idea] = "Not Unique enough"
        else:
            result[generated_idea] = "Unique"

    return result

In [None]:
# The lower the value of the self-bleu score, the higher the diversity in the generated text.
def calculate_selfBleu(sentences):
    '''
    self-BLEU of a set of generated sentences: every sentence is scored against
    all the other ones and the scores are averaged per n-gram order

    sentences - list of sentences generated by NLG system

    returns the mean bleu2, bleu3 and bleu4 scores (in that order)
    '''
    per_sentence_scores = []
    for current in sentences:
        # compare the current sentence with every sentence except (the first copy of) itself
        others = copy.copy(sentences)
        others.remove(current)
        per_sentence_scores.append(get_bleu_score(current, others))

    # regroup the (bleu2, bleu3, bleu4) triples by n-gram order before averaging
    bleu2_scores = [triple[0] for triple in per_sentence_scores]
    bleu3_scores = [triple[1] for triple in per_sentence_scores]
    bleu4_scores = [triple[2] for triple in per_sentence_scores]
    return np.mean(bleu2_scores), np.mean(bleu3_scores), np.mean(bleu4_scores)

In [None]:
# word_mover_distance
# get the distance of each generated idea wrt the original ideas, take the minimum and apply some threshold on that to determine the dissimilarity 
# (larger the value, more dissimilar it is, smaller the distance value, more similar the generated idea is)

# possible model_to_use values in our case = "word2vec-google-news-300"; "fasttext-wiki-news-subwords-300" and "glove-wiki-gigaword-50"
# more info-"https://github.com/RaRe-Technologies/gensim"
# name of the pretrained embedding to fetch with gensim's downloader
model_to_use="glove-wiki-gigaword-50"
# loaded once at cell execution; get_word_mover_distance reads this global
model_glove = api.load(model_to_use)

def get_word_mover_distance(ref_df, generated_ideas, col_name='Project Ideas', threshold=0.39):
  """
    word mover distance measures the distance required by the words of a candidate text to move, in a word-embedded
    space, to the words of another text; larger the distance more dissimilar the new text is from the original text

    Args:
      ref_df: dataframe that holds the reference (original) ideas
      generated_ideas: iterable of generated idea strings
      col_name: column of ref_df that holds the reference ideas
      threshold: an idea whose smallest distance to any reference is greater than the threshold is considered
        unique (should be changed according to the values you get)

    Returns:
      dict mapping each generated idea to 'Unique' or 'Not Unique enough'
  """
  results = {}
  # wmdistance compares documents given as lists of tokens; a raw string would be walked character by character
  ref_token_lists = [str(ref).lower().split() for ref in ref_df[col_name].values]
  for gen in generated_ideas:
    gen_tokens = str(gen).lower().split()
    # smallest distance to any reference idea; infinite when there is no reference at all
    min_wmd = min((model_glove.wmdistance(gen_tokens, ref_tokens) for ref_tokens in ref_token_lists),
                  default=float('inf'))
    if min_wmd>threshold:
      results[gen] = 'Unique'
    else:
      results[gen] = 'Not Unique enough'
  return results



In [None]:
# BLEU has to be computed against the idea strings: iterating over the dataframe itself
# only yields its column names, which makes every generated idea look unique
bleu_dist_results = calculate_generic_Bleu(ref_df['Project Ideas'].tolist(), grammatically_correct_ideas)
bleu_dist_results

{'automatic music generation and classification': 'Unique',
 'a method for world classification': 'Unique',
 'question meter deep learning algorithms for predicting the outcome of baseball games': 'Unique',
 'and other neural networks': 'Unique',
 'using personalized architectures to enhance online review learning': 'Unique',
 'heart attack prediction in marketing': 'Unique'}

In [None]:
# the bert score call is left disabled (its docstring warns that it is extremely slow)
# bert_score_results = get_bert_score(ref_df, generated_ideas)
# diversity among the grammatically acceptable ideas: calculate_selfBleu returns (bleu2, bleu3, bleu4) means
self_bleu_score_results = calculate_selfBleu(grammatically_correct_ideas)
print('Self Bleu results using bleu2 =', self_bleu_score_results[0])
print('Self Bleu results using bleu3 =', self_bleu_score_results[1])
print('Self Bleu results using bleu4 =', self_bleu_score_results[2])

Self Bleu results using bleu2 = 0.26852679076366764
Self Bleu results using bleu3 = 0.1429564764332921
Self Bleu results using bleu4 = 0.08619883591481749


In [None]:
# uniqueness of the grammatically acceptable ideas w.r.t. the reference ideas, using the default threshold of 0.39
word_mover_dist_results = get_word_mover_distance(ref_df, grammatically_correct_ideas)
word_mover_dist_results

{'automatic music generation and classification': 'Unique',
 'a method for world classification': 'Unique',
 'question meter deep learning algorithms for predicting the outcome of baseball games': 'Unique',
 'and other neural networks': 'Unique',
 'using personalized architectures to enhance online review learning': 'Unique',
 'heart attack prediction in marketing': 'Unique'}