<a href="https://colab.research.google.com/github/katrina906/CS6120-Summarization-Project/blob/main/extractive_summarization_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Train Extractive Summarization Model 
1. Loop through every possible configuration for each model and train it on a subset of the data.
2. Select the best configuration for each model (TF-IDF baseline, TextRank, LSA) and each evaluation metric (F-measure, precision, and recall, each with and without averaging the unigram and bigram scores).
   - 18 best configurations in total (3 models × 6 metrics)
3. Train the best configurations on the full data.

This notebook is the driver: it calls the functions defined in the `extractive_summarization` notebook.

In [None]:
%%capture
# Install the third-party packages this notebook imports; %%capture hides pip's output.
# The installs run one after another, in the order written.
!pip install rouge-score
!pip install fasttext
!pip install compress-fasttext
# gensim is pinned to an exact version (3.8.3).
!pip install gensim==3.8.3
# import-ipynb is imported below before `from extractive_summarization import *`
# (presumably it is what lets that notebook be imported as a module — confirm).
!pip install import-ipynb

In [None]:
# Mount Google Drive at /content/drive (Colab only); later cells read from and
# write to paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#!unzip "/content/drive/MyDrive/data/glove*.zip"

In [None]:
import os
import pandas as pd
import numpy as np
import pickle
import string
import re
import sys
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from collections import Counter, OrderedDict
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
import networkx as nx
from rouge_score import rouge_scorer
import gensim
import fasttext
from gensim.models import FastText
import compress_fasttext
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords  
import import_ipynb

# Download the NLTK stop-word corpus, then build the English stop-word set that
# main() passes to train_config_loop.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))  
# Download the WordNet data (presumably needed by WordNetLemmatizer in the
# imported notebook — confirm).
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# load in functions from extract_summarization notebook
%cd "drive/MyDrive/Colab Notebooks"
from extractive_summarization import *
%cd ..

[Errno 2] No such file or directory: 'drive/MyDrive/Colab Notebooks'
/content/drive/My Drive
/content/drive


### Configurations

In [None]:
# Option grids for each model family. Every inner list is one configuration
# axis; a single configuration picks exactly one value from each axis.
CONFIGURATIONS_BOW = [
    ['textrank'],                                        # model
    ['nostop', 'stopwords'],                             # stop-word option
    ['no_stemlemma', 'lemma', 'stem'],                   # stemming / lemmatization option
    ['bow'],                                             # representation
    ['counts', 'binary'],                                # vector values
    ['no_normalization', 'tf', 'tfidf'],                 # normalization
    ['unigram', 'bigram', 'trigram', 'all'],             # n-gram option
    ['cosine', 'hamming', 'jaccard'],                    # distance
    ['num_sentences', 'num_words_lt', 'num_words_gt'],   # presumably the summary-length rule
]
CONFIGURATIONS_EMBEDDINGS = [
    ['textrank'],
    ['nostop', 'stopwords'],
    ['no_stemlemma', 'lemma', 'stem'],
    ['embedding'],
    ['glove', 'fasttext'],
    ['cosine'],
    ['num_sentences', 'num_words_lt', 'num_words_gt'],
]
# True baseline: no custom text cleaning options.
# Stop word removal is unnecessary with tfidf.
CONFIGURATIONS_BASELINE = [
    ['baseline'],
    ['num_sentences', 'num_words_lt', 'num_words_gt'],
]
CONFIGURATIONS_LSA = [
    ['lsa'],
    ['nostop', 'stopwords'],
    ['no_stemlemma', 'lemma', 'stem'],
    ['bow'],
    ['counts', 'binary'],
    ['no_normalization', 'tf', 'tfidf'],
    ['unigram', 'bigram', 'trigram', 'all'],
    ['num_sentences', 'num_words_lt', 'num_words_gt'],
]


def _expand(option_axes):
  """Return every combination of one value per axis, as a list of tuples."""
  return list(itertools.product(*option_axes))


def _distance_matches_representation(config):
  """Accept a configuration unless it pairs jaccard/hamming with a non-binary vector."""
  if 'jaccard' in config or 'hamming' in config:
    return 'binary' in config and 'no_normalization' in config
  return True


# Cross product of the axes for each model; textrank covers both the
# bag-of-words grid and the embeddings grid, in that order.
model_configurations = {
    'textrank': _expand(CONFIGURATIONS_BOW) + _expand(CONFIGURATIONS_EMBEDDINGS),
    'baseline': _expand(CONFIGURATIONS_BASELINE),
    'lsa': _expand(CONFIGURATIONS_LSA),
}

# textrank: jaccard and hamming distance are only kept together with an
# un-normalized binary vector representation
model_configurations['textrank'] = list(filter(_distance_matches_representation, model_configurations['textrank']))

### Main Function

In [None]:
def main(n_docs = 10000, n_search_docs = 1000, output_dir = '/content/drive/MyDrive/data'):
  """Search configurations for every model, then train the best ones on the full data.

  For each model ('baseline', 'textrank', 'lsa'):
    1. run train_config_loop with eval_only = True over every configuration in
       model_configurations[model], using the first n_search_docs documents,
    2. pick the best configuration per evaluation metric with find_best_configs,
    3. run train_config_loop once on all documents for each distinct best
       configuration,
    4. pickle [seen_metrics, eval_results_dict, model_results_dict, best_configs]
       to <output_dir>/trained_model_<model>.pkl after every metric.

  Args:
    n_docs: number of documents requested from data_setup.
    n_search_docs: number of leading documents used for the configuration search.
    output_dir: directory that receives the per-model pickle files.

  Returns:
    None. Results are only written to disk.
  """
  # load data & basic data cleaning
  df = data_setup(n = n_docs)
  # train tf-idf on full dataset
  tfidf, feature_dict = corpus_tfidf(df)
  # load pretrained word embeddings (the configurations reference 'glove' and 'fasttext')
  embeddings = load_embeddings()

  for model in ['baseline', 'textrank', 'lsa']:
    # run every configuration on a subset of the data and collect evaluation metrics only
    # NOTE(review): filename has no model suffix while start_from does — confirm this
    # matches how train_config_loop names and reloads its checkpoints
    search_results, _ = train_config_loop(df.head(n_search_docs), tfidf, feature_dict, embeddings, stop_words, model_configurations[model], eval_only = True,
                                          save_every_cnt = 50, filename = 'train_config_loop', start_from = 'train_config_loop_' + model)

    # find best config for each evaluation metric
    best_configs = find_best_configs(search_results)

    # train full model on the best configurations for each metric
    print('training best models')
    eval_results_dict = {}   # metric -> evaluation results of its best config, for that metric
    model_results_dict = {}  # metric -> model results (data with predicted summaries) of its best config
    trained_configs = {}     # config -> (evaluation results for all metrics, model results)
    seen_metrics = []
    for metric in best_configs.keys():
      # best_configs stores each configuration as the string form of a tuple; rebuild the tuple
      config = tuple(best_configs[metric].strip('(').strip(')').replace("'", "").split(', '))
      # prevent duplicative retraining: train each distinct configuration only once
      if config not in trained_configs:
        full_eval, full_models = train_config_loop(df, tfidf, feature_dict, embeddings, stop_words, [config], eval_only = False)
        trained_configs[config] = (full_eval[str(config)], full_models[str(config)])
      config_eval, config_models = trained_configs[config]
      # always index by the current metric, so a configuration shared by several
      # metrics does not hand one metric's values to another
      eval_results_dict[metric] = config_eval[metric]
      model_results_dict[metric] = config_models
      seen_metrics.append(metric)
      # save best models
      # save every iteration overwriting
      # if need to restart, load in dictionaries, go through best_configs.keys() but not in seen_metrics, continue adding to dictionaries
      with open(os.path.join(output_dir, 'trained_model_' + model + '.pkl'), 'wb') as f:
        pickle.dump([seen_metrics, eval_results_dict, model_results_dict, best_configs], f)

In [None]:
#with open('/content/drive/MyDrive/data/trained_model_' + 'baseline' + '.pkl', 'rb') as f: 
#  x = pickle.load(f)

In [1]:
# Run the full pipeline for every model: configuration search, best-config
# selection, and final training; results are pickled by main().
main()