<a href="https://colab.research.google.com/github/jgamblin/GPT2-CVE/blob/main/GPT2_CVEs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GPT2 CVE Description Generation

## Install GPT-2 and Download CVE Data

In [None]:
%%capture
!pip -qq install gpt2
!pip install -q gpt-2-simple 
!mkdir -p jsondata
%cd jsondata
!rm *.json 
!rm *.zip 
!wget https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-{2002..2022}.json.zip 
!unzip -o "*.zip" 

## Load Libraries

In [None]:
import glob
import json
import logging
import os
import warnings
from datetime import datetime

import gpt_2_simple as gpt2
import numpy as np
import pandas as pd
from IPython.core.magic import register_cell_magic
from IPython.display import Markdown

@register_cell_magic
def markdown(line, cell):
    """Cell magic that renders the cell body as Markdown.

    The cell text is treated as a ``str.format`` template and is filled in
    with this module's global names, so a ``{name}`` placeholder is replaced
    by the current value of the global ``name`` before rendering.

    Args:
        line: Text following the magic name on its own line; not used.
        cell: The cell body, used as the format template.

    Returns:
        An ``IPython.display.Markdown`` object wrapping the formatted text.
    """
    namespace = globals()
    rendered = cell.format(**namespace)
    return Markdown(rendered)


# Quiet the notebook: disable matplotlib's font-manager logger and ignore
# every Python warning for the remainder of the session.
logging.getLogger('matplotlib.font_manager').disabled = True
warnings.simplefilter("ignore")

# pandas display limits used when DataFrames are rendered in cell output.
pd.options.display.width = 500
pd.options.display.max_rows = 50
pd.options.display.max_columns = 10


## Build CVE Description File


In [None]:
def collect_cve_descriptions(pattern):
    """Collect CVE description rows from every NVD JSON feed matching ``pattern``.

    Each matching file is parsed as JSON and the first ``description_data``
    value of every item under ``CVE_Items`` is taken as that item's
    description. Items whose description list is empty get ``''``.

    Args:
        pattern: Glob pattern selecting the feed files to read.

    Returns:
        A list of ``{'Description': str}`` dicts, in sorted-filename order and
        then in file order. Entries whose description starts with ``'**'`` are
        left out.
    """
    rows = []
    # glob returns paths in arbitrary order; sorting keeps the output file
    # reproducible from run to run.
    for filename in sorted(glob.glob(pattern)):
        with open(filename, 'r', encoding='utf-8') as f:
            nvd_data = json.load(f)
        for entry in nvd_data['CVE_Items']:
            try:
                description = entry['cve']['description']['description_data'][0]['value']
            except IndexError:
                description = ''
            if not description.startswith('**'): # disputed, rejected and other non issues start with '**'
                rows.append({'Description': description})
    return rows


row_accumulator = collect_cve_descriptions('nvdcve-1.1-20*.json')
# Build the frame once, after all files are read. Pinning the column keeps
# `nvd` well-formed (empty, with a 'Description' column) even when no feed
# file was found, instead of leaving it undefined.
nvd = pd.DataFrame(row_accumulator, columns=['Description'])

# NOTE(review): to_csv writes a 'Description' header line and applies CSV
# quoting, so cves.txt is not purely one raw description per line; confirm
# this is the intended training-file format.
nvd.to_csv('cves.txt', index=False)
nvd

## Download and Finetune Model 

In [None]:
# Download the pretrained 124M GPT-2 checkpoint only if its directory is not
# already present, so a Restart & Run All does not fetch it again.
# download_gpt2 is called with its default model directory, assumed here to be
# ./models/<model_name> — TODO confirm against the gpt-2-simple docs.
# NOTE(review): an interrupted earlier download leaves the directory in place
# and would be skipped by this check; delete models/124M to force a re-fetch.
if not os.path.isdir(os.path.join("models", "124M")):
    gpt2.download_gpt2(model_name="124M")

In [None]:
# Session object shared by the fine-tuning call below and the generation
# cells further down.
sess = gpt2.start_tf_sess()

# Training text written by the "Build CVE Description File" step.
file_name = "cves.txt"

# Fine-tune the 124M model on the CVE descriptions for 500 steps under the
# run name 'run1', printing progress every 100 steps and sampling/saving at
# step 500.
gpt2.finetune(
    sess,
    dataset=file_name,
    model_name='124M',
    run_name='run1',
    restore_from='fresh',
    steps=500,
    print_every=100,
    sample_every=500,
    save_every=500,
)

## Generate 10 Examples

In [None]:
# Generate 10 samples one at a time, with no prompt prefix, a length of 125
# and temperature 0.80; "." is passed as the truncate marker.
gpt2.generate(
    sess,
    nsamples=10,
    batch_size=1,
    length=125,
    temperature=0.80,
    prefix="",
    truncate=".",
)

## Generate File With 100 Examples

In [None]:
# Same sampling settings as the interactive generation cell, but 100 samples
# are requested and sent to a file; no destination path is given, so the
# library's default output location is used.
gpt2.generate_to_file(
    sess,
    nsamples=100,
    batch_size=1,
    length=125,
    temperature=0.80,
    prefix="",
    truncate=".",
)