# topic-grid

Create a grid that represents the fit between each natural-language client question and each research document. The fit can be quantified using any method we choose, provided it is tested and robust.

In [14]:
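# Initialize by running the shared-functions notebook. It is expected to bring `random`,
# `constrained_sum_sample_pos` and `flatten_list` into this namespace -- none of them is
# imported or defined in this notebook.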
%run include-2-shared-functions.ipynb

  if LooseVersion(mpl.__version__) >= "3.0":
  other = LooseVersion(other)


['Solarize_Light2', '_classic_test_patch', '_mpl-gallery', '_mpl-gallery-nogrid', 'bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn', 'seaborn-bright', 'seaborn-colorblind', 'seaborn-dark', 'seaborn-dark-palette', 'seaborn-darkgrid', 'seaborn-deep', 'seaborn-muted', 'seaborn-notebook', 'seaborn-paper', 'seaborn-pastel', 'seaborn-poster', 'seaborn-talk', 'seaborn-ticks', 'seaborn-white', 'seaborn-whitegrid', 'tableau-colorblind10']


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


  publish_display_data(data, metadata, source, transient=transient, **kwargs)


In [36]:
# Set the grid dimensions: the number of questions (rows) and the number of
# research documents (columns). These are the defaults for the num_q and
# num_docs parameters of generate_grid.
NUM_QUESTIONS = 5
NUM_DOCS = 10

In [37]:
# Set up the fit bands. Each key maps to a [lower, upper] pair of bounds that
# generate_grid passes to random.uniform when drawing a value for that band.
# Adjacent bands share an endpoint (0.25 and 0.65).
dict_ranges = {'low': [0,0.25], 'med': [0.25,0.65], 'high': [0.65, 1]}

In [63]:
def generate_grid(ranges=dict_ranges, num_q=NUM_QUESTIONS, num_docs=NUM_DOCS):
    '''
    Generate a random topic grid of document-to-question fit values.

    Each row of the grid stands for a question and each column for a document.
    A cell is a measure of how well that document "fits" the question -- when
    the fit is high, the document answers the question well. Fit can be
    quantified in any way you choose; here the values are simulated.

    Rows are random but deliberately not uniformly random: for every row the
    num_docs cells are first split at random between the bands in ranges, then
    each cell is drawn with random.uniform from its band and rounded to
    2 decimals, and finally the row is shuffled.

    ranges maps a band label to a [lower, upper] pair of bounds. Only the
        values are used, in the dict's iteration order. With the default
        bands every generated value lies between 0 and 1.
    num_q specifies the number of questions -- the number of rows in the grid.
    num_docs specifies the number of documents -- the number of columns in the grid.

    Returns a list of num_q rows, each row being a list of floats.

    USES constrained_sum_sample_pos and flatten_list in include-2-shared-functions.
    Draws from the global `random` module state, so call random.seed(...) first
    if a reproducible grid is needed.
    '''
    # The band bounds do not change between rows, so materialise them once
    # instead of rebuilding list(ranges.values()) for every generated cell group.
    bounds = list(ranges.values())
    num_ranges = len(bounds)

    grid_lists = []
    for _ in range(num_q):
        # Randomise how many low, medium, high, ... values this row gets.
        # NOTE(review): constrained_sum_sample_pos is defined in another notebook;
        # it is presumed to return num_ranges positive integers summing to num_docs.
        nums_in_range = constrained_sum_sample_pos(num_ranges, num_docs)

        partial_rows = []
        for j, count in enumerate(nums_in_range):
            low, high = bounds[j][0], bounds[j][1]
            partial_rows.append(
                [round(random.uniform(low, high), 2) for _ in range(count)]
            )

        row = flatten_list(partial_rows)
        # The values were generated band by band; shuffle so their order within
        # the row is unpredictable.
        random.shuffle(row)
        grid_lists.append(row)

    return grid_lists
    

In [64]:
# Smoke test with the defaults: expect NUM_QUESTIONS rows of NUM_DOCS values each.
generate_grid()

[[0.07, 0.22, 0.12, 0.16, 0.23, 0.03, 0.02, 0.2, 0.41, 0.77],
 [0.16, 0.51, 0.31, 0.98, 0.21, 0.91, 0.43, 0.41, 0.39, 0.39],
 [0.65, 0.29, 0.98, 0.44, 0.65, 0.12, 0.6, 0.5, 0.27, 0.57],
 [0.77, 0.13, 0.21, 0.14, 0.65, 0.11, 0.68, 0.8, 0.78, 0.87],
 [0.28, 0.94, 0.43, 0.7, 0.2, 0.25, 0.75, 0.05, 0.97, 0.62]]

In [None]:
# TODO: For each row (question), calculate the percentage of documents whose fit is higher than a threshold value F.
## Then plot the questions on the x axis against the percentage of documents covering each question on the y axis.

## ROUGH WORK

In [8]:
# First attempt at generating a random list.
# NOTE: random.choices samples with replacement from the population [0, 1], so this
# produces k=12 integers that are each 0 or 1 -- not reals between 0 and 1.
## random.choices works in Python 3.6 and greater
random.choices([0,1], weights=None,cum_weights=None, k=12)

[1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1]

In [59]:
# random.random() returns a float in [0.0, 1.0), so this builds a sample row of
# five uniformly random fit values.
fit_row = [random.random() for i in range(5)]
fit_row

[0.125294475321169,
 0.4115625129619517,
 0.6828747630165201,
 0.14136476335652293,
 0.6256702823151284]

In [60]:
# random.shuffle reorders the list in place and returns None, hence the separate
# fit_row line to display the result. Re-running this cell reshuffles fit_row again.
random.shuffle(fit_row)
fit_row

[0.125294475321169,
 0.6828747630165201,
 0.6256702823151284,
 0.14136476335652293,
 0.4115625129619517]

In [22]:
# Draw one value between 0 and 0.25 (the 'low' band in dict_ranges), rounded to 2 decimals.
round(random.uniform(0,.25), 2)

0.11

In [51]:
# Check the shared helper: the output below is 4 positive integers that sum to 100.
constrained_sum_sample_pos(4, 100)

[18, 29, 32, 21]

In [24]:
# Number of bands in dict_ranges; generate_grid passes this count to constrained_sum_sample_pos.
len(dict_ranges)

3

In [31]:
# dict.values() returns a view that cannot be indexed, so wrap it in list();
# index 0 is the first band in insertion order ('low').
list(dict_ranges.values())[0]

[0, 0.25]

In [52]:
# Check the shared helper: the output below shows the per-band lists merged into
# one flat list with the original order preserved.
flatten_list([[0.22, 0.12, 0.17, 0.07, 0.15, 0.13, 0.16], [0.44], [0.95, 0.83]])

[0.22, 0.12, 0.17, 0.07, 0.15, 0.13, 0.16, 0.44, 0.95, 0.83]