In [None]:
# Input: CSV file holding the raw word list.
rawdatacsv = "raw-word-data.csv"

# Outputs: shuffled card data.
# NOTE(review): the .xls targets are written with DataFrame.to_excel elsewhere
# in this notebook; legacy .xls writing depends on the installed pandas
# version/engine - confirm it still works or consider .xlsx.
shuffledcsv = "shuffled_out_new.csv"
shuffledxls = "shuffled_out_new.xls"
shuffledxls_pair_merge = "shuffled_out_new_pair_merge.xls"  # <---- new data set to use

In [None]:
# Instead of going through all n words and then shuffling (which results in n cards),
# treat the words as a bag of words: once a word is removed from the bag, it stays removed.

import csv
import random
import time
import pandas
import numpy as np
# Load the raw words and keep a single entry per distinct concept.
df = pandas.read_csv(rawdatacsv)
uniq = df["Concepts"].unique()

# Randomise the order of the distinct concepts (shuffles `uniq` in place).
np.random.shuffle(uniq)


In [None]:
# Is len(uniq) divisible by the card size (4)? If not, pad with random words taken from the first part of the list.


In [None]:
def random_unique(data, duplicates=None):
    """ Returns a random item from data that is not in the list of duplicates

    Parameters
    ----------
    data : np.array or list
        data is the list of data to pull from
    duplicates : list, optional
        duplicates is what the random output must not match

    Returns
    -------
    choice
        an item of data (a string for the word lists used here)

    Raises
    ------
    ValueError
        if every item of data is excluded by duplicates, i.e. no valid
        choice exists. (Previously this case recursed without bound.)
    """
    # Normalise to a list so the emptiness test and `in` checks also behave
    # for numpy arrays, whose truth value is ambiguous.
    excluded = [] if duplicates is None else list(duplicates)

    # Check up front that at least one candidate can be returned; otherwise
    # the rejection loop below would never terminate. An empty `data` is left
    # to random.choice, which raises its usual IndexError.
    if len(data) and all(item in excluded for item in data):
        raise ValueError("every item in data is excluded by duplicates")

    # Rejection sampling: keep drawing until the pick is not excluded.
    choice = random.choice(data)
    while choice in excluded:
        choice = random.choice(data)
    return choice

In [None]:
def pad_data(data, card_size):
    """ Based on the card size will pad the input data so that all cards are full.
    e.g. 10 words and 4 words per card means 2 words padding and 12 words returned.

    If len(data) is already a multiple of card_size the input is returned
    as-is (the same object, not a copy). Otherwise the missing words are
    picked at random, without repeating a pick, from the words that sit on
    complete cards, so the padded last card does not repeat its own words.
    If there is not even one complete card, the picks come from all of data.

    Parameters
    ----------
    data : np.array or list
        data is the list of data to pull from
    card_size : int
        how many words per card.

    Returns
    -------
    padded_data
        the input itself when no padding is needed, otherwise a np.array
        with the original data followed by the random padding words

    Raises
    ------
    ValueError
        if there are fewer distinct candidate words than padding slots.
    """
    remainder = len(data) % card_size
    if remainder == 0:
        return data

    needed = card_size - remainder
    # Index one past the last word that belongs to a complete card.
    full_cards_end = len(data) - remainder

    # Candidates exclude the trailing partial card. When there is no complete
    # card at all (len(data) < card_size) fall back to the whole input rather
    # than slicing with a zero/negative end index.
    candidates = data[:full_cards_end] if full_cards_end > 0 else data

    # random_unique never repeats a pick, so it needs enough distinct words.
    if len(set(candidates)) < needed:
        raise ValueError(
            "not enough distinct words to pad the last card: need %d, have %d"
            % (needed, len(set(candidates)))
        )

    padding = []
    for _ in range(needed):
        padding.append(random_unique(candidates, padding))
    return np.append(data, padding)
    

In [None]:
# shuffle, pad, repeat by until max, drop dups and return
def generate_cards(data, card_size, max_loop, column_names=['word1', 'word2', 'word3', 'word4']):
    """ Generate a set of data for creating cards in MS Word mail merge.

    The words are shuffled and padded to a whole number of cards max_loop
    times; every pass contributes one set of cards. The caller's data is not
    modified: shuffling happens on a private copy.

    Parameters
    ----------
    data : np.array or list
        data is the list of data to pull from
    card_size : int
        how many words per card.
    max_loop : int
        how many times to loop over the data sets and create more random word combinations
    column_names : list
        the names of columns - these are column heading for the dataframe and needed later for mailmerge.
        Must contain exactly card_size names. (The default list is never mutated.)

    Returns
    -------
    dataframe
        a dataframe with card_size columns of shuffled words labelled as per column_names. dup rows removed.
        Empty (columns only) when max_loop is zero or negative.

    Raises
    ------
    ValueError
        if the number of column names does not match card_size.
    """
    if len(column_names) != card_size:
        raise ValueError(
            "column_names has %d entries but card_size is %d"
            % (len(column_names), card_size)
        )
    if max_loop <= 0:
        # Nothing to generate; np.concatenate would reject an empty list.
        return pandas.DataFrame(columns=list(column_names))

    # Private copy so the in-place shuffles below do not reorder the
    # caller's array.
    working = np.array(data)

    list_of_datasets = []
    for _ in range(max_loop):
        # np.random.shuffle reorders in place and draws from NumPy's own
        # generator, so reseeding the stdlib `random` module per pass is not
        # needed to obtain a new ordering.
        np.random.shuffle(working)
        # Snapshot each pass: pad_data may hand back `working` itself when no
        # padding is needed, and a later shuffle would otherwise rewrite
        # every stored pass into the same ordering.
        list_of_datasets.append(np.array(pad_data(working, card_size)))

    data_set = np.concatenate(list_of_datasets)
    # One row per card; every pass is a whole number of cards after padding.
    cards = data_set.reshape(-1, card_size)
    return pandas.DataFrame(cards, columns=column_names).drop_duplicates()


In [None]:
# Build 4-word cards from 4 shuffled passes over the word list and label the
# row index for the export.
fdf = generate_cards(uniq, 4, 4).rename_axis('index')

# Only the first 100 cards are written to the spreadsheet.
fdf.iloc[:100].to_excel(shuffledxls)
fdf

In [None]:
# Generate as if there were 2 words per card, twice, and then combine the two
# halves side by side. The intent is that words are found in both the
# discussion and drawing parts of the card.
one_df = generate_cards(uniq, 2, 2, column_names=['word1', 'word2'])
two_df = generate_cards(uniq, 2, 2, column_names=['word3', 'word4'])

# Pair the halves by row index (rows are matched on equal index labels), then
# label the index for the export.
merged_df = (
    one_df
    .merge(two_df, left_index=True, right_index=True)
    .rename_axis('index')
)

# Only the first 100 merged cards are written to the spreadsheet.
merged_df.iloc[:100].to_excel(shuffledxls_pair_merge)


In [None]:
# Plain-text preview of the first 100 merged cards.
print(merged_df.iloc[:100])

In [None]:
    
# Noting a quality issue with the shuffle: some words are repeated too many times. Most words are seen 4 times, but some are seen only 3 times and some 5 times.
# Better stats could be produced, but this is OK for now.
# The main issue is words that appear 4 times in the same position, and therefore only in the drawing part or only in the discussion part of the card.
# Using a different random seed seems to help, but the issue remains.

# Consider generating the data as if for 2-word cards and then combining them.

In [None]:
# Check stats when using a 2 card merge: for every word, print how often it
# appears in each column of the first 100 cards, followed by its total.
firsthundred = merged_df.iloc[:100]
# Materialise each column once instead of once per word.
column_words = [list(firsthundred[name]) for name in ('word1', 'word2', 'word3', 'word4')]
for key in uniq:
    w1, w2, w3, w4 = [words.count(key) for words in column_words]
    print(key, w1, w2, w3, w4, w1 + w2 + w3 + w4)

In [None]:
# Check stats when using 4 words per card: for every word, print how often it
# appears in each column of the first 100 cards, followed by its total.
firsthundred = fdf.iloc[:100]
# Materialise each column once instead of once per word.
column_words = [list(firsthundred[name]) for name in ('word1', 'word2', 'word3', 'word4')]
for key in uniq:
    w1, w2, w3, w4 = [words.count(key) for words in column_words]
    print(key, w1, w2, w3, w4, w1 + w2 + w3 + w4)
    

