## Data Preprocessing for CC-Span Implementation
https://github.com/lukehb/137-SPM

# Input
1. Read input into list of lists
2. Flatten list of lists
3. Create dictionary of unique strings (tokens) and integers
4. Perform mapping of unique strings using this dictionary
5. Write out .spmf file using -1 and -2 delimiters

In [2]:
# Load "reviews_sample.txt": each line becomes one transaction,
# i.e. a list of the line's space-separated tokens.
# https://stackoverflow.com/questions/18448847/import-txt-file-and-having-each-line-as-a-list
transactions = []
with open('reviews_sample.txt', 'rt') as f:
    transactions.extend(line.strip().split(' ') for line in f)
# sanity check: outer container type, inner container type, number of transactions
print(type(transactions), type(transactions[0]), len(transactions), sep="\n")

<class 'list'>
<class 'list'>
10000


In [11]:
# Collapse the list of token lists into one flat list of tokens.
# https://stackoverflow.com/questions/11264684/flatten-list-of-lists
from itertools import chain
flattened_transactions = [*chain.from_iterable(transactions)]

In [12]:
# total number of tokens across all transactions (display-only cell)
len(flattened_transactions)

612730

In [14]:
# Assign every distinct token an integer id, numbering from 1 in sorted order.
# https://stackoverflow.com/questions/43203215/map-unique-strings-to-integers-in-python
d = {
    token: token_id
    for token_id, token in enumerate(sorted(set(flattened_transactions)), start=1)
}
d

{'000': 1,
 '0002': 2,
 '007': 3,
 '00am': 4,
 '00pm': 5,
 '00s': 6,
 '00service': 7,
 '0123': 8,
 '01am': 9,
 '02pm': 10,
 '05pm': 11,
 '0830': 12,
 '08311': 13,
 '100': 14,
 '1000': 15,
 '1000xs': 16,
 '100cheeses': 17,
 '100lb': 18,
 '100th': 19,
 '100x': 20,
 '101': 21,
 '1030': 22,
 '104': 23,
 '1045': 24,
 '105': 25,
 '106': 26,
 '107': 27,
 '10am': 28,
 '10cent': 29,
 '10ft': 30,
 '10k': 31,
 '10min': 32,
 '10mins': 33,
 '10p': 34,
 '10pm': 35,
 '10th': 36,
 '10x': 37,
 '10x11': 38,
 '10yr': 39,
 '10yrs': 40,
 '110': 41,
 '1100': 42,
 '1106': 43,
 '113': 44,
 '1130': 45,
 '115': 46,
 '1150': 47,
 '116': 48,
 '11a': 49,
 '11am': 50,
 '11pm': 51,
 '11th': 52,
 '120': 53,
 '1200': 54,
 '120dollars': 55,
 '120ea': 56,
 '120lb': 57,
 '1210': 58,
 '1212': 59,
 '122': 60,
 '1230': 61,
 '125': 62,
 '129': 63,
 '12am': 64,
 '12ish': 65,
 '12oz': 66,
 '12pm': 67,
 '12th': 68,
 '130': 69,
 '131': 70,
 '133': 71,
 '134': 72,
 '135': 73,
 '13th': 74,
 '140': 75,
 '144': 76,
 '145': 77,
 '148

In [16]:
# Translate every token of every transaction into its integer id using `d`.
# `d.get` yields None for a token that is missing from the dictionary.
# https://stackoverflow.com/questions/7368789/convert-all-strings-in-a-list-to-int
# https://stackoverflow.com/questions/33078554/mapping-dictionary-value-to-list
new_transactions = [[d.get(token) for token in line] for line in transactions]

In [19]:
# sanity check: report the outer, inner and element types of the mapped data,
# followed by the number of transactions
print(
    type(new_transactions),
    type(new_transactions[0]),
    type(new_transactions[0][0]),
    len(new_transactions),
    sep="\n",
)

<class 'list'>
<class 'list'>
<class 'int'>
10000


In [34]:
# test
# preview the first 100 transactions in spmf format:
# " -1 " between items and " -2" as the ending delimiter of each line
trans_test = new_transactions[:100]
for sequence in trans_test:
    print(*sequence, sep=" -1 ", end=" -2\n")

9325 -1 10138 -1 21272 -1 5994 -1 17187 -1 11265 -1 19778 -1 21931 -1 771 -1 13386 -1 7246 -1 12181 -1 2413 -1 2518 -1 10990 -1 17213 -1 7720 -1 18311 -1 10365 -1 9325 -1 21201 -1 2131 -1 1300 -1 21931 -1 21931 -1 20861 -1 13525 -1 2940 -1 14059 -1 13294 -1 4561 -1 7980 -1 10042 -1 7910 -1 13666 -1 8470 -1 945 -1 18978 -1 16456 -2
6937 -1 7720 -1 19077 -1 5044 -1 17302 -1 12433 -1 11890 -1 11649 -1 20847 -1 18711 -1 8637 -1 14539 -1 18660 -1 20103 -2
21964 -1 14539 -1 11361 -1 5168 -1 13463 -1 21434 -1 18529 -1 959 -1 14598 -1 7249 -1 11733 -1 13525 -1 959 -1 18438 -1 7910 -1 20978 -1 9347 -1 7720 -1 936 -1 5096 -1 18309 -1 9681 -1 4818 -1 15669 -1 8470 -1 1786 -1 18407 -1 14430 -1 20001 -1 21652 -1 14171 -1 1634 -1 6889 -1 21931 -1 17177 -2
7720 -1 8637 -1 2131 -1 19709 -1 21665 -1 21665 -1 17697 -1 7217 -1 21502 -1 3070 -1 2131 -1 14756 -1 936 -1 11265 -1 17143 -1 16765 -1 21665 -1 21665 -1 13026 -1 12554 -1 21427 -1 13026 -1 21587 -1 21665 -1 5713 -1 1300 -1 13002 -1 7198 -1 7937 -1

In [38]:
# for real: write every transaction to disk, one spmf-formatted line each
with open('spmf_reviews_sample.spmf', 'wt') as f:
    f.writelines(
        " -1 ".join(map(str, sequence)) + " -2\n"
        for sequence in new_transactions
    )

Now, use CC-Span CLI tool by running:  
`java -jar spm-0.0.5-fat.jar`

And then the following CLI command:  
`ccspan -i spmf_reviews_sample.spmf -o patterns.spmf -s 100`

# Output

In [48]:
# Read patterns.spmf into a list of lists.
# Each non-empty line becomes a list of whitespace-separated string tokens;
# the last token of a line is the '#SUP:<count>' field.
freq_patterns = []
with open('patterns.spmf', 'rt') as f:
    for line in f:
        # split() with no separator collapses any run of whitespace (e.g. a
        # double space before '#SUP:'), so no empty-string tokens are produced
        tokens = line.split()
        if tokens:  # skip blank lines; they would break the support parsing later
            freq_patterns.append(tokens)  # kept as lists (tuples would also work)
print(type(freq_patterns))
print(type(freq_patterns[0]))
print(len(freq_patterns))

<class 'list'>
<class 'list'>
1040


In [49]:
# display the parsed patterns: each entry is [item id, ..., '#SUP:<count>']
freq_patterns

[['21272', '#SUP:248'],
 ['5994', '#SUP:391'],
 ['17187', '#SUP:286'],
 ['11265', '#SUP:2942'],
 ['21931', '771', '#SUP:170'],
 ['21931', '#SUP:1085'],
 ['771', '#SUP:314'],
 ['13386', '#SUP:660'],
 ['12181', '#SUP:1078'],
 ['10990', '#SUP:592'],
 ['17213', '#SUP:933'],
 ['7720', '8637', '#SUP:163'],
 ['7720', '#SUP:3550'],
 ['7720', '8470', '#SUP:246'],
 ['7720', '17302', '#SUP:118'],
 ['10365', '#SUP:504'],
 ['2131', '#SUP:1721'],
 ['1300', '#SUP:867'],
 ['20861', '#SUP:522'],
 ['13525', '#SUP:1287'],
 ['2940', '#SUP:322'],
 ['13294', '#SUP:131'],
 ['4561', '#SUP:391'],
 ['7980', '#SUP:144'],
 ['10042', '#SUP:212'],
 ['7910', '#SUP:891'],
 ['13666', '#SUP:498'],
 ['8470', '17302', '#SUP:147'],
 ['8470', '#SUP:3868'],
 ['8470', '7720', '#SUP:205'],
 ['8470', '19709', '#SUP:108'],
 ['8470', '14539', '#SUP:109'],
 ['16456', '#SUP:107'],
 ['6937', '#SUP:571'],
 ['5044', '17302', '#SUP:209'],
 ['5044', '#SUP:558'],
 ['17302', '8470', '#SUP:128'],
 ['17302', '#SUP:2443'],
 ['17302', '8637'

In [57]:
# Peek at how to pull the support count out of a '#SUP:<count>' token.
# NOTE: index [1][1] is the support token only because pattern 1 has a single
# item; in general the support token is the last element of a pattern.
freq_patterns[1][1].split(":",1)[1] # support

'391'

In [50]:
# invert our dictionary (it was 1 to 1)
inv_dict = {v: k for k, v in d.items()}

In [51]:
# display the inverted dictionary (integer id -> token)
inv_dict

{1: '000',
 2: '0002',
 3: '007',
 4: '00am',
 5: '00pm',
 6: '00s',
 7: '00service',
 8: '0123',
 9: '01am',
 10: '02pm',
 11: '05pm',
 12: '0830',
 13: '08311',
 14: '100',
 15: '1000',
 16: '1000xs',
 17: '100cheeses',
 18: '100lb',
 19: '100th',
 20: '100x',
 21: '101',
 22: '1030',
 23: '104',
 24: '1045',
 25: '105',
 26: '106',
 27: '107',
 28: '10am',
 29: '10cent',
 30: '10ft',
 31: '10k',
 32: '10min',
 33: '10mins',
 34: '10p',
 35: '10pm',
 36: '10th',
 37: '10x',
 38: '10x11',
 39: '10yr',
 40: '10yrs',
 41: '110',
 42: '1100',
 43: '1106',
 44: '113',
 45: '1130',
 46: '115',
 47: '1150',
 48: '116',
 49: '11a',
 50: '11am',
 51: '11pm',
 52: '11th',
 53: '120',
 54: '1200',
 55: '120dollars',
 56: '120ea',
 57: '120lb',
 58: '1210',
 59: '1212',
 60: '122',
 61: '1230',
 62: '125',
 63: '129',
 64: '12am',
 65: '12ish',
 66: '12oz',
 67: '12pm',
 68: '12th',
 69: '130',
 70: '131',
 71: '133',
 72: '134',
 73: '135',
 74: '13th',
 75: '140',
 76: '144',
 77: '145',
 78: 

In [77]:
# final mapping: turn each mined pattern back into tokens plus its support
final_list = []
for pat in freq_patterns:
    # https://stackoverflow.com/questions/914715/how-to-loop-through-all-but-the-last-item-of-a-list
    # every element except the last is an item id stored as a string: convert it
    # to int and look it up in the inverted dictionary (None if the id is unknown)
    decoded = [inv_dict.get(int(item_id)) for item_id in pat[:-1]]
    # the last element looks like '#SUP:<count>'; keep only the part after the colon
    support = pat[-1].split(":", 1)[1]
    final_list.append(decoded + [support])


        

        
        

In [80]:
# display the decoded patterns: each entry is [token, ..., support]
final_list

[['walking', '248'],
 ['doe', '391'],
 ['seem', '286'],
 ['like', '2942'],
 ['year', 'ago', '170'],
 ['year', '1085'],
 ['ago', '314'],
 ['old', '660'],
 ['menu', '1078'],
 ['large', '592'],
 ['selection', '933'],
 ['food', 'great', '163'],
 ['food', '3550'],
 ['food', 'good', '246'],
 ['food', 'service', '118'],
 ['italian', '504'],
 ['best', '1721'],
 ['area', '867'],
 ['usually', '522'],
 ['order', '1287'],
 ['burger', '322'],
 ['obviously', '131'],
 ['cooked', '391'],
 ['frozen', '144'],
 ['ingredient', '212'],
 ['fresh', '891'],
 ['overall', '498'],
 ['good', 'service', '147'],
 ['good', '3868'],
 ['good', 'food', '205'],
 ['good', 'thing', '108'],
 ['good', 'place', '109'],
 ['road', '107'],
 ['excellent', '571'],
 ['customer', 'service', '209'],
 ['customer', '558'],
 ['service', 'good', '128'],
 ['service', '2443'],
 ['service', 'great', '135'],
 ['miss', '234'],
 ['machine', '102'],
 ['used', '527'],
 ['still', '976'],
 ['great', 'place', '273'],
 ['great', '3080'],
 ['great',

## Write Output File

In [88]:
# preview the output format "<support>:<token>;<token>..."
# (each pattern is followed by an empty line)
for line in final_list:
    print(":".join((line[-1], ";".join(map(str, line[:-1])))), end="\n\n")
    

248:walking

391:doe

286:seem

2942:like

170:year;ago

1085:year

314:ago

660:old

1078:menu

592:large

933:selection

163:food;great

3550:food

246:food;good

118:food;service

504:italian

1721:best

867:area

522:usually

1287:order

322:burger

131:obviously

391:cooked

144:frozen

212:ingredient

891:fresh

498:overall

147:good;service

3868:good

205:good;food

108:good;thing

109:good;place

107:road

571:excellent

209:customer;service

558:customer

128:service;good

2443:service

135:service;great

234:miss

102:machine

527:used

976:still

273:great;place

3080:great

153:great;service

151:great;food

104:place;like

4041:place

131:place;get

345:yes

1438:little

112:opened

327:weekend

154:staff;friendly

1282:staff

1475:always

197:pleasant

312:fast

211:make;sure

1481:make

454:spot

264:veggie

1985:also

746:special

220:ice;cream

294:ice

404:cream

274:really;good

2135:really

111:really;nice

143:split

223:topping

1045:ever

818:see

1343:thing

21

In [89]:
# write one "<support>:<token>;<token>..." line per pattern to patterns.txt
with open('patterns.txt', 'wt') as f:
    f.writelines(
        ":".join((line[-1], ";".join(map(str, line[:-1])))) + "\n"
        for line in final_list
    )