In [1]:
import sys
import os

In [2]:
import datetime

In [3]:
import numpy
import scipy
import pandas

In [4]:
sys.path.append(os.path.join(os.path.abspath(os.path.join('../..')), 'src'))

In [5]:
import utils
import mysql_utils
import events_merge

In [6]:
from importlib import reload

## Process Outline

1. Get List of Dates of interest
2. For each (start, stop) pair:
  1. Query feed entries from DB corresponding to range
  2. Generate Graph / Network for time slice:
    1. Find stories about same event; give maximal edge weight
    2. Calculate similarities between all stories; set edges with weights proprotional to similarity
3. Merge slices to create a single network
  1. Give idential stories in adjacent slices an edge with maximal weight

## Create Sequence of dates of interest

In [7]:
base = datetime.datetime.strptime('2017-02-01 00:00:00', '%Y-%m-%d %H:%M:%S')

In [8]:
date_list = [base + datetime.timedelta(hours=x) for x in range(0, 24*7, 6)]

In [86]:
date_list[:4]

[datetime.datetime(2017, 2, 1, 0, 0),
 datetime.datetime(2017, 2, 1, 6, 0),
 datetime.datetime(2017, 2, 1, 12, 0),
 datetime.datetime(2017, 2, 1, 18, 0)]

## Example on 1 Slice

### 01: Query & Clean Data

In [91]:
cnx = mysql_utils.getCnx()

In [92]:
cur = mysql_utils.getCur(cnx)

In [12]:
i = 0
docs = mysql_utils.query_docs_by_datetime(cursor=cur, 
                                          start_dt=date_list[i], 
                                          end_dt=date_list[i + 4])

In [13]:
# Filter out duplicates?
unique_entries = []
titles = set()
for i in docs.index:
    if docs.ix[i].title not in titles:
        unique_entries.append(i)
        titles.update([docs.ix[i].title])

docs = docs.ix[unique_entries]
docs.index = range(docs.shape[0])

In [14]:
docs.shape

(285, 5)

### 02: Calculate Title, Summary Similarities

In [15]:
docid_t, title_scores = events_merge.get_doc_featurevecs(docs, features=['title'])

In [16]:
docid_s, summary_scores = events_merge.get_doc_featurevecs(docs)

In [17]:
summary_scores.shape

(285, 285)

### 03: Build Network Slice

In [18]:
pandas.Series(title_scores.flatten()).describe()

count    81225.000000
mean         0.003731
std          0.026811
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
dtype: float64

In [19]:
pandas.Series(summary_scores.flatten()).describe()

count    81225.000000
mean         0.014200
std          0.033709
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
dtype: float64

In [20]:
title_cutoff = 0.5
summary_cutoff = 0.1

In [21]:
# Find where Score is greater than threshold cutoff
hits_title = numpy.where(title_scores > title_cutoff)
hits_summary = numpy.where(summary_scores > summary_cutoff)

In [22]:
ij = (numpy.hstack([hits_title[0], hits_summary[0]]),
      numpy.hstack([hits_title[1], hits_summary[1]]))

In [23]:
data = numpy.ones((len(ij[0]),))

In [24]:
time_slice = scipy.sparse.coo_matrix((data, ij), shape=summary_scores.shape)

In [26]:
time_slice.sum() / time_slice.shape[0] ** 2

0.029818405663281009

## Splice Multiple Slices

### 00: Query and Prep Data

In [29]:
sc = 0.25

In [50]:
reload(events_merge)

<module 'events_merge' from '/home/immersinn/gits/rssfeed_link_collector/src/events_merge.py'>

In [51]:
out = events_merge.process_timeslice_v2(mysql_utils.query_docs_by_datetime(cursor=cur,
                                                                           start_dt=date_list[0],
                                                                           end_dt=date_list[0 + 4]))

In [52]:
out.keys()

dict_keys(['title', 'summary'])

In [53]:
out['summary'].keys()

dict_keys(['tslice', 'doc_ids'])

In [46]:
details ={'summary' : {'features' : ['title', 'summary'],
                       'cutoff' : 0.1,
                       'to_binary' : False,
                       'make_symmetric' : True, 'sym_func' : lambda x,y : (x+y)/2}
         }

In [54]:
out_01 = events_merge.process_timeslice_v2(mysql_utils.query_docs_by_datetime(cursor=cur,
                                                                              start_dt=date_list[0],
                                                                              end_dt=date_list[0 + 4]),
                                           details=details)
docids_01 = out_01['summary']['doc_ids']
tslice_01 = out_01['summary']['tslice']

In [55]:
out_02 = events_merge.process_timeslice_v2(mysql_utils.query_docs_by_datetime(cursor=cur,
                                                                              start_dt=date_list[1],
                                                                              end_dt=date_list[1 + 4]),
                                           details=details)
docids_02 = out_02['summary']['doc_ids']
tslice_02 = out_02['summary']['tslice']

In [56]:
out_03 = events_merge.process_timeslice_v2(mysql_utils.query_docs_by_datetime(cursor=cur,
                                                                              start_dt=date_list[2],
                                                                              end_dt=date_list[2 + 4]),
                                           details=details)
docids_03 = out_03['summary']['doc_ids']
tslice_03 = out_03['summary']['tslice']

In [57]:
docids = {0 : docids_01,
          1 : docids_02,
          2 : docids_03}
tslices = {0 : tslice_01,
           1 : tslice_02,
           2 : tslice_03}

*We're going to assume that the IDs are numeric since that's what we will be moving to...*

In [66]:
class DocIDMapper():
    
    def __init__(self,):
        self.uids = set()
    
    def __len__(self):
        return(len(self.uids))
    
    def _update_ids(self, docids):
        self.uids.update(set(docids))
    
    def fit(self, docids):
        if type(docids[0]) in [tuple, list]:
            for dids in docids:
                self._update_ids(dids)
        else:
            self_update_ids(docids)
            
        self.lookup = {v : i for i,v in enumerate(self.uids)}
        self.revlu = {i : v for i,v in enumerate(self.uids)}
        
    def transform(self, docids):
        out = []
        for did in docids:
            try:
                out.append(self.lookup[did])
            except KeyError:
                out.append(None)
        return(out)

In [67]:
idmapper = DocIDMapper()

In [68]:
idmapper.fit([docids_01, docids_02, docids_03])

In [69]:
docids = {0 : idmapper.transform(docids_01),
          1 : idmapper.transform(docids_02),
          2 : idmapper.transform(docids_03)
         }

### 01: Find Matching Entries

In [58]:
connected_pairs = [(0,1), (1,2)]

In [59]:
s2smap = {}
for pair in connected_pairs:
    temp = []
    for i,did in enumerate(docids[pair[1]]):
        try:
            temp.append((docids[pair[0]].index(did), i))
        except ValueError:
            pass
    s2smap['-'.join([str(p) for p in pair])] = temp

In [60]:
s2smap['0-1'][:5]

[(0, 0), (1, 1), (2, 2), (3, 3), (4, 5)]

In [61]:
s2smap['1-2'][-5:]

[(285, 308), (286, 309), (287, 310), (288, 312), (289, 313)]

### 02: Create Big Graph

In [79]:
newi = []
newj = []
newdata = []
for k,ts in tslices.items():
    newi.extend([docids[k][ent] for ent in ts['ij'][0]])
    newj.extend([docids[k][ent] for ent in ts['ij'][1]])
    newdata.extend(ts['vals'])

In [82]:
bg = scipy.sparse.coo_matrix((newdata, (newi, newj)), shape=(len(idmapper), len(idmapper)))

In [83]:
bg.shape

(397, 397)

In [85]:
bg.sum()

5570.45537763218

In [None]:
bg_cci

In [None]:
groups, counts = test_spectral.spectralGraphPartition23(bg.tocsr(), Bin='bNG', finetune=False)

In [None]:
counts

In [None]:
groups

In [None]:
numpy.unique(grps)