dijkstra code source: http://www.bogotobogo.com/python/python_Dijkstras_Shortest_Path_Algorithm.php  
wikipedia data: https://dumps.wikimedia.org/enwiki/latest/

In [3]:
import sys
import os
import gzip
import pandas as pd
from sqlalchemy import create_engine

## SQL Alchemy

In [16]:
cnx = create_engine('postgresql://kpully@localhost:5432/wikilinks')
#cnx.table_names()

In [26]:
df = pd.read_sql_query('''SELECT * FROM pagelinks LIMIT 5''',cnx)

In [27]:
df.head()

Unnamed: 0,pl_from,pl_from_namespace,pl_title,pl_namespace
0,19327051,0,test,0
1,19327051,0,(388188)_2006_DP14\,0
2,19335567,0,(388188)_2006_DP14\,0


In [36]:
#create graph from dataframe
g = Graph()

for i, row in df.iterrows():
    g.add_vertex(row.pl_title)
    g.add_edge(row.pl_title, row.pl_from, 1)

In [37]:
    print 'Graph data:'
    for v in g:
        for w in v.get_connections():
            vid = v.get_id()
            wid = w.get_id()
            print '( %s , %s, %3d)'  % ( vid, wid, v.get_weight(w))

    dijkstra(g, g.get_vertex('test'), g.get_vertex('(388188)_2006_DP14\\')) 

    target = g.get_vertex('(388188)_2006_DP14\\')
    path = [target.get_id()]
    shortest(target, path)
    print 'The shortest path : %s' %(path[::-1])

Graph data:
( test , 19327051,   1)
( 19327051 , (388188)_2006_DP14\,   1)
( 19327051 , test,   1)
( 19335567 , (388188)_2006_DP14\,   1)
( (388188)_2006_DP14\ , 19335567,   1)
Dijkstra's shortest path
The shortest path : [u'(388188)_2006_DP14\\']


## Dijkstra's Shortest Path Algorithm

In [32]:
class Vertex:
    def __init__(self, node):
        self.id = node
        self.adjacent = {}
        # Set distance to infinity for all nodes
        self.distance = sys.maxint
        # Mark all nodes unvisited        
        self.visited = False  
        # Predecessor
        self.previous = None

    def add_neighbor(self, neighbor, weight=0):
        self.adjacent[neighbor] = weight

    def get_connections(self):
        return self.adjacent.keys()  

    def get_id(self):
        return self.id

    def get_weight(self, neighbor):
        return self.adjacent[neighbor]

    def set_distance(self, dist):
        self.distance = dist

    def get_distance(self):
        return self.distance

    def set_previous(self, prev):
        self.previous = prev

    def set_visited(self):
        self.visited = True

    def __str__(self):
        return str(self.id) + ' adjacent: ' + str([x.id for x in self.adjacent])

class Graph:
    def __init__(self):
        self.vert_dict = {}
        self.num_vertices = 0

    def __iter__(self):
        return iter(self.vert_dict.values())

    def add_vertex(self, node):
        self.num_vertices = self.num_vertices + 1
        new_vertex = Vertex(node)
        self.vert_dict[node] = new_vertex
        return new_vertex

    def get_vertex(self, n):
        if n in self.vert_dict:
            return self.vert_dict[n]
        else:
            return None

    def add_edge(self, frm, to, cost = 0):
        if frm not in self.vert_dict:
            self.add_vertex(frm)
        if to not in self.vert_dict:
            self.add_vertex(to)

        self.vert_dict[frm].add_neighbor(self.vert_dict[to], cost)
        self.vert_dict[to].add_neighbor(self.vert_dict[frm], cost)

    def get_vertices(self):
        return self.vert_dict.keys()

    def set_previous(self, current):
        self.previous = current

    def get_previous(self, current):
        return self.previous

def shortest(v, path):
    ''' make shortest path from v.previous'''
    if v.previous:
        path.append(v.previous.get_id())
        shortest(v.previous, path)
    return

import heapq

def dijkstra(aGraph, start, target):
    print '''Dijkstra's shortest path'''
    # Set the distance for the start node to zero 
    start.set_distance(0)

    # Put tuple pair into the priority queue
    unvisited_queue = [(v.get_distance(),v) for v in aGraph]
    heapq.heapify(unvisited_queue)

    while len(unvisited_queue):
        # Pops a vertex with the smallest distance 
        uv = heapq.heappop(unvisited_queue)
        current = uv[1]
        current.set_visited()

        #for next in v.adjacent:
        for next in current.adjacent:
            # if visited, skip
            if next.visited:
                continue
            new_dist = current.get_distance() + current.get_weight(next)
            
            if new_dist < next.get_distance():
                next.set_distance(new_dist)
                next.set_previous(current)

        # Rebuild heap
        # 1. Pop every item
        while len(unvisited_queue):
            heapq.heappop(unvisited_queue)
        # 2. Put all vertices not visited into the queue
        unvisited_queue = [(v.get_distance(),v) for v in aGraph if not v.visited]
        heapq.heapify(unvisited_queue)
    
if __name__ == '__main__':

    g = Graph()

    g.add_vertex('a')
    g.add_vertex('b')
    g.add_vertex('c')
    g.add_vertex('d')
    g.add_vertex('e')
    g.add_vertex('f')

    g.add_edge('a', 'b', 7)  
    g.add_edge('a', 'c', 9)
    g.add_edge('a', 'f', 14)
    g.add_edge('b', 'c', 10)
    g.add_edge('b', 'd', 15)
    g.add_edge('c', 'd', 11)
    g.add_edge('c', 'f', 2)
    g.add_edge('d', 'e', 6)
    g.add_edge('e', 'f', 9)

    print 'Graph data:'
    for v in g:
        for w in v.get_connections():
            vid = v.get_id()
            wid = w.get_id()
            print '( %s , %s, %3d)'  % ( vid, wid, v.get_weight(w))

    dijkstra(g, g.get_vertex('a'), g.get_vertex('e')) 

    target = g.get_vertex('e')
    path = [target.get_id()]
    shortest(target, path)
    print 'The shortest path : %s' %(path[::-1])

Graph data:
( a , b,   7)
( a , c,   9)
( a , f,  14)
( c , b,  10)
( c , a,   9)
( c , f,   2)
( c , d,  11)
( b , c,  10)
( b , a,   7)
( b , d,  15)
( e , f,   9)
( e , d,   6)
( d , b,  15)
( d , c,  11)
( d , e,   6)
( f , c,   2)
( f , a,  14)
( f , e,   9)
Dijkstra's shortest path
The shortest path : ['a', 'c', 'f', 'e']


## Get files from local

In [1]:
# pd.read_table("test_file")

In [55]:
file_path='/Users/kpully/Downloads/enwiki-latest-page.sql.gz'




In [None]:
with gzip.open(file_path,'r') as f:
    for line in f:
        print('got line', line)

('got line', '-- MySQL dump 10.13  Distrib 5.5.47, for debian-linux-gnu (x86_64)\n')
('got line', '--\n')
('got line', '-- Host: 10.64.48.20    Database: enwiki\n')
('got line', '-- ------------------------------------------------------\n')
('got line', '-- Server version\t5.5.5-10.0.29-MariaDB\n')
('got line', '\n')
('got line', '/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;\n')
('got line', '/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;\n')
('got line', '/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;\n')
('got line', '/*!40101 SET NAMES utf8 */;\n')
('got line', '/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;\n')
('got line', "/*!40103 SET TIME_ZONE='+00:00' */;\n")
('got line', '/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;\n')
('got line', '/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;\n')
('got line', "/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE

In [2]:
# mylist = []

# for chunk in  pd.read_table(file_path, compression='gzip',sep='\x01'):
#     mylist.append(chunk)

# big_data = pd.concat(mylist, axis= 0)
# del mylist

In [57]:
df = pd.read_table(file_path,compression='gzip',sep='\x01')

CParserError: Error tokenizing data. C error: out of memory

## Wikipedia data / AWS

In [39]:
import boto3

In [43]:
# 

In [49]:
s3 = boto3.client('s3')

In [None]:
# s3.get_object(Bucket='wiki-page-links')

In [None]:
# for bucket in s3.buckets.all():
#     print(bucket.name)