In [54]:
#V2.0 of the project

#Project

#First we start spark
import pyspark
import os
from pyspark.sql.functions import *

#Load the packages before creating the context
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.4.1 pyspark-shell'

#Initializing the Spark Context
sc = pyspark.SparkContext.getOrCreate()

#Initialize the SQL context
sql = pyspark.SQLContext(sc)

In [55]:
#Lets load a sample of the data 
mydata = sql.read.format("csv").options(delimiter='\t').load("C:/Users/Javier/OneDrive/Umass Dartmouth/Scalable Data Analysis/Project/20151214160000.gkg.csv")

#Everytime a file is uploaded we should update

#Renaming the columns
mydata = mydata.toDF("GKGRECORDID",
          "V2.1 DATE",
          "V2SOURCECOLLECTIONIDENTIFIER",
          "V2SOURCECOMMONNAME",
          "V2DOCUMENTIDENTIFIER",
          "V1COUNTS",
          "V2.1COUNTS",
          "V1THEMES",
          "V2ENHANCEDTHEMES",
          "V1LOCATIONS",
          "V2ENHANCEDLOCATIONS",
          "V1PERSONS",
          "V2ENHANCEDPERSONS",
          "V1ORGANIZATIONS",
          "V2ENHANCEDORGANIZATIONS",
          "V1.5TONE",
          "V2.1ENHANCEDDATES",
          "V2GCAM",
          "V2.1SHARINGIMAGE",
          "V2.1RELATEDIMAGES",
          "V2.1SOCIALIMAGEEMBEDS",
          "V2.1SOCIALVIDEOEMBEDS",
          "V2.1QUOTATIONS",
          "V2.1ALLNAMES",
          "V2.1AMOUNTS",
          "V2.1TRANSLATIONINFO",
          "V2EXTRASXML")

#Number of rows we have
nrows = mydata.count()

print("Number of rows: ",nrows)

#Collect data so we can treat it as a list
mydata = mydata.collect()

#Initializing the location list
loclists = []

#Change the limit to nrows later
for i in range(1,50):
 
    loc = mydata[i]["V2ENHANCEDLOCATIONS"]
        
    #If locations null we should skip this part
    
    if loc==None:
        print("NO LOCATION DATA")
        continue
    
    #If the locations exist, we continue saving the place
    locations = loc.split(";")
    latlist = []
    
    for location in locations:
        locatt = location.split("#")
        lat = locatt[5]
        long = locatt[6]
        latlong = [lat,long]
        latlist.append(latlong)
        #We should save in the third part of the list the number of times same location appeared to set a circle size in the map
    
    
    loclists.append(latlist)
    
print(loclists[1][:])

#Depending on the number of locations for each new we should create a different GEOJSON, point, polygon, etc.

  


Number of rows:  3187
NO LOCATION DATA
NO LOCATION DATA
NO LOCATION DATA
NO LOCATION DATA
NO LOCATION DATA
NO LOCATION DATA
NO LOCATION DATA
[['32', '53'], ['32', '53'], ['32', '53'], ['32', '53'], ['32', '53'], ['32', '53'], ['32', '53'], ['32', '53'], ['32', '53'], ['33', '65'], ['33', '65'], ['32', '53'], ['32', '53'], ['32', '53'], ['48.2', '16.3667'], ['48.2', '16.3667'], ['48.2', '16.3667'], ['45.6333', '5.73333'], ['45.6333', '5.73333'], ['45.6333', '5.73333'], ['45.6333', '5.73333'], ['32', '53'], ['47.3333', '13.3333']]


In [162]:
#Transforming into GEOJSON
from geojson import Point,GeometryCollection,FeatureCollection,Feature
import geojson
import numpy as np
import pandas as pd

#Initialize the geo collection
geo_collection = GeometryCollection()

features = []
holafeatures = []

for latlong in loclists:
    nloc = len(latlong)
    
    #If we only have one location
    if nloc == 1:
        #We extract the latitude and longitude
        lat = latlong[0][0]
        long = latlong[0][1]
        
        #We convert the latitude and longitude from string to float
        #coords = (float(lat), float(long))
        coords = (float(long), float(lat))
        
        #Create a point
        my_point = Point(coords)
        features.append(my_point)
        holafeatures.append(geojson.Feature(geometry = geojson.Point(coords)))
    
    #If we have multiple locations
    if nloc>1:
        for location in latlong:
            lat = latlong[0][0]
            long = latlong[0][1]   
            
            #We convert the latitude and longitude from string to float
            #coords = (float(lat), float(long))
            coords = (float(long), float(lat))
        
            #Create a point
            my_point = Point(coords)
            features.append(my_point)
            holafeatures.append(geojson.Feature(geometry = geojson.Point(coords)))
            
    
#Printing our results
holafeatures = FeatureCollection(holafeatures)
print(holafeatures)

{"features": [{"geometry": {"coordinates": [53.0, 32.0], "type": "Point"}, "properties": {}, "type": "Feature"}, {"geometry": {"coordinates": [53.0, 32.0], "type": "Point"}, "properties": {}, "type": "Feature"}, {"geometry": {"coordinates": [53.0, 32.0], "type": "Point"}, "properties": {}, "type": "Feature"}, {"geometry": {"coordinates": [53.0, 32.0], "type": "Point"}, "properties": {}, "type": "Feature"}, {"geometry": {"coordinates": [53.0, 32.0], "type": "Point"}, "properties": {}, "type": "Feature"}, {"geometry": {"coordinates": [53.0, 32.0], "type": "Point"}, "properties": {}, "type": "Feature"}, {"geometry": {"coordinates": [53.0, 32.0], "type": "Point"}, "properties": {}, "type": "Feature"}, {"geometry": {"coordinates": [53.0, 32.0], "type": "Point"}, "properties": {}, "type": "Feature"}, {"geometry": {"coordinates": [53.0, 32.0], "type": "Point"}, "properties": {}, "type": "Feature"}, {"geometry": {"coordinates": [53.0, 32.0], "type": "Point"}, "properties": {}, "type": "Feature

In [153]:
#Another approach using pandas dataframes
mydf = pd.DataFrame(columns=('lat','lon'))

i = 0

for latlong in loclists:
    nloc = len(latlong)
    
    #If we only have one location
    if nloc == 1:
        #We extract the latitude and longitude
        lat = latlong[0][0]
        long = latlong[0][1]
        
        #We convert the latitude and longitude from string to float
        coords = [float(lat), float(lon)]
        
        mydf.loc[i] = (coords)
        i = i+1
    
    
    #If we have multiple locations
    if nloc>1:
        for location in latlong:
            lat = latlong[0][0]
            long = latlong[0][1]   
            
            #We convert the latitude and longitude from string to float
            coords = [float(lat), float(lon)]
        
            #Create a point
            mydf.loc[i] = coords
            i = i+1

data = df_to_geojson(mydf, properties=[])
print(data)
            

{"features": [{"geometry": {"coordinates": [31.25, 30.05], "type": "Point"}, "properties": {}, "type": "Feature"}, {"geometry": {"coordinates": [53.0, 32.0], "type": "Point"}, "properties": {}, "type": "Feature"}, {"geometry": {"coordinates": [53.0, 32.0], "type": "Point"}, "properties": {}, "type": "Feature"}, {"geometry": {"coordinates": [53.0, 32.0], "type": "Point"}, "properties": {}, "type": "Feature"}, {"geometry": {"coordinates": [53.0, 32.0], "type": "Point"}, "properties": {}, "type": "Feature"}, {"geometry": {"coordinates": [53.0, 32.0], "type": "Point"}, "properties": {}, "type": "Feature"}, {"geometry": {"coordinates": [53.0, 32.0], "type": "Point"}, "properties": {}, "type": "Feature"}, {"geometry": {"coordinates": [53.0, 32.0], "type": "Point"}, "properties": {}, "type": "Feature"}, {"geometry": {"coordinates": [53.0, 32.0], "type": "Point"}, "properties": {}, "type": "Feature"}, {"geometry": {"coordinates": [53.0, 32.0], "type": "Point"}, "properties": {}, "type": "Featu

In [163]:
#We will use MapBoxGL for the visualization

#Mapboxgl
from mapboxgl.viz import *
from mapboxgl.utils import df_to_geojson, df_to_hexbin, create_radius_stops, scale_between
from mapboxgl.colors import create_color_stops

#Token for the visualization
token = 'pk.eyJ1IjoiamFyZWNoYWxkZSIsImEiOiJjajhzeTU5azYwZzByMnFwODJmeHhoenRwIn0.P1WCNDBpGPddUTysHXr_wA'

#visualization
viz = CircleViz(holafeatures, access_token=token, height='1000px', width='1000px')

#Visualization parameters
viz.center = (0,0)
viz.zoom = (5) 
viz.show()