In [4]:
import mongoengine
import os
import configparser
import requests
import json
import numpy as np
import pandas as pd
import pymongo
from opencage.geocoder import OpenCageGeocode
from datetime import datetime

Configparser implementation

In [5]:
config = configparser.ConfigParser()
config.read('config/meetup.cfg')
os.environ['MEETUP_API_KEY'] = config['MEETUP']['API_KEY']
os.environ['OPENCAGE_KEY']=config['OPENCAGE']['KEY']

geocode = OpenCageGeocode(os.environ['OPENCAGE_KEY'])

In [6]:
class Event:
    
    def __init__(self,
                params,
                url_path):
        self.params = params
        self.url_path = url_path
        
    
    def get_event(self, *args, **kwargs):
        
        r = requests.get(self.url_path, params=self.params)
        request_string = r.text
        
        return request_string

In [7]:
default_args = dict(
    country='United Stated',
    key= os.environ['MEETUP_API_KEY'],
    topic='Python'
)

url_meetup_request = "https://api.meetup.com/2/open_events"
events = Event(params=default_args, url_path=url_meetup_request)
response = events.get_event()

In [8]:
json_response = json.loads(response)
data = json_response['results']
len(data)

200

In [9]:
columns = ['id', 'date', 'year', 'month', 'day', 'country', 'city', 'state,', 'address', 'meetup_name', 'meetup_group_name', 'description', 'event_url', 'yes_rsvp_count', 'status']
id, date, year, month, day, country, city, state, address, meetup_name, meetup_group_name, description, event_url, yes_rsvp_count, status = ([] for i in range(15))

for label in data:
    date_event = datetime.fromtimestamp(label['time'] / 1000.0)
    
    id.append(label['id'])
    date.append(date_event)
    year.append(date_event.year)
    month.append(date_event.month)
    day.append(date_event.year)
    
    if label.get('venue'):
        country.append(label['venue'].get('country'))
        city.append(label['venue'].get('city'))
        state.append(label['venue'].get('state'))
        address.append(label['venue'].get('address_1'))
    else:
        location_json = geocode.reverse_geocode(label['group'].get('group_lat'), label['group'].get('group_lon'))
        country.append(location_json[0]['components'].get('country_code'))
        city.append(location_json[0]['components'].get('city'))
        state.append(location_json[0]['components'].get('state'))
        address.append(location_json[0].get('formatted'))
          
    meetup_name.append(label.get('name'))
    meetup_group_name.append(label['group'].get('name'))
    description.append(label.get('description'))
    event_url.append(label['event_url'])
    yes_rsvp_count.append(label.get('yes_rsvp_count'))
    status.append(label.get('status'))

In [31]:
# results_geocode = geocode.reverse_geocode(label['group']['group_lat'], label['group']['group_lon'])
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

mongo_conn_str =  "mongodb://mongoadmin:{}@dev-mongo-shard-00-00-klryn.mongodb.net:27017,dev-mongo-shard-00-01-klryn.mongodb.net:27017,dev-mongo-shard-00-02-klryn.mongodb.net:27017/test?ssl=true&replicaSet=dev-mongo-shard-0&authSource=admin&retryWrites=true".format('Kenny9353')

spark = SparkSession \
    .builder \
    .appName("meetupcollections") \
    .master('local') \
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.3.1')\
    .config("spark.mongodb.input.uri", mongo_conn_str) \
    .config("spark.mongodb.output.uri", mongo_conn_str) \
    .getOrCreate()

schema = T.StructType([
        T.StructField("id", T.StringType(), True),
        T.StructField("date", T.TimestampType(), True),
        T.StructField("year", T.IntegerType(), True),
        T.StructField("month", T.IntegerType(), True),
        T.StructField("day", T.IntegerType(), True),
        T.StructField("country", T.StringType(), True),
        T.StructField("city", T.StringType(), True),
        T.StructField("state", T.StringType(), True),
        T.StructField("address", T.StringType(), True),
        T.StructField("meetup_name", T.StringType(), True),
        T.StructField("meetup_group_name", T.StringType(), True),
        T.StructField("description", T.StringType(), True),
        T.StructField("event_url", T.StringType(), True),
        T.StructField("yes_rsvp_count", T.IntegerType(), True),
        T.StructField("status", T.StringType(), True)  
])

df_pandas = pd.DataFrame(np.transpose([id, date, year, month, day, country, city, state, address, meetup_name, meetup_group_name, description, event_url, yes_rsvp_count, status]), columns=columns)
df = spark.createDataFrame(df_pandas, schema=schema)
df.limit(5).toPandas()

Unnamed: 0,id,date,year,month,day,country,city,state,address,meetup_name,meetup_group_name,description,event_url,yes_rsvp_count,status
0,260289533,2019-05-10 12:00:00,2019,5,2019,us,Modesto,CA,4701 Stoddard St,Angular Boot Camp,Valley Software Developers,<p>The goal of this boot camp is to help progr...,https://www.meetup.com/Valley-Software-Develop...,11,upcoming
1,skrjvqyzhbnb,2019-05-10 13:00:00,2019,5,2019,us,Oakland,CA,8000 Edgewater Dr.,Founder Fridays,Robot Developers Group - Bay Area,<p>Meet other Circuit Launch founders (and our...,https://www.meetup.com/SFRobots/events/261061901/,6,upcoming
2,trjwjqyzhbnb,2019-05-10 12:30:00,2019,5,2019,us,Memphis,TN,"6773 Stage Rd, Bartlett, TN 38134",#memtech Coworking Lunch,Memphis Technology User Groups,<p>This is a monthly lunch meetup in the Memph...,https://www.meetup.com/memphis-technology-user...,5,upcoming
3,260920133,2019-05-10 15:15:00,2019,5,2019,us,Seattle,WA,2901 3rd Ave,Partner Power Hour: Charting Non-traditional p...,Code Fellows Seattle Meetup,<p>Come engage in a discussion to explore how ...,https://www.meetup.com/codefellows/events/2609...,5,upcoming
4,261055275,2019-05-10 21:00:00,2019,5,2019,us,Oakland,CA,1721 Broadway,Code Unplugged - CS Concepts for Beginners,Hack & Learn - Beginners (& pros) coding cool ...,<p>Instead of our normal open-ended hacking se...,https://www.meetup.com/Hack-and-Learn/events/2...,11,upcoming


UDF Functions

In [32]:
from pyspark.sql.functions import udf

# Upper lambda functions
upper_udf = F.udf(lambda x: x.upper())

# Remove tags from description column
@udf
def remove_tags_udf(text):
    import re
    TAG_RE = re.compile(r'<[^>]+>')
    return TAG_RE.sub('', text)

In [33]:
df = df.withColumn('country', upper_udf(df.country))

In [34]:
df = df.withColumn('description', remove_tags_udf(df.description))

In [35]:
df.limit(5).toPandas()

Unnamed: 0,id,date,year,month,day,country,city,state,address,meetup_name,meetup_group_name,description,event_url,yes_rsvp_count,status
0,260289533,2019-05-10 12:00:00,2019,5,2019,US,Modesto,CA,4701 Stoddard St,Angular Boot Camp,Valley Software Developers,The goal of this boot camp is to help programm...,https://www.meetup.com/Valley-Software-Develop...,11,upcoming
1,skrjvqyzhbnb,2019-05-10 13:00:00,2019,5,2019,US,Oakland,CA,8000 Edgewater Dr.,Founder Fridays,Robot Developers Group - Bay Area,Meet other Circuit Launch founders (and our fr...,https://www.meetup.com/SFRobots/events/261061901/,6,upcoming
2,trjwjqyzhbnb,2019-05-10 12:30:00,2019,5,2019,US,Memphis,TN,"6773 Stage Rd, Bartlett, TN 38134",#memtech Coworking Lunch,Memphis Technology User Groups,This is a monthly lunch meetup in the Memphis ...,https://www.meetup.com/memphis-technology-user...,5,upcoming
3,260920133,2019-05-10 15:15:00,2019,5,2019,US,Seattle,WA,2901 3rd Ave,Partner Power Hour: Charting Non-traditional p...,Code Fellows Seattle Meetup,Come engage in a discussion to explore how to ...,https://www.meetup.com/codefellows/events/2609...,5,upcoming
4,261055275,2019-05-10 21:00:00,2019,5,2019,US,Oakland,CA,1721 Broadway,Code Unplugged - CS Concepts for Beginners,Hack & Learn - Beginners (& pros) coding cool ...,Instead of our normal open-ended hacking sessi...,https://www.meetup.com/Hack-and-Learn/events/2...,11,upcoming


In [36]:
df.write.format("com.mongodb.spark.sql.DefaultSource").mode("append").option("database", "meetup").option("collection","events").save()

Py4JJavaError: An error occurred while calling o491.save.
: java.lang.ClassNotFoundException: Failed to find data source: com.mongodb.spark.sql.DefaultSource. Please find packages at http://spark.apache.org/third-party-projects.html
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:657)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:244)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.ClassNotFoundException: com.mongodb.spark.sql.DefaultSource.DefaultSource
	at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$20$$anonfun$apply$12.apply(DataSource.scala:634)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$20$$anonfun$apply$12.apply(DataSource.scala:634)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$20.apply(DataSource.scala:634)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$20.apply(DataSource.scala:634)
	at scala.util.Try.orElse(Try.scala:84)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:634)
	... 12 more
