In [28]:
import mongoengine
import os
import configparser
import requests
import json
import numpy as np
import pandas as pd
from opencage.geocoder import OpenCageGeocode
from datetime import datetime

Configparser implementation

In [8]:
config = configparser.ConfigParser()
config.read('config/meetup.cfg')
os.environ['MEETUP_API_KEY'] = config['MEETUP']['API_KEY']
os.environ['OPENCAGE_KEY']=config['OPENCAGE']['KEY']

geocode = OpenCageGeocode(os.environ['OPENCAGE_KEY'])

In [9]:
class Event:
    
    def __init__(self,
                params,
                url_path):
        self.params = params
        self.url_path = url_path
        
    
    def get_event(self, *args, **kwargs):
        
        r = requests.get(self.url_path, params=self.params)
        request_string = r.text
        
        return request_string

In [10]:
default_args = dict(
    country='United Stated',
    key= os.environ['MEETUP_API_KEY'],
    topic='Python'
)

url_meetup_request = "https://api.meetup.com/2/open_events"
events = Event(params=default_args, url_path=url_meetup_request)
response = events.get_event()

In [11]:
json_response = json.loads(response)
data = json_response['results']
len(data)

198

In [31]:
columns = ['id', 'date', 'year', 'month', 'day', 'country', 'city', 'state,', 'address', 'meetup_name', 'meetup_group_name', 'description', 'event_url', 'yes_rsvp_count', 'status']
id, date, year, month, day, country, city, state, address, meetup_name, meetup_group_name, description, event_url, yes_rsvp_count, status = ([] for i in range(15))

for label in data:
    date_event = datetime.fromtimestamp(label['time'] / 1000.0)
    
    id.append(label['id'])
    date.append(date_event)
    year.append(date_event.year)
    month.append(date_event.month)
    day.append(date_event.year)
    
    if label.get('venue'):
        country.append(label['venue'].get('country'))
        city.append(label['venue'].get('city'))
        state.append(label['venue'].get('state'))
        address.append(label['venue'].get('address_1'))
    else:
        location_json = geocode.reverse_geocode(label['group'].get('group_lat'), label['group'].get('group_lon'))
        country.append(location_json[0]['components'].get('country_code'))
        city.append(location_json[0]['components'].get('city'))
        state.append(location_json[0]['components'].get('state'))
        address.append(location_json[0].get('formatted'))
          
    meetup_name.append(label.get('name'))
    meetup_group_name.append(label['group'].get('name'))
    description.append(label.get('description'))
    event_url.append(label['event_url'])
    yes_rsvp_count.append(label.get('yes_rsvp_count'))
    status.append(label.get('status'))

In [63]:
# results_geocode = geocode.reverse_geocode(label['group']['group_lat'], label['group']['group_lon'])
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

spark = SparkSession \
    .builder \
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.5") \
    .getOrCreate()

schema = T.StructType([
        T.StructField("id", T.StringType(), True),
        T.StructField("date", T.TimestampType(), True),
        T.StructField("year", T.IntegerType(), True),
        T.StructField("month", T.IntegerType(), True),
        T.StructField("day", T.IntegerType(), True),
        T.StructField("country", T.StringType(), True),
        T.StructField("city", T.StringType(), True),
        T.StructField("state", T.StringType(), True),
        T.StructField("address", T.StringType(), True),
        T.StructField("meetup_name", T.StringType(), True),
        T.StructField("meetup_group_name", T.StringType(), True),
        T.StructField("description", T.StringType(), True),
        T.StructField("event_url", T.StringType(), True),
        T.StructField("yes_rsvp_count", T.IntegerType(), True),
        T.StructField("status", T.StringType(), True)  
])

df_pandas = pd.DataFrame(np.transpose([id, date, year, month, day, country, city, state, address, meetup_name, meetup_group_name, description, event_url, yes_rsvp_count, status]), columns=columns)
df = spark.createDataFrame(df_pandas, schema=schema)
df.limit(5).toPandas()

Unnamed: 0,id,date,year,month,day,country,city,state,address,meetup_name,meetup_group_name,description,event_url,yes_rsvp_count,status
0,gqwjvqyzhbmb,2019-05-09 10:30:00,2019,5,2019,us,Fredericksburg,VA,919 Caroline St,Coffee Caravan and chat every Thursday morning...,Fredericksburg Developers Group (FredDev),<p>This is a regular meetup over coffee in Fre...,https://www.meetup.com/FredDev/events/260852903/,5,upcoming
1,zzcwmqyzhbmb,2019-05-09 12:00:00,2019,5,2019,us,Chattanooga,TN,1100 Market Street,Smarter Kubernetes Access Control: A Simpler A...,Chadev aka Chattanooga Developer Lunch,<p>Configuring Kubernetes authorization polici...,https://www.meetup.com/chadevs/events/260950371/,28,upcoming
2,fvjqfpyzhbmb,2019-05-09 15:00:00,2019,5,2019,US,Reno,,1091 S Virginia St,Book Club Lunch at India Kabab and Curry,/dev/reno: The Reno Developers Meetup,<p>We're meeting at India Kabab and Curry to h...,https://www.meetup.com/dev-reno/events/260712452/,7,upcoming
3,260643310,2019-05-09 12:00:00,2019,5,2019,us,Raleigh,NC,310 South Harrington Street,Lunch & Learn - Architecting Scalable & Secure...,RIoT,"<p>Join Gordon Blackwell, Cloud Solution Archi...",https://www.meetup.com/NC-RIoT-Regional-Intern...,40,upcoming
4,mhnfwqyzhbmb,2019-05-09 13:00:00,2019,5,2019,us,New York,NY,853 Broadway,Thursday Hacker Hours - meet on second floor,Hacker Hours,<p>**Please read our policies about the meetup...,https://www.meetup.com/hackerhours/events/2611...,16,upcoming


UDF Functions

In [64]:
from pyspark.sql.functions import udf

# Upper lambda functions
upper_udf = F.udf(lambda x: x.upper())

# Remove tags from description column
@udf
def remove_tags_udf(text):
    import re
    TAG_RE = re.compile(r'<[^>]+>')
    return TAG_RE.sub('', text)

In [65]:
df = df.withColumn('country', upper_udf(df.country))

In [66]:
df = df.withColumn('description', remove_tags_udf(df.description))

In [67]:
df.limit(5).toPandas()

Unnamed: 0,id,date,year,month,day,country,city,state,address,meetup_name,meetup_group_name,description,event_url,yes_rsvp_count,status
0,gqwjvqyzhbmb,2019-05-09 10:30:00,2019,5,2019,US,Fredericksburg,VA,919 Caroline St,Coffee Caravan and chat every Thursday morning...,Fredericksburg Developers Group (FredDev),This is a regular meetup over coffee in Freder...,https://www.meetup.com/FredDev/events/260852903/,5,upcoming
1,zzcwmqyzhbmb,2019-05-09 12:00:00,2019,5,2019,US,Chattanooga,TN,1100 Market Street,Smarter Kubernetes Access Control: A Simpler A...,Chadev aka Chattanooga Developer Lunch,Configuring Kubernetes authorization policies ...,https://www.meetup.com/chadevs/events/260950371/,28,upcoming
2,fvjqfpyzhbmb,2019-05-09 15:00:00,2019,5,2019,US,Reno,,1091 S Virginia St,Book Club Lunch at India Kabab and Curry,/dev/reno: The Reno Developers Meetup,We're meeting at India Kabab and Curry to have...,https://www.meetup.com/dev-reno/events/260712452/,7,upcoming
3,260643310,2019-05-09 12:00:00,2019,5,2019,US,Raleigh,NC,310 South Harrington Street,Lunch & Learn - Architecting Scalable & Secure...,RIoT,"Join Gordon Blackwell, Cloud Solution Architec...",https://www.meetup.com/NC-RIoT-Regional-Intern...,40,upcoming
4,mhnfwqyzhbmb,2019-05-09 13:00:00,2019,5,2019,US,New York,NY,853 Broadway,Thursday Hacker Hours - meet on second floor,Hacker Hours,**Please read our policies about the meetup bu...,https://www.meetup.com/hackerhours/events/2611...,16,upcoming
