In [None]:
#!pip install meetup.api

In [10]:
import codecs
import io
import json
import os
import re
import string
import sys
import time
from urllib.parse import quote_plus

import meetup.api
import pandas as pd
import requests
import sqlalchemy as sa

In [53]:
def getCreds(filename,subset,defaults=None):
    '''
    Load one section of a JSON-encoded credentials file.

    The userid/password information is stored in a local file encoded
    in json format; this function returns the part stored under one
    top-level key.
    Parameters:
    filename: path of the json file holding the userid/password information
    subset: dictionary key in json file
    defaults: value returned when the credentials cannot be loaded; when
              omitted, a new empty dictionary is used
    Returns: subdictionary based on subset parameter, or defaults when the
    file cannot be opened or parsed, the key is missing, or the stored
    value is empty.
    '''
    if defaults is None:
        # Build the fallback per call so callers never share (and mutate)
        # one module-wide default dictionary.
        defaults = {}
    try:
        # The with-block closes the file, so no explicit close() is needed.
        with open(filename,'r') as file:
            D = json.load(file)
    except (OSError, ValueError, TypeError):
        # Unreadable/missing file, invalid filename type, or contents that
        # are not valid JSON (JSONDecodeError is a ValueError subclass).
        return defaults
    try:
        section = D[subset]
    except (LookupError, TypeError):
        # Key absent, or the top-level JSON value cannot be indexed by subset.
        return defaults
    return section if section else defaults

def db_setup(userid,password):
    '''
    This function connects us to the MySQL server on hadoop2 using our
    login credentials.  The connection URL names no database, so none is
    selected by this function.
    Parameters:
    userid - string with our userid
    password - string with our password
    Returns:
    SQL engine and SQL connection objects
    '''
    template = 'mysql+mysqlconnector://{}:{}@hadoop2.mathsci.denison.edu/'
    # Percent-encode the credentials so characters that are special inside
    # a URL (such as '@', ':' or '/') cannot corrupt the connection string.
    cstring = template.format(quote_plus(str(userid)),quote_plus(str(password)))
    engine = sa.create_engine(cstring)
    connection = engine.connect()

    return engine, connection

In [48]:
def scrapeData(protocol, host, resource, timeout=30):
    '''
    This function takes elements of a URL and returns the raw text from
    the webpage.
    Parameters:
    protocol - URL scheme placed before "://"
    host - host name placed after "://"
    resource - remainder of the URL, appended directly after the host
    timeout - seconds passed to requests.get as its timeout, so a stalled
              server cannot block the notebook indefinitely
    Returns: response text when the status code is 200; otherwise prints
    "Error" and returns None
    '''
    template = "{}://{}{}"
    url = template.format(protocol, host, resource)
    resp = requests.get(url, timeout=timeout)
    if resp.status_code == 200:
        return resp.text
    print("Error")
    # Explicit so callers can see that a failed request yields None.
    return None

def extractSectionBetween(txt, start, end):
    '''
    Slices text by substring boundaries rather than index boundaries.
    Parameters: text to slice, start string, end string
    Returns: the part of txt beginning at the first occurrence of start
    (start itself is included) and stopping just before the first
    occurrence of end that follows it.  Returns an empty string when
    start is not present, and everything from start to the end of txt
    when end does not occur after start.
    '''
    startInd = txt.find(start)
    if startInd == -1:
        # find() signals "not found" with -1, which must not be used as a
        # slice index (it would silently mean "last character").
        return ""
    # Look for the end marker only after the start marker, so an earlier
    # occurrence of end cannot produce an empty or inverted slice.
    endInd = txt.find(end, startInd + len(start))
    if endInd == -1:
        return txt[startInd:]
    return txt[startInd:endInd]

def recreateTable(txt):
    '''
    Builds a pandas data frame from the county/population cell pairs
    found in raw HTML table text.
    Parameters: html text
    Returns: pandas data frame with columns CountyID, County, State and
    Population; CountyID numbers the matched rows from 1 in the order
    they are found
    '''
    rowPattern = re.compile(r"<td><a .*?>(?P<county>.*?)<\/a>\n<\/td>\n<td .*?>(?P<pop>.*?)\n<\/td>")
    rows = []
    for match in rowPattern.finditer(txt):
        # The captured link text is split on commas: the first piece is
        # kept as the county and the second piece as the state.
        pieces = match["county"].split(",")
        rows.append((
            pieces[0].strip(),
            pieces[1].strip(),
            # Population is written with thousands separators.
            int(match["pop"].replace(",", "")),
        ))
    return pd.DataFrame({
        "CountyID": list(range(1, len(rows) + 1)),
        "County": [row[0] for row in rows],
        "State": [row[1] for row in rows],
        "Population": [row[2] for row in rows],
    })

In [49]:
def makeDataFrame(name, uid, state, city, lon, lat):
    '''
    Assembles member attributes into a pandas data frame, replacing each
    state label with a numeric id.
    Parameters: name, uid, state, city, lon, lat - parallel lists holding
    one entry per member
    Returns: (df, statesMap).  df has the columns name, uid, state, city,
    lon and lat.  statesMap maps every distinct state label to its id,
    numbered 1, 2, ... in sorted label order.  A name containing any
    character outside string.printable, or any punctuation character, is
    replaced with 'badName'.  The lists passed in are not modified.
    '''
    statesMap = {label: idx for idx, label in enumerate(sorted(set(state)), start=1)}
    # Characters a name may contain: printable, but not punctuation.
    allowed = set(string.printable) - set(string.punctuation)
    # Build new lists instead of overwriting the caller's name/state lists.
    cleanNames = [n if all(ch in allowed for ch in n) else 'badName' for n in name]
    stateIds = [statesMap[label] for label in state]
    df = pd.DataFrame({"name":cleanNames, "uid":uid, "state":stateIds, "city":city, "lon":lon, "lat":lat})
    return df, statesMap

def _memberField(member, field, default):
    # Return member[field], or default when the record lacks that field.
    try:
        return member[field]
    except KeyError:
        return default


def superFetch(groupname, key):
    '''
    Downloads the member list of a Meetup group page by page and returns
    it as a data frame.
    Parameters:
    groupname - the group's urlname on Meetup
    key - API key used to create the meetup.api client
    Returns: (df, stateMap) as produced by makeDataFrame from the
    collected name, id, state, city, lon and lat values.
    Fields missing from a member record are filled with placeholders:
    'badName' for name, 'Atlantis' for state and city, and the string
    'null' for both lon and lat.  A record without 'id' raises KeyError.
    Progress is printed while fetching.
    '''
    print("Start fetching...")
    client = meetup.api.Client(key)
    # NOTE(review): the client appears to print its own rate-limit status
    # after this prefix - confirm against the meetup.api package.
    print('request status:',end=' ')
    group = client.GetGroup({'urlname':groupname})
    gid = group.id
    total = group.members
    want = 'id,name,state,city,lon,lat'
    limit = 200
    request = total//limit+1
    offsetValue = 0
    name = []
    uid = []
    state = []
    city = []
    lon = []
    lat = []
    for i in range(request):
        print('progress: ',offsetValue+1,'/',request, sep='', end=' ')
        print('request status:',end=' ')
        try:
            members = client.GetMembers(group_id=gid, only=want,page=limit,offset=offsetValue)
            # The offset only advances after a successful request, so a
            # failed page is requested again on the next iteration; the
            # iteration count is fixed, so each failure leaves one
            # trailing page unfetched.
            offsetValue+=1
            for j in members.results:
                # Every record appends exactly one value to each of the six
                # lists, keeping them the same length for makeDataFrame.
                uid.append(j['id'])
                name.append(_memberField(j, 'name', 'badName'))
                state.append(_memberField(j, 'state', 'Atlantis'))
                city.append(_memberField(j, 'city', 'Atlantis'))
                # Coordinates are kept or dropped as a pair.
                try:
                    memberLon, memberLat = j['lon'], j['lat']
                except KeyError:
                    memberLon = memberLat = 'null'
                lon.append(memberLon)
                lat.append(memberLat)
        except json.decoder.JSONDecodeError:
            print("JSONDecodeError")
    print("fetching complete")
    df, stateMap = makeDataFrame(name, uid, state, city, lon, lat)
    return df, stateMap

In [41]:
# Read the Meetup API key from the environment so the secret is not
# stored in (or shared with) the notebook file.
key = os.environ.get('MEETUP_API_KEY')
if not key:
    raise RuntimeError("Set the MEETUP_API_KEY environment variable before running this cell.")
df, stateMap = superFetch('Data-Science-DC', key)

# Download the Wikipedia article and keep only the text between the two
# marker phrases, which is then parsed for county/population rows.
txt = scrapeData("https", "en.wikipedia.org", "/wiki/Washington_metropolitan_area")
txt = extractSectionBetween(txt, "92.3 years", "The Washington, D.C. area has the largest science")
popTable = recreateTable(txt)
# Replace the state names scraped from the page with the numeric ids that
# superFetch assigned to the corresponding member state codes.
popTable["State"] = popTable["State"].replace("D.C.", stateMap['DC'])
popTable["State"] = popTable["State"].replace("Maryland", stateMap['MD'])
popTable["State"] = popTable["State"].replace("Virginia", stateMap['VA'])
print(popTable)

Start fetching...
request status: 29/30 (10 seconds remaining)
progress: 1/58 request status: 28/30 (10 seconds remaining)
progress: 2/58 request status: 27/30 (8 seconds remaining)
progress: 3/58 request status: 26/30 (7 seconds remaining)
progress: 4/58 request status: 25/30 (6 seconds remaining)
progress: 5/58 request status: 24/30 (4 seconds remaining)
progress: 6/58 request status: 23/30 (3 seconds remaining)
progress: 7/58 request status: 22/30 (1 seconds remaining)
progress: 8/58 request status: 29/30 (10 seconds remaining)
progress: 9/58 request status: 28/30 (8 seconds remaining)
progress: 10/58 request status: 27/30 (7 seconds remaining)
progress: 11/58 request status: 26/30 (5 seconds remaining)
progress: 12/58 request status: 25/30 (3 seconds remaining)
progress: 13/58 request status: 24/30 (2 seconds remaining)
progress: 14/58 request status: 29/30 (10 seconds remaining)
progress: 15/58 request status: 28/30 (8 seconds remaining)
progress: 16/58 request status: 27/30 (6 se

In [55]:
# Load the database login from the "mysql" section of creds.json, falling
# back to the built-in default login when that cannot be loaded.
creds = getCreds('creds.json','mysql',defaults = {'user':'studen_j1', 'password':'studen_j1'})
# Drop any connection left over from an earlier run of this cell before
# opening a new one.
try:
    connection.close()
    del engine
except Exception:
    # Best effort: on a fresh kernel these names do not exist yet, and a
    # failure to close a stale connection should not block reconnecting.
    # Unlike a bare except, this does not swallow KeyboardInterrupt.
    pass
engine, connection = db_setup(creds['user'],creds['password'])


In [56]:
# SQL statements that create the project database and its three tables.
# Each statement is assembled from adjacent string literals, so the final
# text is a single line with no embedded newlines.
makeDB = "CREATE DATABASE wang_j2;"

# One row per county, keyed by CountyID.
makeCountyPopulations = (
    "CREATE TABLE CountyPopulations("
    "CountyID INT NOT NULL,"
    "CountyName VARCHAR(30),"
    "StateID INT,"
    "Population INT,"
    "PRIMARY KEY(CountyID)"
    ");"
)

# Lookup table from numeric StateID to state label.
makeStates = (
    "CREATE TABLE States("
    "StateID INT NOT NULL,"
    "State VARCHAR(30),"
    "PRIMARY KEY(StateID)"
    ");"
)

# One row per member, keyed by MemberID.
makeUsers = (
    "CREATE TABLE Users("
    "MemberID INT NOT NULL,"
    "Nickname VARCHAR(30),"
    "StateID INT,"
    "UserCity VARCHAR(30),"
    "UserLongitude DECIMAL(5, 2),"
    "UserLatitude DECIMAL(5, 2),"
    "PRIMARY KEY(MemberID)"
    ");"
)

In [57]:
# connection.execute("commit;")
# connection.execute(makeDB)
# connection.execute("USE wang_j2;")
# connection.execute(makeCountyPopulations)
# connection.execute(makeStates)
# connection.execute(makeUsers)

<sqlalchemy.engine.result.ResultProxy at 0x171a61227f0>

In [58]:
# INSERT statement templates, one per table.  Each {} placeholder is
# filled by str.format with a value already rendered as SQL text.
countyRowTemplate = (
    "INSERT INTO CountyPopulations"
    "(CountyID, CountyName, StateID, Population)"
    "VALUES({},{},{},{});"
)

stateRowTemplate = (
    "INSERT INTO States"
    "(StateID, State)"
    "VALUES({},{});"
)

usersRowTemplate = (
    "INSERT INTO Users"
    "(MemberID, Nickname, StateID, UserCity, UserLongitude, UserLatitude)"
    "VALUES({},{},{},{},{},{});"
)

# for i in range(len(popTable)):
#     countyRow = countyRowTemplate.format(popTable.iloc[i, 0], "'"+popTable.iloc[i, 1].replace("'", "")+"'", popTable.iloc[i, 2], popTable.iloc[i, 3])
#     connection.execute(countyRow)

# StateList = list(stateMap.keys())
# StateIDList = list(stateMap.values())
# for i in range(len(StateList)):
#     stateRow = stateRowTemplate.format(StateIDList[i], "'"+StateList[i]+"'")
#     connection.execute(stateRow)

# for i in range(len(df)):
#     if i % 300 == 0:
#         print(i)
#     usersRow = usersRowTemplate.format(df.iloc[i,1], "'"+df.iloc[i, 0]+"'", df.iloc[i, 2], "'"+df.iloc[i, 3]+"'", df.iloc[i, 4], df.iloc[i, 5])
#     connection.execute(usersRow)


0
300
600
900
1200
1500
1800
2100
2400
2700
3000
3300
3600
3900
4200
4500
4800
5100
5400
5700
6000
6300
6600
6900
7200
7500
7800
8100
8400
8700
9000
9300
9600
9900
10200
10500
10800
11100
11400


In [59]:
# Close the database connection if one is open and report the outcome.
try:
    connection.close()
    del engine
    print("Connection closed")
except Exception:
    # Reached when connection/engine are undefined or closing fails.
    # Unlike a bare except, this does not swallow KeyboardInterrupt.
    print("No connection")

Connection closed
