# Matching DC population data with users of the Meetup app

## Introduction






Introduction
- context
- - distribution of population in DC area
- - what is the Meetup app?
- thesis
- - question: is Meetup data representative of the underlying population? (null hypothesis: yes)
Methods
- explain data source(s)
- - API
- - web scrape from wiki
- database
- - table design (cols, why three?)
- queries
- - annotate queries, what purpose does each serve?
- made data frames for vis
Analysis
- initial impression of data
- problems discovered
- graphs
- chisq test?
Results
- interpret statistical tests
- draw conclusions about DC inhabitants
Future Thoughts
- other cities?
- meetups hosted at tech companies (impact)
- locality > interest ?

## Methods

In [1]:
import meetup.api
import json 
import requests 
import time 
import codecs
import sys
import io
import re
import pandas as pd
import sqlalchemy as sa
import string

In [5]:
def getCreds(filename,subset,defaults=None):
    '''
    This function helps us connect to a database on hadoop2.  The
    userid password information is stored in a local file encoded
    in json format.
    Parameters:
    filename is where json encoded userid/password information stored
    subset: dictionary key in json file
    defaults: value returned when the file cannot be read or parsed, or
              when subset is missing or empty; a new empty dictionary
              is used when omitted
    Returns: subdictionary based on subset parameter, or defaults.
    '''
    if defaults is None:
        # Build the fallback per call so callers never share (and mutate)
        # one dictionary created when the function was defined.
        defaults = {}
    try:
        # The with-block closes the file; no explicit close() is needed.
        with open(filename,'r') as file:
            D = json.load(file)
    except (OSError, ValueError, TypeError):
        # Missing/unreadable file, an unusable filename, or contents that
        # are not valid JSON (json.JSONDecodeError is a ValueError).
        return defaults
    try:
        section = D[subset]
    except (KeyError, IndexError, TypeError):
        # subset is absent, or the top-level JSON value is not a mapping.
        return defaults
    # An empty/falsy section is treated like a missing one.
    return section if section else defaults

def db_setup(userid,password):
    '''
    This function connects us to the MySQL server on hadoop2 using our
    login credentials, without selecting a database.
    Parameters:
    userid - string with our userid
    password - string with our password
    Returns:
    SQL engine and SQL connection objects
    '''
    # Imported locally so this function carries its own dependency.
    from urllib.parse import quote

    # Percent-encode the credentials: characters such as '@', ':' or '/'
    # would otherwise be read as URL delimiters when the string is parsed.
    template = 'mysql+mysqlconnector://{}:{}@hadoop2.mathsci.denison.edu/'
    cstring = template.format(quote(str(userid), safe=''),
                              quote(str(password), safe=''))
    engine = sa.create_engine(cstring)
    connection = engine.connect()

    return engine, connection

def db_setup2(userid,password,database):
    '''
    This function connects us to a database using our login credentials.
    Parameters:
    userid - string with our userid
    password - string with our password
    database - string with database name
    Returns:
    SQL engine and SQL connection objects
    '''
    # Imported locally so this function carries its own dependency.
    from urllib.parse import quote

    # Percent-encode the credentials: characters such as '@', ':' or '/'
    # would otherwise be read as URL delimiters when the string is parsed.
    template = 'mysql+mysqlconnector://{}:{}@hadoop2.mathsci.denison.edu/{}'
    cstring = template.format(quote(str(userid), safe=''),
                              quote(str(password), safe=''),
                              database)
    engine = sa.create_engine(cstring)
    connection = engine.connect()

    return engine, connection

def scrapeData(protocol, host, resource, timeout=30):
    '''
    This function takes elements of a URL and returns the raw text from
    the webpage.
    Parameters:
    protocol, host, resource - joined as "protocol://hostresource"
    timeout - seconds to wait for the server before giving up, so a
              stalled connection cannot hang the notebook indefinitely
    Returns: response text when the status code is 200; otherwise prints
    "Error" and returns None
    '''
    template = "{}://{}{}"
    url = template.format(protocol, host, resource)
    resp = requests.get(url, timeout=timeout)
    if resp.status_code == 200:
        return resp.text
    print("Error")
    # Made explicit: callers receive None for any non-200 response.
    return None

def extractSectionBetween(txt, start, end):
    '''
    Slices text by substring boundaries rather than index boundaries.
    The result begins with the first occurrence of start and stops just
    before the first occurrence of end that follows it.
    Parameters: text to slice, start string, end string
    Returns: trimmed string; "" when start does not occur, and everything
    from start to the end of the text when end does not occur after it
    '''
    startInd = txt.find(start)
    if startInd == -1:
        # find() signals "not found" with -1, which must not be used as
        # a slice index (it would silently mean "last character").
        return ""
    # Look for the end marker only after the start marker, so an earlier
    # occurrence of end cannot produce an empty or reversed slice.
    endInd = txt.find(end, startInd + len(start))
    if endInd == -1:
        return txt[startInd:]
    return txt[startInd:endInd]

def recreateTable(txt):
    '''
    Pulls the (county, population) rows out of raw HTML table text and
    returns them as a pandas data frame.
    Parameters: html text
    Returns: pandas data frame with columns CountyID (1, 2, 3, ... in
    order of appearance), County, State and Population
    '''
    # One row = a linked "County, State" cell followed by a population cell.
    pat = re.compile(r"<td><a .*?>(?P<county>.*?)<\/a>\n<\/td>\n<td .*?>(?P<pop>.*?)\n<\/td>")
    records = [
        (
            number,
            # Text before the first comma is the county, after it the state.
            found["county"].split(",")[0].strip(),
            found["county"].split(",")[1].strip(),
            # Populations are written with thousands separators.
            int(found["pop"].replace(",", "")),
        )
        for number, found in enumerate(pat.finditer(txt), start=1)
    ]
    # Transpose the row tuples into one list per column.
    if records:
        countyid, county, state, pop = (list(column) for column in zip(*records))
    else:
        countyid, county, state, pop = [], [], [], []
    return pd.DataFrame({"CountyID":countyid, "County":county, "State":state, "Population":pop})

def makeDataFrame(name, uid, state, city, lon, lat):
    '''
    Cleans the member lists in place and packs them into a data frame.
    Each state entry is replaced by its id from StateMap (the id stored
    under 'Skyrim' when the state is not a key of StateMap).  Each name
    containing a character that is not in string.printable, or that is
    in string.punctuation, is replaced by 'badName'.
    Parameters: six parallel lists (name, uid, state, city, lon, lat);
    name and state are modified in place
    Returns: pandas data frame with columns name, uid, state, city, lon, lat
    '''
    for pos, nickname in enumerate(name):
        code = state[pos]
        state[pos] = StateMap[code] if code in StateMap else StateMap['Skyrim']
        if any((ch not in string.printable) or (ch in string.punctuation) for ch in nickname):
            name[pos] = 'badName'
    columns = {"name":name, "uid":uid, "state":state, "city":city, "lon":lon, "lat":lat}
    return pd.DataFrame(columns)

def superFetch(groupname, key):
    '''
    Downloads id, name, state, city, lon and lat for the members of a
    Meetup group and returns them as a data frame (see makeDataFrame).
    Members are requested 200 at a time; the offset is advanced by one
    after every successful request.  A member record without a name,
    city or state gets the placeholder 'badName', 'Whiterun' or 'Skyrim'.
    Parameters:
    groupname - urlname of the Meetup group
    key - Meetup API key
    Returns: pandas data frame built by makeDataFrame
    Raises: json.decoder.JSONDecodeError when the same request fails to
    decode maxRetries times in a row
    '''
    print("Start fetching...")
    client = meetup.api.Client(key)
    print('request status:',end=' ')
    group = client.GetGroup({'urlname':groupname})
    gid = group.id
    total = group.members
    want = 'id,name,state,city,lon,lat'
    limit = 200
    request = total//limit+1
    maxRetries = 10   # consecutive decode failures tolerated per request
    retryPause = 2    # seconds to wait before repeating a failed request
    name = []
    uid = []
    state = []
    city = []
    lon = []
    lat = []
    offsetValue = 0
    failures = 0
    while offsetValue < request:
        print('progress: ',offsetValue+1,'/',request, sep='', end=' ')
        print('request status:',end=' ')
        try:
            members = client.GetMembers(group_id=gid, only=want,page=limit,offset=offsetValue)
        except json.decoder.JSONDecodeError:
            failures += 1
            if failures >= maxRetries:
                # Give up instead of looping forever on a request that
                # never decodes.
                raise
            print("JSONDecodeError, retrying")
            # Pause so a struggling API is not hit in a tight loop.
            time.sleep(retryPause)
            continue
        failures = 0
        offsetValue += 1
        # NOTE(review): each result is assumed to be a plain dict (the
        # original code already called .keys() and indexed it by key).
        for j in members.results:
            uid.append(j['id'])
            lon.append(j['lon'])
            lat.append(j['lat'])
            # Optional fields fall back to the same placeholders as before.
            name.append(j.get('name', 'badName'))
            city.append(j.get('city', 'Whiterun'))
            state.append(j.get('state', 'Skyrim'))
    print("fetching complete")
    df = makeDataFrame(name, uid, state, city, lon, lat)
    return df

def createDB(db_name):
    '''
    Creates the database db_name and, inside it, the three tables
    CountyPopulations, States and Users.  All statements are sent over
    the module-level connection, starting with a "commit;"
    (presumably to close any open transaction - TODO confirm).
    Parameters: db_name - name of the database to create
    '''
    countyTable = (
        "CREATE TABLE CountyPopulations("
        "    CountyID INT NOT NULL,"
        "    CountyName VARCHAR(30),"
        "    StateID INT,"
        "    Population INT,"
        "    PRIMARY KEY(CountyID)"
        "    );"
    )
    stateTable = (
        "CREATE TABLE States("
        "    StateID INT NOT NULL,"
        "    State VARCHAR(30),"
        "    PRIMARY KEY(StateID)"
        "    );"
    )
    userTable = (
        "CREATE TABLE Users("
        "    MemberID INT NOT NULL,"
        "    Nickname VARCHAR(30),"
        "    StateID INT,"
        "    UserCity VARCHAR(30),"
        "    UserLongitude DECIMAL(5, 2),"
        "    UserLatitude DECIMAL(5, 2),"
        "    PRIMARY KEY(MemberID)"
        "    );"
    )
    # Order matters: the database must exist and be selected before the
    # tables are created in it.
    statements = [
        "commit;",
        "CREATE DATABASE "+db_name+";",
        "USE "+db_name+";",
        countyTable,
        stateTable,
        userTable,
    ]
    for statement in statements:
        connection.execute(statement)
    
def insertCounty(df):
    '''
    Inserts every row of df into CountyPopulations with one multi-row
    INSERT sent over the module-level connection.
    Parameters: df - data frame whose first four columns are, by
    position, CountyID, county name, StateID and population
    Apostrophes are removed from county names so they cannot end the
    quoted SQL literal early.  Nothing is sent when df has no rows,
    because an INSERT with an empty VALUES list is not valid SQL.
    '''
    if len(df) == 0:
        return
    valuesTemplate = '({},{},{},{})'
    countyRowTemplate = "INSERT INTO CountyPopulations(CountyID, CountyName, StateID, Population)VALUES{};"
    # Build all row tuples in one pass and join them once.
    valueRows = ",".join(
        valuesTemplate.format(row[0], "'"+row[1].replace("'", "")+"'", row[2], row[3])
        for row in df.itertuples(index=False, name=None)
    )
    connection.execute(countyRowTemplate.format(valueRows))
    
def insertUser(df):
    '''
    Inserts every row of df into Users with one multi-row INSERT sent
    over the module-level connection.
    Parameters: df - data frame whose first six columns are, by
    position, name, uid, state id, city, longitude and latitude
    Apostrophes are removed from the nickname and the city (as
    insertCounty does for county names) so they cannot end the quoted
    SQL literal early.  Nothing is sent when df has no rows, because an
    INSERT with an empty VALUES list is not valid SQL.
    '''
    if len(df) == 0:
        return
    valuesTemplate = '({},{},{},{},{},{})'
    usersRowTemplate = "INSERT INTO Users(MemberID, Nickname, StateID, UserCity, UserLongitude, UserLatitude)VALUES{};"
    # Column 1 (uid) comes first in the table, column 0 (name) second.
    valueRows = ",".join(
        valuesTemplate.format(
            row[1],
            "'"+row[0].replace("'", "")+"'",
            row[2],
            "'"+row[3].replace("'", "")+"'",
            row[4],
            row[5],
        )
        for row in df.itertuples(index=False, name=None)
    )
    connection.execute(usersRowTemplate.format(valueRows))

def insertState(StateMap):
    '''
    Writes every (StateID, State) pair of StateMap into the States table
    with one multi-row INSERT sent over the module-level connection.
    Parameters: StateMap - dictionary mapping state name to state id
    '''
    stateRowTemplate = "INSERT INTO States(StateID, State)VALUES{};"
    # The id goes first and the quoted name second, matching the column list.
    valueRows = ",".join(
        '({},{})'.format(stateId, "'"+stateName+"'")
        for stateName, stateId in StateMap.items()
    )
    connection.execute(stateRowTemplate.format(valueRows))
    
def SQLtoPandas(sqlQuery):
    '''
    Runs sqlQuery on the module-level connection and returns all
    result rows as a pandas data frame.
    Parameters: sqlQuery - SQL text to execute
    Returns: pandas data frame whose columns are named after the keys
    of the query result
    '''
    result = connection.execute(sqlQuery)
    records = result.fetchall()
    columnNames = result.keys()
    return pd.DataFrame(records, columns = columnNames)
    
# State code -> numeric StateID used in the database tables.
# 'Skyrim' (52) is the placeholder entry for states that are missing or
# not listed here.
StateMap = {
    'AL': 1, 'AK': 2, 'AZ': 3, 'AR': 4, 'CA': 5,
    'CO': 6, 'CT': 7, 'DE': 8, 'FL': 9, 'GA': 10,
    'HI': 11, 'ID': 12, 'IL': 13, 'IN': 14, 'IA': 15,
    'KS': 16, 'KY': 17, 'LA': 18, 'ME': 19, 'MD': 20,
    'MA': 21, 'MI': 22, 'MN': 23, 'MS': 24, 'MO': 25,
    'MT': 26, 'NE': 27, 'NV': 28, 'NH': 29, 'NJ': 30,
    'NM': 31, 'NY': 32, 'NC': 33, 'ND': 34, 'OH': 35,
    'OK': 36, 'OR': 37, 'PA': 38, 'RI': 39, 'SC': 40,
    'SD': 41, 'TN': 42, 'TX': 43, 'UT': 44, 'VT': 45,
    'VA': 46, 'WA': 47, 'WV': 48, 'WI': 49, 'WY': 50,
    'DC': 51, 'Skyrim': 52,
}

In [None]:
# key = '1a60703a102a6252424875415c1b21'
# userTable = superFetch('DC-Tech-Meetup', key)
# txt = scrapeData("https", "en.wikipedia.org", "/wiki/Washington_metropolitan_area")
# txt = extractSectionBetween(txt, "92.3 years", "The Washington, D.C. area has the largest science")
# popTable = recreateTable(txt)
# popTable["State"] = popTable["State"].replace("D.C.", StateMap['DC'])
# popTable["State"] = popTable["State"].replace("Maryland", StateMap['MD'])
# popTable["State"] = popTable["State"].replace("Virginia", StateMap['VA'])

In [None]:
# creds = {'user': 'brown_b1', 'password': 'brown_b1'}
# try:
#     connection.close()
#     del engine
# except:
#     pass
# engine, connection = db_setup(creds['user'],creds['password'])

In [None]:
# createDB("brown_b1")
# insertCounty(popTable)
# insertState(StateMap)
# insertUser(userTable)

In [3]:
# NOTE(review): the credentials are hard-coded here; getCreds() could load
# them from a JSON file instead.
creds = {'user': 'brown_b1', 'password': 'brown_b1'}
try:
    # Drop any connection left over from an earlier run of this cell.
    connection.close()
    del engine
except Exception:
    # Best-effort cleanup: on a first run the names do not exist yet.
    # Exception (not a bare except) lets KeyboardInterrupt through.
    pass
engine, connection = db_setup2(creds['user'],creds['password'], "brown_b1")

In [6]:
# Per state: the summed population of its rows in CountyPopulations and
# that sum as a percentage (2 decimals) of the total over all rows of
# CountyPopulations.
PopQuery = '''
SELECT 
s.State, SUM(cp.Population) AS StatePopulation2016,
ROUND(SUM(cp.Population)/(SELECT SUM(cp.Population) FROM CountyPopulations AS cp)*100, 2) AS Percentage 
FROM CountyPopulations AS cp
INNER JOIN States AS s ON s.StateID = cp.StateID
GROUP BY cp.StateID
ORDER BY cp.StateID;
'''

# Per state, restricted to VA, MD and DC: the number of users and that
# count as a percentage of ALL rows in Users.  The denominator subquery
# has no WHERE clause, so users from other states count toward it and
# the three percentages need not add up to 100.
UserPopQuery = '''
SELECT s.State, COUNT(u.MemberID) AS UserPop,
ROUND(COUNT(u.MemberID)/(SELECT COUNT(u.MemberID) FROM Users AS u)*100, 2) AS Percentage 
FROM Users AS u
INNER JOIN States AS s ON s.StateID = u.StateID
WHERE s.State IN ('VA','MD','DC')
GROUP BY u.StateID
ORDER BY u.StateID;
'''

# One row per user in VA, MD or DC: member id, longitude, latitude,
# city and state code.
UserDemographicQuery = '''
SELECT u.MemberID, u.UserLongitude, u.UserLatitude, u.UserCity, s.State
FROM Users AS u
INNER JOIN States AS s ON s.stateID = u.stateID
WHERE s.State IN ('VA','MD','DC');
'''

# Side-by-side comparison per state: population total and percentage
# (subquery "actual") next to user count and percentage (subquery "api").
# Both are inner joins, so only states present in both CountyPopulations
# and Users appear in the result.
comparisonQuery = '''
SELECT s.State, actual.pop AS StatePopulation2016,actual.percentage AS ActualPct, api.pop AS UserPop,api.percentage AS MeetUpPct FROM States as s
INNER JOIN (SELECT s.StateID,SUM(cp.Population) AS pop,ROUND(SUM(cp.Population)/(SELECT SUM(cp.Population) FROM CountyPopulations AS cp)*100, 2) 
AS Percentage FROM CountyPopulations as cp
INNER JOIN States AS s ON s.StateID = cp.StateID
GROUP BY cp.StateID
ORDER BY cp.StateID)
AS actual ON s.StateID = actual.StateID
INNER JOIN (SELECT s.StateID,COUNT(u.MemberID) AS pop,ROUND(COUNT(u.MemberID)/(SELECT COUNT(u.MemberID) FROM Users AS u)*100, 2)
AS Percentage FROM Users AS u
INNER JOIN States AS s ON s.StateID = u.StateID
GROUP BY u.StateID
ORDER BY u.StateID) 
AS api ON s.StateID = api.StateID
ORDER BY s.StateID;
'''

# Run each query and keep its result as a data frame.
actualPopDf = SQLtoPandas(PopQuery)
UserPopDf = SQLtoPandas(UserPopQuery)
UserDemographicDf = SQLtoPandas(UserDemographicQuery)
comparisonDf = SQLtoPandas(comparisonQuery)

# Reshape the wide comparison into long form: one row per (State, Type),
# once for the head counts and once for the percentages.
pop = comparisonDf.melt(id_vars='State',value_vars=['StatePopulation2016','UserPop'],var_name='Type',value_name='Population')
pct = comparisonDf.melt(id_vars='State',value_vars=['ActualPct','MeetUpPct'],var_name='Type',value_name='Percentage')

# Give both frames the same Type labels so they can be merged on them.
pop['Type'] = ['Actual' if label == 'StatePopulation2016' else 'Meetup' for label in pop['Type']]
pct['Type'] = ['Actual' if label == 'ActualPct' else 'Meetup' for label in pct['Type']]
comparisonDf = pd.merge(pop, pct, on=['State','Type'])

# Save the user frames to CSV files.
UserPopDf.to_csv('UserPopDf.csv')
UserDemographicDf.to_csv("UserDemographicDf.csv")

In [7]:
# Show the four result frames in turn.
for frame in (actualPopDf, UserPopDf, UserDemographicDf, comparisonDf):
    print(frame)

  State StatePopulation2016 Percentage
0    MD             2448459      40.30
1    VA             2945980      48.49
2    DC              681170      11.21
  State  UserPop Percentage
0    MD     3689      14.81
1    VA     5446      21.87
2    DC    12754      51.22
        MemberID UserLongitude UserLatitude       UserCity State
0           1406        -77.01        38.99    Takoma Park    MD
1          27938        -77.04        38.92     Washington    DC
2          30158        -77.19        38.77    Springfield    VA
3          41054        -77.07        38.91     Washington    DC
4          63039        -77.40        38.93        Herndon    VA
5          93409        -77.04        38.92     Washington    DC
6         100169        -77.02        38.91     Washington    DC
7         105957        -77.10        38.86      Arlington    VA
8         119753        -76.99        38.91     Washington    DC
9         127739        -77.05        38.90     Washington    DC
10        131848 

In [8]:
# Close the database connection if one is open.
try:
    connection.close()
    del engine
except Exception:
    # Reached when the names are not defined (nothing was opened) or the
    # close itself fails; Exception leaves KeyboardInterrupt alone.
    print("No connection")
else:
    print("Connection closed")

Connection closed


## Results

## Conclusion