In [43]:
import json
import pandas as pd
import ast
import datetime

In [47]:
def convert_unixtime(utime):
    """Format a Unix timestamp as a zero-padded 'DD-MM' string.

    `utime` is coerced with int(), so numeric strings and floats are accepted
    (floats are truncated).
    NOTE(review): fromtimestamp() uses the machine's local timezone, so a
    timestamp close to midnight can map to a different day on another host —
    confirm whether UTC is intended.
    """
    seconds = int(utime)
    moment = datetime.datetime.fromtimestamp(seconds)
    return moment.strftime('%d-%m')

def convert_country_code(code):
    """Return the country name for one of the two-letter codes known to this notebook.

    Raises KeyError for any code that is not in the table below.
    """
    names_by_code = {
        "CO": "Colombia",
        "BR": "Brazil",
        "VE": "Venezuela",
        "PE": "Peru",
        "PY": "Paraguay",
        "PA": "Panama",
        "CL": "Chile",
    }
    return names_by_code[code]
    

In [87]:
# Snapshot to analyse. Other snapshots of the same collection that can be swapped in:
#   collections/colombia/dataframe_collected_finished_1527055300.csv  - collected on 23 May, cities and state level
#   collections/colombia/dataframe_collected_finished_1526754549.csv  - collected on 19 May, cities and state
#   collections/colombia/dataframe_collected_finished_1527262566.csv  - from May 25th
infile = "collections/colombia/dataframe_collected_finished_1527086071.csv" # From May 23 colombian state

df = pd.read_csv(infile)

# Day-month label of each row's collection time, derived from its Unix "timestamp".
df["CollectionDay"] = df["timestamp"].apply(convert_unixtime)

# Report the collection day of the row labelled 0 as a quick sanity check.
print("Collection made in %s" % (df["CollectionDay"].head()[0]))

Collection made in 23-05


In [93]:
def extract_relationship(d):
    """Map a collection of relationship-status codes to a label.

    Codes are checked in priority order 1, 2, 3; the first one contained in
    `d` decides the result. Returns None when none of them is present.
    """
    for code, label in ((1, "single"), (2, "dating"), (3, "married")):
        if code in d:
            return label
    return None

def extract_education(d):
    """Map a list of education-status codes to a label.

    Only exact matches (same codes, same order) against one of the known
    code lists are recognised; anything else yields None.
    """
    known_groups = (
        ([3, 7, 8, 9, 11], "graduated"),
        ([1, 12, 13], "no_degree"),
        ([2, 4, 5, 6, 10], "high_school"),
        ([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "all"),
    )
    for codes, label in known_groups:
        if d == codes:
            return label
    return None

def extract_group(d):
    """Label a behaviors list by the "id" of its first entry.

    Two specific ids are recognised; every other id is labelled "All".
    Only d[0] is inspected.
    """
    behavior_id = d[0]["id"]
    if behavior_id == 6026404871583:
        return "Expats (Venezuela)"
    if behavior_id == 6015559470583:
        return "Ex-pats (All)"
    return "All"

def agebuckets(minage, maxage):
    """Name the age bucket described by a (minage, maxage) pair.

    Parameters
    ----------
    minage : number
        Lower age bound.
    maxage : number, None or NaN
        Upper age bound; None/NaN means the range is open-ended.

    Returns
    -------
    str
        "all" (13+), "adolecent" (13-18), "young_adult" (19-25),
        "adult" (26-40), "middle_age" (41-65), "elder" (65+), or
        "undefined" for any other combination.
    """
    # pd.isna covers both None and float NaN and needs no extra import:
    # pandas is already imported by this notebook, numpy is not.
    open_ended = pd.isna(maxage)

    if minage == 13 and open_ended:
        return "all"
    elif minage == 13 and maxage == 18:
        return "adolecent"  # spelling kept: other cells filter on this exact label
    elif minage == 19 and maxage == 25:
        return "young_adult"
    elif minage == 26 and maxage == 40:
        return "adult"
    elif minage == 41 and maxage == 65:
        return "middle_age"
    elif minage == 65 and open_ended:
        return "elder"
    return "undefined"
    
def expand(row):
    """Flatten one parsed targeting spec into a 9-tuple.

    Returns (age_min, age_max, place, location level, location type, gender,
    relationship, education, group). `place` and the location level stay None
    when the spec names no regions, countries or cities; relationship,
    education and group stay None when no flexible_spec entry carries them.
    """
    geo = row["geo_locations"]

    # Only the first region / country / city is used; regions take precedence
    # over countries, and countries over cities.
    if "regions" in geo:
        region = geo["regions"][0]
        place = "%s, %s" % (region["name"], convert_country_code(region["country_code"]))
        loc_dimension = "State"
    elif "countries" in geo:
        place = convert_country_code(geo["countries"][0])
        loc_dimension = "Country"
    elif "cities" in geo:
        city = geo["cities"][0]
        place = "%s, %s, %s" % (city["name"], city["region"], convert_country_code(city["country"]))
        loc_dimension = "City"
    else:
        place = None
        loc_dimension = None

    loctype = "_".join(geo["location_types"])

    # Each flexible_spec entry contributes at most one attribute; when several
    # entries carry the same attribute, the last one wins.
    relationship = education = group = None
    for dimension in row["flexible_spec"]:
        if "relationship_statuses" in dimension:
            relationship = extract_relationship(dimension["relationship_statuses"])
        elif "education_statuses" in dimension:
            education = extract_education(dimension["education_statuses"])
        elif "behaviors" in dimension:
            group = extract_group(dimension["behaviors"])

    # Gender code: 0 -> both, 1 -> man, anything else -> woman.
    gender_code = row["genders"][0]
    if gender_code == 0:
        gender = "both"
    elif gender_code == 1:
        gender = "man"
    else:
        gender = "woman"

    return (
        row["age_min"],
        row["age_max"],
        place,
        loc_dimension,
        loctype,
        gender,
        relationship,
        education,
        group,
    )

def get_item(x):
    """Return x["name"], or None when `x` is falsy (None, empty list, empty dict, ...)."""
    return x["name"] if x else None

# Parse each row's "targeting" string with ast.literal_eval, flatten it with expand(),
# and spread the resulting 9-tuple over nine new columns (same order as expand()'s return value).
# NOTE(review): "LocationHierarch" and "Educaton" look like typos, but they are runtime column
# names ("LocationHierarch" is selected again when the final table is built) — rename with care.
df[["MinAge","MaxAge","Location","LocationHierarch","LocationType","Gender","Relationship","Educaton","Group"]] = df["targeting"].apply(lambda x : expand(ast.literal_eval(x))).apply(pd.Series)

# If the collection has a "citizenship" column, it overrides the Group value derived above.
# Missing cells are read as "[]" so that get_item() returns None for them.
if "citizenship" in df:
    df["Group"] = df["citizenship"].fillna("[]").apply(lambda x : get_item(ast.literal_eval(x)))
    print("Updating Group information...")
    
# Same treatment for an optional "access_device" column, stored as a new "Device" column.
if "access_device" in df:
    df["Device"] = df["access_device"].fillna("[]").apply(lambda x : get_item(ast.literal_eval(x)))
    print("Adding information regarding devices...")
    
# Label every row with its age bucket, computed row by row from MinAge/MaxAge.
df["agebucket"] = df[["MinAge","MaxAge"]].apply(lambda x: agebuckets(x["MinAge"], x["MaxAge"]), axis=1)

# Brief description: 
# -----------------
# Gender: 0 Both, 1 Man, 2 Woman
# Age: 13-Null, 13-18, 19-25, 26-40, 41-65, 65+
# LocationType: home_recent, home, recent


In [100]:
def cut(df, col, values, savedcols=None):
    """Pivot `df` wide: one row per Location, one set of columns per value of `col`.

    For each entry of `values`, the rows where df[col] equals that entry are
    selected, every kept column except "Location" is renamed to
    "<column>_<value>", and the slices are inner-joined on "Location".

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; must contain `col` and every column in `savedcols`.
    col : str
        Column whose values define the slices.
    values : sequence
        Values of `col` to keep, one slice each. Output columns follow this order.
    savedcols : list of str, optional
        Columns carried into the result; must include "Location".
        Defaults to ['Location', 'dau_audience', 'mau_audience'].

    Returns
    -------
    pandas.DataFrame
        "Location" plus the suffixed columns of every slice. Locations missing
        from any slice are dropped (inner join).

    Raises
    ------
    ValueError
        If `values` is empty.
    """
    if savedcols is None:
        savedcols = ['Location', 'dau_audience', 'mau_audience']
    if len(values) == 0:
        raise ValueError("cut() needs at least one value to slice on")

    merged = None
    for value in values:
        # Suffix the columns up front instead of relying on merge's `suffixes`,
        # which only touches names that collide at merge time and therefore
        # behaves differently for the third and later slices.
        renames = {
            name: "%s_%s" % (name, value)
            for name in savedcols
            if name != "Location"
        }
        piece = df.loc[df[col] == value, savedcols].rename(columns=renames)
        if merged is None:
            merged = piece
        else:
            merged = pd.merge(merged, piece, on=["Location"])
    return merged

def copy_rename(df, oldname, newname):
    """Return an independent copy of `df` with column `oldname` renamed to `newname`.

    The input frame is left untouched.
    """
    renamed = df.rename(columns={oldname: newname})
    return renamed.copy()

def get_slice(dfin, col, values, frequency="mau"):
    """Build the per-Location wide table of audiences for the given `values` of `col`.

    The "<frequency>_audience" column is exposed as "audience", cut() spreads
    it over one column per value, and a "Frequency" column is added:
    "Daily" when frequency == "dau", "Monthly" for anything else.
    """
    source_column = '%s_audience' % (frequency)
    renamed = copy_rename(dfin, source_column, "audience")
    sliced = cut(renamed, col, values, savedcols=['Location', 'audience'])
    if frequency == "dau":
        sliced["Frequency"] = "Daily"
    else:
        sliced["Frequency"] = "Monthly"
    return sliced


In [95]:
# Gender breakdown: Venezuelan expats, all ages, "home_recent" locations,
# one audience column per gender.
gender_values = ["man", "woman"]
dfcut = df[
    (df["agebucket"] == "all")
    & (df["Group"] == "Expats (Venezuela)")
    & (df["Gender"].isin(gender_values))
    & (df["LocationType"] == "home_recent")
].copy()
dfgender = {
    frequency: get_slice(dfcut, "Gender", gender_values, frequency=frequency)
    for frequency in ("mau", "dau")
}

In [101]:
# Age breakdown: Venezuelan expats, both genders, "home_recent" locations,
# one audience column per age bucket.
age_values = ["adolecent", "young_adult", "adult", "middle_age", "elder"]
dfcut = df[
    (df["agebucket"].isin(age_values))
    & (df["Group"] == "Expats (Venezuela)")
    & (df["Gender"] == "both")
    & (df["LocationType"] == "home_recent")
].copy()
# TODO (note carried over from the author): "Adult is missing...need to check why."
dfage = {
    frequency: get_slice(dfcut, "agebucket", age_values, frequency=frequency)
    for frequency in ("mau", "dau")
}

In [102]:
# Location-type breakdown: Venezuelan expats, all ages, both genders,
# one audience column for "home" and one for "recent".
loctype_values = ["home", "recent"]
dfcut = df[
    (df["agebucket"] == "all")
    & (df["Group"] == "Expats (Venezuela)")
    & (df["Gender"] == "both")
    & (df["LocationType"].isin(loctype_values))
]
dfloctype = {
    frequency: get_slice(dfcut, "LocationType", loctype_values, frequency=frequency)
    for frequency in ("mau", "dau")
}

In [109]:
# Remove exact duplicate rows from every breakdown table, daily and monthly alike.
for dfopt in (dfgender, dfage, dfloctype):
    for frequency in ("dau", "mau"):
        dfopt[frequency] = dfopt[frequency].drop_duplicates()

In [130]:
# Join the three breakdowns (gender, location type, age) on their shared columns,
# once per frequency, then stack the monthly table on top of the daily one.
merged = {
    frequency: pd.merge(dfgender[frequency], dfloctype[frequency]).merge(dfage[frequency])
    for frequency in ("mau", "dau")
}

concated = pd.concat([merged["mau"], merged["dau"]])

# Reference rows: ages 13+ (no upper bound), both genders, "home_recent" locations.
# The three reference populations differ only in their Group.
baseline = (
    (df["MinAge"] == 13)
    & (df["MaxAge"].isnull())
    & (df["Gender"] == "both")
    & (df["LocationType"] == "home_recent")
)
venezuelans = df[baseline & (df["Group"] == "Expats (Venezuela)")]
allmigrants = df[baseline & (df["Group"] == "Ex-pats (All)")]
allpopulation = df[baseline & (df["Group"].isnull())]

# Attach each reference population as a single column: the monthly audience on
# "Monthly" rows, the daily audience otherwise. The raw audience columns are
# dropped again so the next merge joins on the shared location columns only.
reference_cols = ["dau_audience", "mau_audience", "Location", "LocationHierarch"]
dffinal = concated
for label, reference in (
    ("venezuelans", venezuelans),
    ("allmigrants", allmigrants),
    ("allpopulation", allpopulation),
):
    dffinal = pd.merge(reference[reference_cols], dffinal)
    dffinal[label] = dffinal["mau_audience"].where(dffinal["Frequency"] == "Monthly", dffinal["dau_audience"])
    del dffinal["mau_audience"]
    del dffinal["dau_audience"]

# Share of Venezuelans among all migrants and among the whole population,
# stored as text with four decimals.
dffinal["ven/migrants"] = dffinal["venezuelans"] / dffinal["allmigrants"]
dffinal["ven/pop"] = dffinal["venezuelans"] / dffinal["allpopulation"]

for trans_col in ["ven/migrants", "ven/pop"]:
    dffinal[trans_col] = dffinal[trans_col].apply(lambda x: "%.4f" % x)

dffinal.to_csv("merged.csv", index=False)