In [90]:
import pandas as pd
import re
import json
from shapely.geometry import Point, shape

In [91]:
# Number of rows drawn when the cleaned table is randomly sampled
# (passed as `n` to DataFrame.sample further down in the notebook).
n_samples = 10000

In [92]:
def get_age_group(gender_age):
    """Return ``gender_age`` with every "M" and "F" character removed.

    For a label such as "M32-38" this leaves the age bracket "32-38".
    """
    gender_markers = re.compile(r"[MF]")
    return gender_markers.sub("", gender_age)

def get_location(longitude, latitude, provinces_json):
    """Name the first feature in ``provinces_json`` whose geometry contains the point.

    Args:
        longitude: x coordinate handed to ``Point``.
        latitude: y coordinate handed to ``Point``.
        provinces_json: mapping with a "features" list; each feature supplies a
            "geometry" entry accepted by ``shape`` and, when it matches, a
            ``["properties"]["name"]`` value.

    Returns:
        The "name" property of the first feature (in list order) for which
        ``contains`` is true, or "Other" when no feature matches.
    """
    location = Point(longitude, latitude)
    matching_names = (
        feature["properties"]["name"]
        for feature in provinces_json["features"]
        # The geometry object is rebuilt from the raw mapping on every call.
        if shape(feature["geometry"]).contains(location)
    )
    # The generator is lazy, so scanning stops at the first containing feature.
    return next(matching_names, "Other")

In [93]:
# Read the province boundary GeoJSON once; the parsed mapping is what
# get_location() receives as its `provinces_json` argument.
with open("geojson/china_provinces_en.json") as data_file:
    raw_geojson = data_file.read()
    provinces_json = json.loads(raw_geojson)

In [94]:
# Raw input tables, each read from a CSV file in the working directory.
# All three carry a `device_id` column, which is the join key used later on.
gen_age = pd.read_csv(filepath_or_buffer="gender_age_train.csv")
events = pd.read_csv(filepath_or_buffer="events.csv")
phone = pd.read_csv(filepath_or_buffer="phone_brand_device_model.csv")

In [95]:
# Preview the first five rows of the gender/age table.
gen_age.head(n=5)

Unnamed: 0,device_id,gender,age,group
0,-8076087639492063270,M,35,M32-38
1,-2897161552818060146,M,35,M32-38
2,-8260683887967679142,M,35,M32-38
3,-4938849341048082022,M,30,M29-31
4,245133531816851882,M,30,M29-31


In [96]:
# Preview the first five rows of the events table.
events.head(n=5)

Unnamed: 0,event_id,device_id,timestamp,longitude,latitude
0,1,29182687948017175,2016-05-01 00:55:25,121.38,31.24
1,2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97
2,3,-4833982096941402721,2016-05-01 00:08:05,106.6,29.7
3,4,-6815121365017318426,2016-05-01 00:06:40,104.27,23.28
4,5,-5373797595892518570,2016-05-01 00:07:18,115.88,28.66


In [97]:
# Preview the first five rows of the phone brand/model table.
phone.head(n=5)

Unnamed: 0,device_id,phone_brand,device_model
0,-8890648629457979026,小米,红米
1,1277779817574759137,小米,MI 2
2,5137427614288105724,三星,Galaxy S4
3,3669464369358936369,SUGAR,时尚手机
4,-5019277647504317457,三星,Galaxy Note 2


In [98]:
%%time
    df = gen_age.merge(events, how="left", on="device_id")
    df = df.merge(phone, how="left", on="device_id")
    df.drop(["device_id", "age", "event_id", "device_model"], axis=1, inplace=True)
    df_clean = df[df["longitude"] != 0].dropna()
    df2 = df_clean.sample(n=n_samples)

top_brands = {"华为":"Huawei", "小米":"Xiaomi", "三星":"Samsung", "vivo":"vivo", "OPPO":"OPPO",
            "魅族":"Meizu", "酷派":"Coolpad", "乐视":"LeEco", "联想":"Lenovo", "HTC":"HTC"}

    df2["phone_brand"] = df2["phone_brand"].apply(lambda i: top_brands[i] if (i in top_brands) else "Other")
    df2["group"] = df2["group"].apply(lambda i: get_age_group(i))
    df2["location"] = df2.apply(lambda row: get_location(row["longitude"], row["latitude"], provinces_json), axis=1)

CPU times: user 1min 4s, sys: 443 ms, total: 1min 5s
Wall time: 1min 5s
