# Joint notebook - kaggle phone project

First we import the necessary libraries:

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns

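Here we import the data from the appropriate folder. The folder differs between the two members of the team, so the loading code is wrapped in a try block: if the files are not found in the first location, we fall back to the second one. Note that the fallback only reads the first 1,000,000 rows of `app_events` and `events`, so results can differ depending on which location was used.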

In [4]:
# Candidate data locations, tried in order, as (directory, row limit) pairs.
# The two team members keep the CSV files in different folders. The second
# layout only reads the first 1,000,000 rows of the two largest tables
# (app_events and events); a row limit of None reads the whole file.
DATA_SOURCES = [
    ('../talkingdata_DATA', None),
    ('data', 1000000),
]

for data_dir, row_limit in DATA_SOURCES:
    try:
        app_events = pd.read_csv(data_dir + '/app_events.csv', nrows=row_limit)
        brand_df = pd.read_csv(data_dir + '/phone_brand_device_model.csv')
        label_categories = pd.read_csv(data_dir + '/label_categories.csv')
        app_labels = pd.read_csv(data_dir + '/app_labels.csv')
        events = pd.read_csv(data_dir + '/events.csv', nrows=row_limit)
        gender_age_train = pd.read_csv(data_dir + '/gender_age_train.csv')
        gender_age_test = pd.read_csv(data_dir + '/gender_age_test.csv')
    except FileNotFoundError:
        # This layout is not present on this machine; try the next candidate.
        continue
    break
else:
    # No candidate directory contained the full set of files: fail with a
    # message that names the locations that were tried.
    raise FileNotFoundError(
        'Could not find the data files in any of: '
        + ', '.join(directory for directory, _ in DATA_SOURCES)
    )

To get a feel for how the data is structured, we show the first rows of each dataframe.

In [5]:
# The target we want to predict is `group`, which makes this a multiclass
# classification problem.
display(gender_age_train.head(5))

Unnamed: 0,device_id,gender,age,group
0,-8076087639492063270,M,35,M32-38
1,-2897161552818060146,M,35,M32-38
2,-8260683887967679142,M,35,M32-38
3,-4938849341048082022,M,30,M29-31
4,245133531816851882,M,30,M29-31


In [6]:
# `event_id` in this table corresponds to `event_id` in the `events` dataframe.
display(app_events.head(5))

Unnamed: 0,event_id,app_id,is_installed,is_active
0,2,5927333115845830913,1,1
1,2,-5720078949152207372,1,0
2,2,-1633887856876571208,1,0
3,2,-653184325010919369,1,1
4,2,8693964245073640147,1,1


In [7]:
# First rows of the event log, followed by the device brand/model table.
display(events.head(), brand_df.head())

Unnamed: 0,event_id,device_id,timestamp,longitude,latitude
0,1,29182687948017175,2016-05-01 00:55:25,121.38,31.24
1,2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97
2,3,-4833982096941402721,2016-05-01 00:08:05,106.6,29.7
3,4,-6815121365017318426,2016-05-01 00:06:40,104.27,23.28
4,5,-5373797595892518570,2016-05-01 00:07:18,115.88,28.66


Unnamed: 0,device_id,phone_brand,device_model
0,-8890648629457979026,小米,红米
1,1277779817574759137,小米,MI 2
2,5137427614288105724,三星,Galaxy S4
3,3669464369358936369,SUGAR,时尚手机
4,-5019277647504317457,三星,Galaxy Note 2


### Clean app_events and events dataframes: 

In [8]:
# Convert the timestamp column from strings to datetime values.
# pd.to_datetime is applied to the whole column at once: going through
# Series.map(pd.to_datetime) calls the parser separately for every row, which
# is far slower on a table of this size.
events['timestamp'] = pd.to_datetime(events['timestamp'])

### Clean phone brand dataframe (brand_df)

In [9]:
# Snapshot of the brand dataframe before any cleanup is applied.
display(brand_df.head(5))

Unnamed: 0,device_id,phone_brand,device_model
0,-8890648629457979026,小米,红米
1,1277779817574759137,小米,MI 2
2,5137427614288105724,三星,Galaxy S4
3,3669464369358936369,SUGAR,时尚手机
4,-5019277647504317457,三星,Galaxy Note 2


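As part of the cleanup we build a lookup table that assigns a numeric id to every distinct phone brand. We keep only the `phone_brand` column and drop duplicate brand names. The first `reset_index(drop=True)` throws away the old row labels (which have gaps after dropping duplicates) and renumbers the remaining rows 0, 1, 2, ...; the second `reset_index()` turns that new numbering into a regular column called `index`, which we then rename to `phone_brand_id`.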

In [10]:
# Lookup table (a DataFrame) pairing every distinct phone brand with an integer id.
brand_categories = (
    brand_df[['phone_brand']]
    .drop_duplicates()                            # one row per distinct brand
    .reset_index(drop=True)                       # renumber rows 0..n-1, discarding old labels
    .reset_index()                                # expose that numbering as a column named 'index'
    .rename(columns={'index': 'phone_brand_id'})  # give the id column its final name
)

# Creating a reference table that gathers the information from the multiple tables

Merge the tables into one general table so we can access the data more easily

In [13]:
# Column order of the final reference table.
ref_columns = [
    "event_id", "app_id", "label_id", "category",
    "device_id", "phone_brand", "device_model",
    "timestamp", "longitude", "latitude", "is_active",
]

# Join everything onto app_events, one table at a time:
#   app_events -> app_labels (label ids per app) -> label_categories (label names)
#   -> events (device, time, location per event) -> brand_df (brand/model per device)
complete_ref_table = (
    app_events
    .merge(app_labels, on="app_id")
    .merge(label_categories, on="label_id")
    .drop(columns="is_installed")  # judged by the authors to carry no information
    .merge(events, on="event_id")
    .merge(brand_df, on="device_id")
)[ref_columns]

display(complete_ref_table.head())

Unnamed: 0,event_id,app_id,label_id,category,device_id,phone_brand,device_model,timestamp,longitude,latitude,is_active
0,2,5927333115845830913,549,Property Industry 1.0,-6401643145415154744,三星,Galaxy Grand Prime,2016-05-01 00:54:12,103.65,30.97,1
1,2,8693964245073640147,549,Property Industry 1.0,-6401643145415154744,三星,Galaxy Grand Prime,2016-05-01 00:54:12,103.65,30.97,1
2,2,4775896950989639373,549,Property Industry 1.0,-6401643145415154744,三星,Galaxy Grand Prime,2016-05-01 00:54:12,103.65,30.97,1
3,2,-8022267440849930066,549,Property Industry 1.0,-6401643145415154744,三星,Galaxy Grand Prime,2016-05-01 00:54:12,103.65,30.97,0
4,2,9112463267739110219,549,Property Industry 1.0,-6401643145415154744,三星,Galaxy Grand Prime,2016-05-01 00:54:12,103.65,30.97,0


# Creating training set

Once we have this reference table, we can build our training set (and a held-out test set) by assigning to every entry of that table the corresponding gender, age and age group of the device's user.

In [14]:
# Build training and test sets.
MAX_TRAIN_ROWS = 3500000  # upper bound on the number of training rows
TEST_ROWS = 200000        # number of rows held out at the end of the table

# Attach gender / age / group to every reference row. The merge is done once
# and reused for both sets.
labelled_table = complete_ref_table.merge(gender_age_train, on="device_id")

# The test set is the last TEST_ROWS rows and the training set is drawn only
# from the rows before it. Taking head(MAX_TRAIN_ROWS) and tail(TEST_ROWS)
# independently makes the two sets overlap whenever the table has fewer than
# MAX_TRAIN_ROWS + TEST_ROWS rows, which leaks test rows into training.
split_at = max(len(labelled_table) - TEST_ROWS, 0)
train_set = labelled_table.iloc[:split_at].head(MAX_TRAIN_ROWS)
test_set = labelled_table.iloc[split_at:]

# Both sets are contiguous, non-overlapping slices of the same table.
assert len(train_set) + len(test_set) <= len(labelled_table), "train/test overlap"

# NOTE(review): the split is by row, so rows of the same device_id can still
# end up on both sides of the boundary - consider splitting by device instead.
test_set.head()

Unnamed: 0,event_id,app_id,label_id,category,device_id,phone_brand,device_model,timestamp,longitude,latitude,is_active,gender,age,group
2294132,69857,33792862810792679,252,Wealth Management,-8649941211508968787,小米,红米,2016-05-04 00:20:57,0.0,0.0,1,M,62,M39+
2294133,69857,3683147815759994238,252,Wealth Management,-8649941211508968787,小米,红米,2016-05-04 00:20:57,0.0,0.0,1,M,62,M39+
2294134,69857,6965654211116534216,252,Wealth Management,-8649941211508968787,小米,红米,2016-05-04 00:20:57,0.0,0.0,0,M,62,M39+
2294135,69857,6965654211116534216,761,Financial Services,-8649941211508968787,小米,红米,2016-05-04 00:20:57,0.0,0.0,0,M,62,M39+
2294136,69857,502665604573257504,761,Financial Services,-8649941211508968787,小米,红米,2016-05-04 00:20:57,0.0,0.0,1,M,62,M39+
