In [1]:
import pandas as pd
import os
from pd_tools import split_train_test, get_part_data
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.metrics import log_loss
from sklearn.feature_selection import VarianceThreshold

# Using mobile device behavior data to predict users' gender and age

# split dataset

In [3]:
# Load the full labelled device table (device_id with gender, age and age/gender group).
# The drive letter is spelled out so the path is absolute, matching the other data-loading cells.
all_gender_age = pd.read_csv(r"C:\Users\akahkf\csye7245_final\dataset\Talking Data\dataset\gender_age.csv")

In [4]:
# Split the labelled devices into a train frame and a test frame using the project helper.
# NOTE(review): the split ratio and any stratification live in pd_tools.split_train_test and are
# not visible here; the per-group counts printed below are roughly 80/20 — confirm.
df_train, df_test = split_train_test(all_gender_age)

In [5]:
# Number of training rows per gender/age group (quick check of class balance).
print('data numbers in trainset：', df_train.groupby('group').size())

data numbers in trainset： group
F23-      4040
F24-26    3352
F27-28    2494
F29-32    3702
F33-42    4448
F43+      3355
M22-      5990
M23-26    7684
M27-28    4356
M29-31    5847
M32-38    7580
M39+      6864
dtype: int64


In [6]:
# Number of test rows per gender/age group (quick check of class balance).
print('data numbers in testset：', df_test.groupby('group').size())

data numbers in testset： group
F23-      1010
F24-26     838
F27-28     624
F29-32     926
F33-42    1113
F43+       839
M22-      1498
M23-26    1921
M27-28    1089
M29-31    1462
M32-38    1896
M39+      1717
dtype: int64


# store the train set and test set

In [7]:
# Persist the training split without the row index.
# The drive letter is spelled out so the file lands at the same absolute location that the
# C:\-prefixed data-loading cells read from.
df_train.to_csv(r"C:\Users\akahkf\csye7245_final\dataset\Talking Data\dataset\gender_age_train.csv",index=False)

In [8]:
# Persist the test split without the row index.
# The drive letter is spelled out so the file lands at the same absolute location that the
# C:\-prefixed data-loading cells read from.
df_test.to_csv(r"C:\Users\akahkf\csye7245_final\dataset\Talking Data\dataset\gender_age_test.csv",index=False)

Load the data and keep only part of it to run the models

In [9]:
# Reload the persisted splits, indexed by device_id so that per-device features can later be
# attached by index alignment. The drive letter is spelled out so these paths are absolute,
# matching the other data-loading cells.
gender_age_train = pd.read_csv(r"C:\Users\akahkf\csye7245_final\dataset\Talking Data\dataset\gender_age_train.csv",index_col='device_id')
gender_age_test = pd.read_csv(r"C:\Users\akahkf\csye7245_final\dataset\Talking Data\dataset\gender_age_test.csv",index_col='device_id')

In [10]:
# Preview a handful of rows rather than echoing the whole frame.
gender_age_train.head(10)

Unnamed: 0_level_0,level_0,index,gender,age,group
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-8076087639492063270,0,0,M,35,M32-38
-2897161552818060146,1,1,M,35,M32-38
-8260683887967679142,2,2,M,35,M32-38
236877999787307864,3,6,M,36,M32-38
-8098239495777311881,4,7,M,38,M32-38
176515041953473526,5,8,M,33,M32-38
6352067998666467520,6,13,M,32,M32-38
4380872794486415327,7,16,M,38,M32-38
-1039701474753771322,8,17,M,38,M32-38
6287938418661076759,9,19,M,33,M32-38


In [11]:
# Keep only a fraction of each split, presumably to make the later model fits cheaper.
# NOTE(review): this rebinds gender_age_train / gender_age_test, so re-running this cell without
# re-running the load cell applies get_part_data to frames that were already reduced.
# NOTE(review): how get_part_data chooses rows (random or not, seeded or not) is defined in
# pd_tools and is not visible here — confirm.
percent = 0.5
gender_age_train = get_part_data(gender_age_train, percent=percent)
gender_age_test = get_part_data(gender_age_test, percent=percent)

In [12]:
# Device lookup table; the later cells use its device_id, phone_brand and device_model columns.
phone_brand_device_model = pd.read_csv(r"C:\Users\akahkf\csye7245_final\dataset\Talking Data\dataset\phone_brand_device_model.csv")

# remove duplicate data

In [13]:
# Keep one row per device_id and index by device_id so later column assignments can align on it.
# NOTE(review): the name is rebound, so re-running this cell on its own fails — device_id is no
# longer a column once set_index has run.
phone_brand_device_model = phone_brand_device_model.drop_duplicates('device_id').set_index('device_id')

In [14]:
# Event log reduced to the event_id -> device_id mapping; event_id becomes the index.
events = pd.read_csv(r"C:\Users\akahkf\csye7245_final\dataset\Talking Data\dataset\events.csv",
                     usecols=['device_id', 'event_id'], index_col='event_id')

  mask |= (ar1 == a)


In [15]:
# (event_id, app_id) pairs; only these two columns are read from the file.
app_events = pd.read_csv(r"C:\Users\akahkf\csye7245_final\dataset\Talking Data\dataset\app_events.csv",
                             usecols=['event_id', 'app_id'])

In [16]:
# App label table.
# NOTE(review): app_labels is not referenced again in the cells shown here.
app_labels = pd.read_csv(r"C:\Users\akahkf\csye7245_final\dataset\Talking Data\dataset\app_labels.csv")

# feature engineering
# phone brand feature engineering
# convert brand names to integer codes using LabelEncoder

In [17]:
# Integer-encode the phone brand: learn the set of brand names and store each row's code
# in a new brand_label_code column.
brand_label_encoder = LabelEncoder()
phone_brand_device_model['brand_label_code'] = brand_label_encoder.fit_transform(
    phone_brand_device_model['phone_brand'].values
)

In [18]:
# Attach each device's brand code. Assigning a Series to a column aligns on the index
# (device_id on both sides), so every labelled device picks up its own code.
gender_age_train['brand_label_code'] = phone_brand_device_model['brand_label_code']
gender_age_test['brand_label_code'] = phone_brand_device_model['brand_label_code']

# convert brand codes to one-hot vectors using OneHotEncoder

In [19]:
# One-hot encode the integer brand codes. The encoder is fitted on every device in
# phone_brand_device_model, so the train and test matrices share one column layout.
brand_onehot_encoder = OneHotEncoder().fit(
    phone_brand_device_model['brand_label_code'].values.reshape(-1, 1)
)

# Encode the train frame first, then the test frame, with the same fitted encoder.
train_brand_feat, test_brand_feat = (
    brand_onehot_encoder.transform(frame['brand_label_code'].values.reshape(-1, 1))
    for frame in (gender_age_train, gender_age_test)
)

In [20]:
print("phone brand feature dim", train_brand_feat.shape[1])

phone brand feature dim 131


# build the phone model feature and one-hot encode it

In [21]:
# concatenate the phone brand and device model strings

In [22]:
# Build one key per device by appending the device model to the brand name, so that identical
# model names under different brands stay distinct.
phone_brand_device_model['brand_model'] = (
    phone_brand_device_model['phone_brand']
    .str.cat(phone_brand_device_model['device_model'])
)

In [23]:
# convert brand+model strings to integer codes using LabelEncoder

In [24]:
model_label_encoder = LabelEncoder()

In [25]:
# Fit on every device's brand+model string so train and test share a single code space.
model_label_encoder.fit(phone_brand_device_model['brand_model'].values)

LabelEncoder()

In [26]:
# Store the integer code of each device's brand+model string in a new column.
phone_brand_device_model['brand_model_label_code'] = model_label_encoder.transform(
    phone_brand_device_model['brand_model'].values
)

In [27]:
# Index-aligned on device_id: each training device receives its brand+model code.
gender_age_train['brand_model_label_code'] = phone_brand_device_model['brand_model_label_code']

In [28]:
# Index-aligned on device_id: each test device receives its brand+model code.
gender_age_test['brand_model_label_code'] = phone_brand_device_model['brand_model_label_code']

In [29]:
# convert brand+model codes to one-hot vectors using OneHotEncoder

In [30]:
model_onehot_encoder = OneHotEncoder()

In [31]:
# Fit on the codes of every device so the train and test matrices share one column layout.
model_onehot_encoder.fit(phone_brand_device_model['brand_model_label_code'].values.reshape(-1, 1))

OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [32]:
# One-hot brand+model matrix for the training devices (converted with .toarray() when features are merged).
train_model_feat = model_onehot_encoder.transform(gender_age_train['brand_model_label_code'].values.reshape(-1, 1))

In [33]:
# One-hot brand+model matrix for the test devices, using the same fitted encoder.
test_model_feat = model_onehot_encoder.transform(gender_age_test['brand_model_label_code'].values.reshape(-1, 1))

In [34]:
print('phone model feature dim', train_model_feat.shape[1])

phone model feature dim 1667


# app download feature

In [35]:
# Attach the originating device_id to every (event_id, app_id) row. `events` is indexed by
# event_id, so the join matches app_events.event_id against that index; unmatched rows are kept.
device_app = pd.merge(
    app_events,
    events,
    how='left',
    left_on='event_id',
    right_index=True,
)

# total number of app-event records per device

In [36]:
# Per device: how many (event, app) rows were recorded in total.
total_run_s = device_app.groupby('device_id')['app_id'].size()

# number of distinct apps seen per device

In [37]:
# Per device: how many distinct app_id values appear across all of its events.
number_app_onetime = device_app.groupby('device_id')['app_id'].nunique()

In [38]:
# Index-aligned assignment on device_id: training devices that do not appear in the
# event-derived Series end up with NaN in these columns.
gender_age_train['n_run'] = total_run_s
gender_age_train['n_app'] = number_app_onetime

# fill in the missing values

In [39]:
# Devices without any recorded app event have NaN here; count them as 0 runs.
# The result is assigned back instead of calling fillna(inplace=True) on the column selection:
# the in-place form acts on an intermediate Series and does not update the frame under pandas
# copy-on-write.
gender_age_train['n_run'] = gender_age_train['n_run'].fillna(0)

In [40]:
# Devices without any recorded app event have NaN here; count them as 0 distinct apps.
# The result is assigned back instead of calling fillna(inplace=True) on the column selection:
# the in-place form acts on an intermediate Series and does not update the frame under pandas
# copy-on-write.
gender_age_train['n_app'] = gender_age_train['n_app'].fillna(0)

In [41]:
# Index-aligned assignment on device_id: test devices that do not appear in the
# event-derived Series end up with NaN in these columns.
gender_age_test['n_run'] = total_run_s
gender_age_test['n_app'] = number_app_onetime

In [42]:
# Devices without any recorded app event have NaN here; count them as 0 runs.
# The result is assigned back instead of calling fillna(inplace=True) on the column selection:
# the in-place form acts on an intermediate Series and does not update the frame under pandas
# copy-on-write.
gender_age_test['n_run'] = gender_age_test['n_run'].fillna(0)

In [43]:
# Devices without any recorded app event have NaN here; count them as 0 distinct apps.
# The result is assigned back instead of calling fillna(inplace=True) on the column selection:
# the in-place form acts on an intermediate Series and does not update the frame under pandas
# copy-on-write.
gender_age_test['n_app'] = gender_age_test['n_app'].fillna(0)

In [44]:
# Run counts of the training devices as an (n_samples, 1) column so they can be stacked
# next to the one-hot matrices.
train_run_feat = gender_age_train['n_run'].values[:, np.newaxis]

In [45]:
# Distinct-app counts of the training devices as an (n_samples, 1) column.
train_app_feat = gender_age_train['n_app'].values[:, np.newaxis]

In [46]:
# The same two count features for the test devices, each as an (n_samples, 1) column.
test_run_feat = gender_age_test['n_run'].values[:, np.newaxis]
test_app_feat = gender_age_test['n_app'].values[:, np.newaxis]

# merge the features

In [47]:
# Dense training matrix: brand one-hot | brand+model one-hot | run count | distinct-app count,
# joined column-wise.
train_feat = np.concatenate(
    [
        train_brand_feat.toarray(),
        train_model_feat.toarray(),
        train_run_feat,
        train_app_feat,
    ],
    axis=1,
)

In [49]:
# Dense test matrix with the same column order as the training matrix:
# brand one-hot | brand+model one-hot | run count | distinct-app count.
test_feat = np.concatenate(
    [
        test_brand_feat.toarray(),
        test_model_feat.toarray(),
        test_run_feat,
        test_app_feat,
    ],
    axis=1,
)

In [None]:
# Standardize the features (zero mean, unit variance)

In [50]:
# Learn per-column mean and scale from the training features only, then standardise them.
scaler = StandardScaler().fit(train_feat)
train_feat_scaled = scaler.transform(train_feat)

In [51]:
# Reuse the statistics learned from the training features; the scaler is not refitted on test data.
test_feat_scaled = scaler.transform(test_feat)

In [None]:
# select features using a variance threshold

In [52]:
# Selector that drops columns whose variance is at or below 0.8 * (1 - 0.8) = 0.16.
# NOTE(review): the selector is fitted on train_feat_scaled, i.e. after StandardScaler.
# Standardised columns should all have variance ~1 (or 0 if constant), so this threshold
# presumably only removes constant columns — confirm whether the selection was meant to run
# on the unscaled features instead.
select = VarianceThreshold(threshold=(.8 * (1 - .8)))

In [53]:
# Decide which columns to keep from the training features, then apply that column mask to them.
train_feat_scaled_select = select.fit(train_feat_scaled).transform(train_feat_scaled)

In [55]:
# Apply the column mask learned from the training features to the test features.
test_feat_scaled_select = select.transform(test_feat_scaled)

In [None]:
# PCA (Principal Component Analysis) to reduce the dimensionality and remove some noise

In [56]:
pca = PCA(n_components=0.95) # keep enough components to explain 95% of the variance

In [57]:
# Fit the PCA on the training features only and project them onto the retained components.
train_feat_scaled_select_pca = pca.fit_transform(train_feat_scaled_select)

In [58]:
# Project the test features with the PCA fitted on the training data.
test_feat_scaled_select_pca = pca.transform(test_feat_scaled_select)

In [59]:
print("finish feature engineering")

finish feature engineering


In [60]:
train_feat_scaled_select_pca.shape[1] # feature dimension of each sample after processing

1167

In [None]:
# encode the target labels

In [61]:
group_label_encoder = LabelEncoder()

In [62]:
# Learn the group -> integer mapping from the training labels.
group_label_encoder.fit(gender_age_train['group'].values)

LabelEncoder()

In [63]:
# Integer class label for every training device.
y_train = group_label_encoder.transform(gender_age_train['group'].values)

In [64]:
# Encoded with the mapping fitted on the training labels, so train and test share class ids.
y_test = group_label_encoder.transform(gender_age_test['group'].values)

In [None]:
# labels are encoded; now fit the logistic regression and SVM models

In [65]:
from sklearn.model_selection import GridSearchCV

In [66]:
# Use cross-validated grid search to pick the best hyper-parameters (cv defaults to 5 folds).
def get_best_model(model, X_train, y_train, params, cv=5, scoring=None):
    """Grid-search ``params`` for ``model`` with cross-validation and return the best estimator.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator to tune.
    X_train, y_train : array-like
        Training features and targets passed to ``GridSearchCV.fit``.
    params : dict or list of dict
        Parameter grid forwarded to ``GridSearchCV``.
    cv : int, default 5
        Cross-validation setting forwarded to ``GridSearchCV``.
    scoring : str, callable or None, default None
        Metric used to rank the candidates. ``None`` keeps the previous behaviour, i.e. the
        estimator's own default score (accuracy for classifiers). Pass ``'neg_log_loss'`` to
        select by the metric this notebook reports.

    Returns
    -------
    estimator
        ``best_estimator_`` of the fitted search.
    """
    search = GridSearchCV(model, params, cv=cv, scoring=scoring)
    search.fit(X_train, y_train)
    return search.best_estimator_

In [67]:
param_grid = [{'C': [1e-3, 1e-2, 1e-1, 1, 10, 100]}] # six candidate values for C; a smaller C means stronger regularization

In [68]:
# Logistic regression with library defaults; only C is varied by the grid search.
model = LogisticRegression()

In [70]:
# Grid-search C with 3-fold cross-validation and keep the best logistic regression.
# NOTE(review): this call relies on GridSearchCV's default scoring (the estimator's own score,
# i.e. accuracy for a classifier), while the model is evaluated with log_loss afterwards.
bestmodel = get_best_model(model,train_feat_scaled_select_pca, y_train,param_grid, cv=3)

In [72]:
# Predicted class probabilities for the test devices (one column per encoded group).
y_pred_lr = bestmodel.predict_proba(test_feat_scaled_select_pca)

In [73]:
# Multi-class log loss of the logistic-regression probabilities on the test devices.
# The fitted class order is passed explicitly so the probability columns stay matched to the
# labels even if the sampled test set happens to contain no device from one of the groups.
log_loss(y_test, y_pred_lr, labels=bestmodel.classes_)

2.5588730537839792

In [None]:
# SVM model

In [74]:
# RBF-kernel grid: 5 values of C x 2 values of gamma = 10 candidates.
svm_param_grid = [{'C': [1e-2, 1e-1, 1, 10, 100], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},]

In [None]:
# set probability=True so we can get the predicted probabilities

In [75]:
# probability=True makes SVC fit an extra cross-validated calibration step so that
# predict_proba is available (log_loss needs probabilities). That step shuffles the data,
# so random_state is pinned to keep the reported log loss reproducible between runs.
svm_model = svm.SVC(probability=True, random_state=42)

In [76]:
# Grid-search C and gamma for the RBF SVC with 3-fold cross-validation and keep the best estimator.
best_svm_model = get_best_model(svm_model,train_feat_scaled_select_pca,y_train,svm_param_grid, cv=3)

In [77]:
# Predicted class probabilities from the tuned SVC for the test devices.
y_pred_svm = best_svm_model.predict_proba(test_feat_scaled_select_pca)

In [78]:
# Multi-class log loss of the SVC probabilities on the test devices.
# The fitted class order is passed explicitly so the probability columns stay matched to the
# labels even if the sampled test set happens to contain no device from one of the groups.
log_loss(y_test, y_pred_svm, labels=best_svm_model.classes_)

2.417528026875335