In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import csv
from sklearn.preprocessing import add_dummy_feature,OrdinalEncoder
from sklearn.feature_selection import SelectKBest,RFE
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier,RandomForestClassifier
import warnings
from sklearn import preprocessing
from mlxtend.classifier import StackingCVClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB 
from sklearn import model_selection
from xgboost import XGBClassifier
warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression


In [2]:
### Reading the Train event data
df_events_with = pd.read_csv("train_with_event.csv", dtype={'device_id': np.str, 'latitude': np.float, 'longitude':np.float,'event_id': np.str})
### Reading the APP Event Data 
df_events_without = pd.read_csv("train_without_event.csv", dtype={'device_id': np.str, 'latitude': np.float, 'longitude':np.float,'event_id': np.str})
df_app_event = pd.read_csv('app_events.csv',dtype={'event_id': np.str, 'app_id': np.str, 'is_active':np.int64,'is_installed':np.int64})
df_mobile_brand = pd.read_csv('train_mobile_brand.csv',dtype={'device_id': np.str})
df_app_metadata = pd.read_csv('df_app_metadata.csv')


In [3]:
df_events_with.head()

Unnamed: 0.1,Unnamed: 0,device_id,gender,age,group_train,event_id,datetimestamp,latitude,longitude,event_day,latitude_difference,longitude_difference,latitude_medium,longitude_medium,cluster_label
0,0,-7548291590301750000,M,33,M32+,2369465,2016-05-03 15:55:35,33.98,116.79,3.0,0.0,0.0,33.98,116.79,-1
1,1,-7548291590301750000,M,33,M32+,1080869,2016-05-03 06:07:16,33.98,116.79,3.0,0.0,0.0,33.98,116.79,-1
2,2,-7548291590301750000,M,33,M32+,1079338,2016-05-04 03:28:02,33.98,116.79,4.0,0.0,0.0,33.98,116.79,-1
3,3,-7548291590301750000,M,33,M32+,1078881,2016-05-04 02:53:08,33.98,116.79,4.0,0.0,0.0,33.98,116.79,-1
4,4,-7548291590301750000,M,33,M32+,1068711,2016-05-03 15:59:35,33.98,116.79,3.0,0.0,0.0,33.98,116.79,-1


In [4]:
df_app_metadata.head()

Unnamed: 0.1,Unnamed: 0,app_id,label_id,category
0,0,app_id,label_id,Other
1,1,7324884708820027918,251,Financial Services
2,2,-4494216993218550286,251,Financial Services
3,3,6058196446775239644,406,Unknown
4,4,6058196446775239644,407,Financial Services


In [5]:
df_app_metadata = df_app_metadata.loc[1:,:] ### One row is header only removing from the dataframe

In [6]:
df_app_metadata.category.nunique()

46

###  various aggregations and joins at a device_id level

In [7]:
merged_df = df_events_with.merge(df_app_event, on='event_id')
df_events_with = merged_df.merge(df_mobile_brand[['device_id','phone_brand']], on='device_id')


In [8]:
df_events_with.head()

Unnamed: 0.1,Unnamed: 0,device_id,gender,age,group_train,event_id,datetimestamp,latitude,longitude,event_day,latitude_difference,longitude_difference,latitude_medium,longitude_medium,cluster_label,app_id,is_installed,is_active,phone_brand
0,64,-7548291590301750000,M,33,M32+,1141870,2016-05-01 10:08:16,33.98,116.79,1.0,0.0,0.0,33.98,116.79,-1,3433289601737013244,1,1,Huawei
1,64,-7548291590301750000,M,33,M32+,1141870,2016-05-01 10:08:16,33.98,116.79,1.0,0.0,0.0,33.98,116.79,-1,-5472633337921616096,1,1,Huawei
2,64,-7548291590301750000,M,33,M32+,1141870,2016-05-01 10:08:16,33.98,116.79,1.0,0.0,0.0,33.98,116.79,-1,9112463114311278255,1,0,Huawei
3,64,-7548291590301750000,M,33,M32+,1141870,2016-05-01 10:08:16,33.98,116.79,1.0,0.0,0.0,33.98,116.79,-1,8693964245073640147,1,1,Huawei
4,64,-7548291590301750000,M,33,M32+,1141870,2016-05-01 10:08:16,33.98,116.79,1.0,0.0,0.0,33.98,116.79,-1,5099453940784075687,1,1,Huawei


###  Create the Age Group as Label for output

In [9]:
# Define the age groups
bins = [0, 24, 32, 45, 150]
labels = ['0-24', '25-32', '33-45', '46+']

# Categorize the ages into groups
df_events_with['age_group'] = pd.cut(df_events_with['age'], bins=bins, labels=labels, right=False)

In [10]:
df_events_with.head()

Unnamed: 0.1,Unnamed: 0,device_id,gender,age,group_train,event_id,datetimestamp,latitude,longitude,event_day,latitude_difference,longitude_difference,latitude_medium,longitude_medium,cluster_label,app_id,is_installed,is_active,phone_brand,age_group
0,64,-7548291590301750000,M,33,M32+,1141870,2016-05-01 10:08:16,33.98,116.79,1.0,0.0,0.0,33.98,116.79,-1,3433289601737013244,1,1,Huawei,33-45
1,64,-7548291590301750000,M,33,M32+,1141870,2016-05-01 10:08:16,33.98,116.79,1.0,0.0,0.0,33.98,116.79,-1,-5472633337921616096,1,1,Huawei,33-45
2,64,-7548291590301750000,M,33,M32+,1141870,2016-05-01 10:08:16,33.98,116.79,1.0,0.0,0.0,33.98,116.79,-1,9112463114311278255,1,0,Huawei,33-45
3,64,-7548291590301750000,M,33,M32+,1141870,2016-05-01 10:08:16,33.98,116.79,1.0,0.0,0.0,33.98,116.79,-1,8693964245073640147,1,1,Huawei,33-45
4,64,-7548291590301750000,M,33,M32+,1141870,2016-05-01 10:08:16,33.98,116.79,1.0,0.0,0.0,33.98,116.79,-1,5099453940784075687,1,1,Huawei,33-45


### Merge wiith app metadata

In [11]:
df_events_with = df_events_with.merge(df_app_metadata[['app_id','category']], on='app_id')


In [12]:
df_events_with.shape

(25449323, 21)

In [13]:
duplicates = df_events_with.duplicated()

# Count duplicate rows
duplicates.value_counts()


False    22552241
True      2897082
dtype: int64

### Drop the duplicate 

In [14]:
df_events_with.drop_duplicates(inplace=True)


In [15]:
df_events_with.columns

Index(['Unnamed: 0', 'device_id', 'gender', 'age', 'group_train', 'event_id',
       'datetimestamp', 'latitude', 'longitude', 'event_day',
       'latitude_difference', 'longitude_difference', 'latitude_medium',
       'longitude_medium', 'cluster_label', 'app_id', 'is_installed',
       'is_active', 'phone_brand', 'age_group', 'category'],
      dtype='object')

### Drop the column which is require 

In [16]:
df_events_with.drop(['Unnamed: 0','event_id','age','group_train','datetimestamp','latitude','longitude','app_id','event_day'],axis= 1 , inplace=True)

In [17]:
df_encoded = pd.get_dummies(df_events_with, columns=['phone_brand', 'phone_brand'])


In [18]:
df_encoded.head()

Unnamed: 0,device_id,gender,latitude_difference,longitude_difference,latitude_medium,longitude_medium,cluster_label,is_installed,is_active,age_group,...,phone_brand_samsung,phone_brand_vivo,phone_brand_weimi,phone_brand_weitu,phone_brand_wpf,phone_brand_xiangmi,phone_brand_ximi,phone_brand_yougo,phone_brand_youmi,phone_brand_yuxin
0,-7548291590301750000,M,0.0,0.0,33.98,116.79,-1,1,1,33-45,...,0,0,0,0,0,0,0,0,0,0
1,-7548291590301750000,M,0.0,0.0,33.98,116.79,-1,1,1,33-45,...,0,0,0,0,0,0,0,0,0,0
2,7442042493953950000,M,0.0,0.0,39.43,116.98,-1,1,1,25-32,...,0,0,0,0,0,0,0,0,0,0
3,7442042493953950000,M,0.0,0.0,39.43,116.98,-1,1,1,25-32,...,0,0,0,0,0,0,0,0,0,0
4,-800490591774117000,F,0.0,0.0,0.0,0.0,-1,1,1,33-45,...,0,0,0,0,0,0,0,0,0,0


In [19]:
train_test_data = pd.read_csv('train_test_split.csv',dtype={'device_id': np.str})

In [20]:
train_test_data.dtypes

device_id          object
gender             object
age                 int64
group              object
train_test_flag    object
dtype: object

In [21]:
df_encoded = df_encoded.merge(train_test_data[['device_id','train_test_flag']], on='device_id')


In [23]:
train_data = df_encoded[df_encoded['train_test_flag'] == 'train']
test_data = df_encoded[df_encoded['train_test_flag'] == 'test']

In [24]:
print("Shape of training data",train_data.shape)
print("Shape of test data",test_data.shape)

Shape of training data (16988233, 174)
Shape of test data (5564008, 174)


In [None]:
train_data.to_csv('traindata.csv')
test_data.to_csv('testdata.csv')

In [25]:
def age_group(age):
    if age == '0-24':
        return 0
    elif age == '25-32':
        return 1
    elif age == '33-45':
        return 2
    else:
        return 3

In [None]:
train_data.columns

### Target Encoding for age_group and Gender

In [26]:
train_data['age_group'] = train_data['age_group'].apply(lambda x : age_group(x) )
test_data['age_group'] = test_data['age_group'].apply(lambda x: age_group(x))
train_data['gender'] = train_data['gender'].apply(lambda x: 0 if x == 'M' else 1)
test_data['gender'] = test_data['gender'].apply(lambda x: 0 if x=='M' else 1)

In [28]:
age_train = train_data['age_group'].values
gender_train = train_data['gender'].values
age_test = test_data['age_group'].values
gender_test = test_data['age_group'].values
X_train = train_data.drop(['device_id','gender','train_test_flag','age_group'],axis = 1)
X_test = train_data.drop(['device_id','gender','train_test_flag','age_group'],axis = 1)

### Creating the machine learning model