In [1]:
# import relevant libraries
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
sb.set()

In [2]:
# load the train and test splits from the working directory
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [3]:
# preview the first five training rows
train_df.head(5)

Unnamed: 0,session_id,DateTime,user_id,product,campaign_id,webpage_id,product_category_1,product_category_2,user_group_id,gender,age_level,user_depth,city_development_index,var_1,is_click
0,140690,2017-07-02 00:00,858557,C,359520,13787,4,,10.0,Female,4.0,3.0,3.0,0,0
1,333291,2017-07-02 00:00,243253,C,105960,11085,5,,8.0,Female,2.0,2.0,,0,0
2,129781,2017-07-02 00:00,243253,C,359520,13787,4,,8.0,Female,2.0,2.0,,0,0
3,464848,2017-07-02 00:00,1097446,I,359520,13787,3,,3.0,Male,3.0,3.0,2.0,1,0
4,90569,2017-07-02 00:01,663656,C,405490,60305,3,,2.0,Male,2.0,3.0,2.0,1,0


In [4]:
# column dtypes and non-null counts for the training data
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 463291 entries, 0 to 463290
Data columns (total 15 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   session_id              463291 non-null  int64  
 1   DateTime                463291 non-null  object 
 2   user_id                 463291 non-null  int64  
 3   product                 463291 non-null  object 
 4   campaign_id             463291 non-null  int64  
 5   webpage_id              463291 non-null  int64  
 6   product_category_1      463291 non-null  int64  
 7   product_category_2      97437 non-null   float64
 8   user_group_id           445048 non-null  float64
 9   gender                  445048 non-null  object 
 10  age_level               445048 non-null  float64
 11  user_depth              445048 non-null  float64
 12  city_development_index  338162 non-null  float64
 13  var_1                   463291 non-null  int64  
 14  is_click            

In [5]:
# count the missing values in each training column
missing_per_column = train_df.isna().sum()
print(missing_per_column)

session_id                     0
DateTime                       0
user_id                        0
product                        0
campaign_id                    0
webpage_id                     0
product_category_1             0
product_category_2        365854
user_group_id              18243
gender                     18243
age_level                  18243
user_depth                 18243
city_development_index    125129
var_1                          0
is_click                       0
dtype: int64


# First Prediction

In [6]:
# target column the models are trained to predict
y_train = train_df.loc[:, 'is_click']

In [7]:
# starter prediction using only the numeric columns that have no missing values.
# Columns are selected by name and copied, so the result does not depend on the
# column order of the CSV and is an independent frame rather than a slice of the
# source frame (assigning a new column into a slice can raise
# SettingWithCopyWarning).
# NOTE(review): session_id and user_id look like identifiers rather than
# predictive signals — confirm they belong in the feature set.
numeric_feature_cols = [
    'session_id', 'user_id', 'campaign_id', 'webpage_id',
    'product_category_1', 'var_1',
]

x_train = train_df[numeric_feature_cols].copy()
x_test = test_df[numeric_feature_cols].copy()

In [8]:
# Logistic Regression baseline on the numeric features
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training are chained
Lr = LogisticRegression().fit(x_train, y_train)
y_pred = Lr.predict(x_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [9]:
# print every positive (1) prediction; no output means only 0s were predicted
for positive_label in y_pred[y_pred == 1]:
    print(positive_label)

In [10]:
# class probabilities from the first model, one row per row of x_test
# NOTE(review): column order follows Lr.classes_ — presumably [P(is_click=0), P(is_click=1)]; confirm
y_pred_proba1 = Lr.predict_proba(x_test)
print(y_pred_proba1)

[[0.97014724 0.02985276]
 [0.75905282 0.24094718]
 [0.77516375 0.22483625]
 ...
 [0.90918847 0.09081153]
 [0.9522114  0.0477886 ]
 [0.95499532 0.04500468]]


# Second Prediction

In [11]:
# keep only the rows where gender is recorded
newtrain_df = train_df.dropna(subset=['gender'])
newtest_df = test_df.dropna(subset=['gender'])

In [12]:
# keeping only rows with a recorded gender also removed every null in user_group_id,
# age_level and user_depth; only product_category_2 and city_development_index still have nulls
newtrain_df.isna().sum()

session_id                     0
DateTime                       0
user_id                        0
product                        0
campaign_id                    0
webpage_id                     0
product_category_1             0
product_category_2        351379
user_group_id                  0
gender                         0
age_level                      0
user_depth                     0
city_development_index    106886
var_1                          0
is_click                       0
dtype: int64

In [13]:
# leave out the non-numeric columns and the columns that still contain nulls;
# the training frame additionally loses its target column
excluded_cols = ['DateTime', 'product', 'product_category_2', 'city_development_index']

x_train = newtrain_df.drop(columns=excluded_cols + ['is_click'])
x_test = newtest_df.drop(columns=excluded_cols)

In [14]:
# per-column replacement map: gender strings -> integer codes
cleanUpGender = {"gender": dict(Female=0, Male=1)}

In [15]:
# apply the gender encoding to both feature frames
x_train, x_test = (features.replace(cleanUpGender) for features in (x_train, x_test))

In [16]:
# target taken from the same gender-filtered rows as the features
y_train = newtrain_df.loc[:, 'is_click']

In [17]:
# refit the same estimator on the second feature set, then predict
y_pred = Lr.fit(x_train, y_train).predict(x_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [18]:
# print every positive (1) prediction; no output means only 0s were predicted
for positive_label in y_pred[y_pred == 1]:
    print(positive_label)

In [19]:
# class probabilities from the second model, one row per row of x_test
# NOTE(review): column order follows Lr.classes_ — presumably [P(is_click=0), P(is_click=1)]; confirm
y_pred_proba2 = Lr.predict_proba(x_test)
print(y_pred_proba2)

[[0.97034706 0.02965294]
 [0.92723876 0.07276124]
 [0.952961   0.047039  ]
 ...
 [0.90888603 0.09111397]
 [0.95232817 0.04767183]
 [0.95521607 0.04478393]]


# Third Prediction

In [20]:
# missing values per column in the gender-filtered training data
newtrain_df.isna().sum()

session_id                     0
DateTime                       0
user_id                        0
product                        0
campaign_id                    0
webpage_id                     0
product_category_1             0
product_category_2        351379
user_group_id                  0
gender                         0
age_level                      0
user_depth                     0
city_development_index    106886
var_1                          0
is_click                       0
dtype: int64

In [21]:
# drop the rows that lack product_category_2, then those that lack city_development_index
newtrain_df2 = newtrain_df.dropna(subset=['product_category_2'])
newtrain_df3 = newtrain_df2.dropna(subset=['city_development_index'])

In [22]:
# confirm the fully filtered training set has no missing values left
newtrain_df3.isna().sum()

session_id                0
DateTime                  0
user_id                   0
product                   0
campaign_id               0
webpage_id                0
product_category_1        0
product_category_2        0
user_group_id             0
gender                    0
age_level                 0
user_depth                0
city_development_index    0
var_1                     0
is_click                  0
dtype: int64

In [23]:
# DateTime is removed before modelling. This cell reassigns newtrain_df3 from
# itself, so errors='ignore' keeps it safe to re-run: on a second run the column
# is already gone and drop() would otherwise raise KeyError.
newtrain_df3 = newtrain_df3.drop(columns=['DateTime'], errors='ignore')

In [24]:
# label-encode product and gender: each column is converted to a categorical
# and replaced by its integer category codes
for categorical_col in ('product', 'gender'):
    newtrain_df3[categorical_col] = newtrain_df3[categorical_col].astype('category').cat.codes

In [25]:
# split the encoded training frame into the target and the remaining feature columns
y_train = newtrain_df3['is_click']
x_train = newtrain_df3.drop(columns=['is_click'])

In [26]:
# missing values per column in the gender-filtered test data
newtest_df.isna().sum()

session_id                    0
DateTime                      0
user_id                       0
product                       0
campaign_id                   0
webpage_id                    0
product_category_1            0
product_category_2        72568
user_group_id                 0
gender                        0
age_level                     0
user_depth                    0
city_development_index    28925
var_1                         0
dtype: int64

In [27]:
# same null filtering as for the training data, applied to the test rows
newtest_df2 = newtest_df.dropna(subset=['product_category_2'])
newtest_df3 = newtest_df2.dropna(subset=['city_development_index'])

In [28]:
# confirm the fully filtered test set has no missing values left
newtest_df3.isna().sum()

session_id                0
DateTime                  0
user_id                   0
product                   0
campaign_id               0
webpage_id                0
product_category_1        0
product_category_2        0
user_group_id             0
gender                    0
age_level                 0
user_depth                0
city_development_index    0
var_1                     0
dtype: int64

In [29]:
# prepare our test set
x_test = newtest_df3.drop(['DateTime'], axis = 1)

# Encode product and gender with the category order of the TRAINING rows.
# Deriving the categories from the test frame alone would shift the integer
# codes whenever the test data lacks (or adds) a value, so the same code could
# stand for different values in train and test.
# train_rows reproduces the rows of newtrain_df3 while still holding the
# original string values (newtrain_df3 itself has been overwritten with codes).
train_rows = newtrain_df2[newtrain_df2['city_development_index'].notna()]
for col in ['product', 'gender']:
    train_categories = train_rows[col].astype('category').cat.categories
    # values that never occur in the training rows are encoded as -1
    x_test[col] = pd.Categorical(x_test[col], categories=train_categories).codes

In [30]:
# refit the same estimator on the third feature set, then predict
y_pred = Lr.fit(x_train, y_train).predict(x_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [31]:
# print every positive (1) prediction; no output means only 0s were predicted
for positive_label in y_pred[y_pred == 1]:
    print(positive_label)

In [32]:
# class probabilities from the third model, one row per row of x_test
# NOTE(review): column order follows Lr.classes_ — presumably [P(is_click=0), P(is_click=1)]; confirm
y_pred_proba3 = Lr.predict_proba(x_test)
print(y_pred_proba3)

[[0.94885373 0.05114627]
 [0.94903801 0.05096199]
 [0.96673402 0.03326598]
 ...
 [0.86168118 0.13831882]
 [0.94931068 0.05068932]
 [0.94107684 0.05892316]]


In [34]:
# show the probability arrays of the three models one after another.
# Each model was scored on a differently filtered test set, so row i of one
# array is not necessarily the same session as row i of another.
labelled_probabilities = (
    ("//First Prediction Probability//", y_pred_proba1),
    ("//Second Prediction Probability//", y_pred_proba2),
    ("//Third Prediction Probability//", y_pred_proba3),
)
for heading, probabilities in labelled_probabilities:
    print(heading)
    print(probabilities)

//First Prediction Probability//
[[0.97014724 0.02985276]
 [0.75905282 0.24094718]
 [0.77516375 0.22483625]
 ...
 [0.90918847 0.09081153]
 [0.9522114  0.0477886 ]
 [0.95499532 0.04500468]]
//Second Prediction Probability//
[[0.97034706 0.02965294]
 [0.92723876 0.07276124]
 [0.952961   0.047039  ]
 ...
 [0.90888603 0.09111397]
 [0.95232817 0.04767183]
 [0.95521607 0.04478393]]
//Third Prediction Probability//
[[0.94885373 0.05114627]
 [0.94903801 0.05096199]
 [0.96673402 0.03326598]
 ...
 [0.86168118 0.13831882]
 [0.94931068 0.05068932]
 [0.94107684 0.05892316]]
