In [2]:
import sklearn as sk
import numpy as np
import pandas as pd
from scipy.optimize import minimize
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv("train_data.csv")

In [4]:
df1 = df.drop(["Aboutus","Aboutus_Duration","Browser","ID","Operating_Systems"],axis = 1)
df1.dtypes

Item_page                 float64
Item_page_Duration        float64
Checkout_page             float64
Checkout_page_Duration    float64
Bounce_Rates              float64
Exit_Rates                float64
Page_Values               float64
Remarkable_Day            float64
Month                      object
Province                    int64
TrafficType                 int64
VisitorType                object
Weekend                    object
Income                      int64
dtype: object

In [5]:
#Encode
d = {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6, 'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12}
df1.Month = df1.Month.map(d)
df1

Unnamed: 0,Item_page,Item_page_Duration,Checkout_page,Checkout_page_Duration,Bounce_Rates,Exit_Rates,Page_Values,Remarkable_Day,Month,Province,TrafficType,VisitorType,Weekend,Income
0,1.0,10.0,9.0,700.000000,0.000000,0.011111,29.621890,0.0,12.0,9,10,Returning_Visitor,no,1
1,2.0,15.0,10.0,894.666667,0.000000,0.022222,0.000000,0.0,5.0,4,2,Returning_Visitor,no,0
2,1.0,85.0,14.0,306.500000,0.000000,0.004444,0.000000,0.0,3.0,3,2,Returning_Visitor,no,0
3,5.0,175.1,26.0,615.559524,0.000000,0.002083,62.773672,0.0,9.0,1,3,New_Visitor,1,1
4,2.0,25.0,5.0,40.000000,0.066667,0.083333,0.000000,0.6,5.0,1,1,Returning_Visitor,no,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9938,0.0,0.0,8.0,376.333333,0.000000,0.028571,53.988000,0.0,5.0,2,4,Returning_Visitor,1,1
9939,1.0,0.0,22.0,671.866667,0.017391,0.020580,0.000000,0.0,11.0,7,10,Returning_Visitor,no,0
9940,0.0,0.0,15.0,872.833333,0.000000,0.026667,0.000000,0.0,3.0,2,3,Returning_Visitor,1,0
9941,2.0,327.5,7.0,570.500000,0.000000,0.028571,0.000000,0.0,11.0,2,2,Returning_Visitor,no,0


In [6]:
df1["VisitorType"] = df1["VisitorType"].astype('category')
df1["Weekend"] = df1["Weekend"].astype('category')
df1.dtypes

Item_page                  float64
Item_page_Duration         float64
Checkout_page              float64
Checkout_page_Duration     float64
Bounce_Rates               float64
Exit_Rates                 float64
Page_Values                float64
Remarkable_Day             float64
Month                      float64
Province                     int64
TrafficType                  int64
VisitorType               category
Weekend                   category
Income                       int64
dtype: object

In [7]:
categorical_data = df1.drop(['Item_page', 'Item_page_Duration', 'Checkout_page','Checkout_page_Duration','Bounce_Rates','Exit_Rates','Page_Values','Remarkable_Day','Month','Province','TrafficType','Income'], axis=1)
categorical_data

Unnamed: 0,VisitorType,Weekend
0,Returning_Visitor,no
1,Returning_Visitor,no
2,Returning_Visitor,no
3,New_Visitor,1
4,Returning_Visitor,no
...,...,...
9938,Returning_Visitor,1
9939,Returning_Visitor,no
9940,Returning_Visitor,1
9941,Returning_Visitor,no


In [8]:
numeric_data = df1.drop(['VisitorType','Weekend'],axis = 1)
numeric_data

Unnamed: 0,Item_page,Item_page_Duration,Checkout_page,Checkout_page_Duration,Bounce_Rates,Exit_Rates,Page_Values,Remarkable_Day,Month,Province,TrafficType,Income
0,1.0,10.0,9.0,700.000000,0.000000,0.011111,29.621890,0.0,12.0,9,10,1
1,2.0,15.0,10.0,894.666667,0.000000,0.022222,0.000000,0.0,5.0,4,2,0
2,1.0,85.0,14.0,306.500000,0.000000,0.004444,0.000000,0.0,3.0,3,2,0
3,5.0,175.1,26.0,615.559524,0.000000,0.002083,62.773672,0.0,9.0,1,3,1
4,2.0,25.0,5.0,40.000000,0.066667,0.083333,0.000000,0.6,5.0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
9938,0.0,0.0,8.0,376.333333,0.000000,0.028571,53.988000,0.0,5.0,2,4,1
9939,1.0,0.0,22.0,671.866667,0.017391,0.020580,0.000000,0.0,11.0,7,10,0
9940,0.0,0.0,15.0,872.833333,0.000000,0.026667,0.000000,0.0,3.0,2,3,0
9941,2.0,327.5,7.0,570.500000,0.000000,0.028571,0.000000,0.0,11.0,2,2,0


In [9]:
categorical_data["VisitorType"] = categorical_data["VisitorType"].cat.codes
categorical_data["Weekend"] = categorical_data["Weekend"].cat.codes

In [10]:
dataset_new=categorical_data.join(numeric_data)
dataset_new.head()

Unnamed: 0,VisitorType,Weekend,Item_page,Item_page_Duration,Checkout_page,Checkout_page_Duration,Bounce_Rates,Exit_Rates,Page_Values,Remarkable_Day,Month,Province,TrafficType,Income
0,2,1,1.0,10.0,9.0,700.0,0.0,0.011111,29.62189,0.0,12.0,9,10,1
1,2,1,2.0,15.0,10.0,894.666667,0.0,0.022222,0.0,0.0,5.0,4,2,0
2,2,1,1.0,85.0,14.0,306.5,0.0,0.004444,0.0,0.0,3.0,3,2,0
3,0,0,5.0,175.1,26.0,615.559524,0.0,0.002083,62.773672,0.0,9.0,1,3,1
4,2,1,2.0,25.0,5.0,40.0,0.066667,0.083333,0.0,0.6,5.0,1,1,0


In [11]:
#column_means = df.mean()
#dataSet_final = dataset_new.fillna(column_means)

In [12]:
# dataSet_final.dtypes
dataSet_final = dataset_new.dropna(axis = 0)
dataSet_final.isnull().sum()

VisitorType               0
Weekend                   0
Item_page                 0
Item_page_Duration        0
Checkout_page             0
Checkout_page_Duration    0
Bounce_Rates              0
Exit_Rates                0
Page_Values               0
Remarkable_Day            0
Month                     0
Province                  0
TrafficType               0
Income                    0
dtype: int64

In [13]:
# from sklearn.preprocessing import MinMaxScaler
# import numpy as np
  
# # copy the data
# df_sklearn = dataSet_final.copy()
  
# # apply normalization techniques
# column1 = 'Item_page'
# column2 = 'Item_page_Duration'
# column3 = 'Checkout_page'
# column4 = 'Checkout_page_Duration'
# column5 = 'Page_Values'

# df_sklearn[column1] = MinMaxScaler().fit_transform(np.array(df_sklearn[column1]).reshape(-1,1))
# df_sklearn[column2] = MinMaxScaler().fit_transform(np.array(df_sklearn[column2]).reshape(-1,1))
# df_sklearn[column3] = MinMaxScaler().fit_transform(np.array(df_sklearn[column3]).reshape(-1,1))
# df_sklearn[column4] = MinMaxScaler().fit_transform(np.array(df_sklearn[column4]).reshape(-1,1))
# df_sklearn[column5] = MinMaxScaler().fit_transform(np.array(df_sklearn[column5]).reshape(-1,1))
# # view normalized data  
# display(df_sklearn)


In [14]:
# from sklearn.preprocessing import MinMaxScaler
# minmax_scaler = MinMaxScaler()
# names=[""]
# d = minmax_scaler.fit_transform(dataSet_final)
# scaled_df = pd.DataFrame(d, columns=names)
# scaled_df.head()

In [15]:
data = dataSet_final.iloc[:,0:13]
data

Unnamed: 0,VisitorType,Weekend,Item_page,Item_page_Duration,Checkout_page,Checkout_page_Duration,Bounce_Rates,Exit_Rates,Page_Values,Remarkable_Day,Month,Province,TrafficType
0,2,1,1.0,10.0,9.0,700.000000,0.000000,0.011111,29.621890,0.0,12.0,9,10
1,2,1,2.0,15.0,10.0,894.666667,0.000000,0.022222,0.000000,0.0,5.0,4,2
2,2,1,1.0,85.0,14.0,306.500000,0.000000,0.004444,0.000000,0.0,3.0,3,2
3,0,0,5.0,175.1,26.0,615.559524,0.000000,0.002083,62.773672,0.0,9.0,1,3
4,2,1,2.0,25.0,5.0,40.000000,0.066667,0.083333,0.000000,0.6,5.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9938,2,0,0.0,0.0,8.0,376.333333,0.000000,0.028571,53.988000,0.0,5.0,2,4
9939,2,1,1.0,0.0,22.0,671.866667,0.017391,0.020580,0.000000,0.0,11.0,7,10
9940,2,0,0.0,0.0,15.0,872.833333,0.000000,0.026667,0.000000,0.0,3.0,2,3
9941,2,1,2.0,327.5,7.0,570.500000,0.000000,0.028571,0.000000,0.0,11.0,2,2


In [16]:
target = dataSet_final.iloc[:,13]
target

NameError: name 'scaled_df' is not defined

In [None]:
from sklearn.model_selection import train_test_split

In [17]:
train_data,test_data,train_target,test_target=train_test_split(data,target,test_size=0.3)

NameError: name 'train_test_split' is not defined

In [18]:
from sklearn.neighbors import KNeighborsClassifier

model=KNeighborsClassifier() #load KNN algorithm into model

In [19]:
model.fit(train_data,train_target)

NameError: name 'train_data' is not defined

In [20]:
predicted_target=model.predict(test_data) #getting predictions from the model
print(predicted_target)

NameError: name 'test_data' is not defined

In [253]:
from sklearn.metrics import accuracy_score

acc=accuracy_score(test_target,predicted_target)
print('Accuracy:',acc)

Accuracy: 0.8556701030927835


In [254]:
##########################################################
# Testing Set
##########################################################

df_test = pd.read_csv("test_data.csv")

In [255]:
ids=df_test['ID'].values


In [256]:
df1_test = df_test.drop(["ID","Aboutus","Aboutus_Duration","Browser","Operating_Systems"],axis = 1)
df1_test.dtypes

Item_page                   int64
Item_page_Duration        float64
Checkout_page               int64
Checkout_page_Duration    float64
Bounce_Rates              float64
Exit_Rates                float64
Page_Values               float64
Remarkable_Day            float64
Month                      object
Province                    int64
TrafficType                 int64
VisitorType                object
Weekend                    object
dtype: object

In [257]:
d1 = {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6, 'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12}
df1_test.Month = df1_test.Month.map(d1)
df1_test

Unnamed: 0,Item_page,Item_page_Duration,Checkout_page,Checkout_page_Duration,Bounce_Rates,Exit_Rates,Page_Values,Remarkable_Day,Month,Province,TrafficType,VisitorType,Weekend
0,12,569.825000,127,6065.016218,0.006716,0.009265,0.152167,0.0,8.0,2,4,Returning_Visitor,no
1,0,0.000000,25,770.028571,0.011200,0.034933,0.000000,0.0,3.0,8,10,Returning_Visitor,no
2,7,38.600000,70,1087.700000,0.000000,0.004861,0.000000,0.0,10.0,4,5,Returning_Visitor,no
3,4,105.266667,35,655.684762,0.000000,0.005128,0.000000,0.0,11.0,7,2,New_Visitor,1
4,0,0.000000,3,63.000000,0.000000,0.066667,0.000000,0.8,5.0,9,1,Returning_Visitor,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0.000000,2,4.000000,0.000000,0.100000,0.000000,0.8,5.0,4,1,Returning_Visitor,no
996,1,43.000000,36,2362.500000,0.000000,0.016190,0.000000,0.6,5.0,1,4,Returning_Visitor,no
997,0,0.000000,25,1017.966667,0.008333,0.017361,0.000000,0.0,8.0,4,1,Returning_Visitor,1
998,0,0.000000,18,587.733333,0.000000,0.022222,32.523808,0.0,11.0,7,4,Returning_Visitor,1


In [258]:
df1_test["VisitorType"] = df1_test["VisitorType"].astype('category')
df1_test["Weekend"] = df1_test["Weekend"].astype('category')
df1_test.dtypes

Item_page                    int64
Item_page_Duration         float64
Checkout_page                int64
Checkout_page_Duration     float64
Bounce_Rates               float64
Exit_Rates                 float64
Page_Values                float64
Remarkable_Day             float64
Month                      float64
Province                     int64
TrafficType                  int64
VisitorType               category
Weekend                   category
dtype: object

In [259]:
categorical_data_test = df1_test.drop(['Item_page', 'Item_page_Duration', 'Checkout_page','Checkout_page_Duration','Bounce_Rates','Exit_Rates','Page_Values','Remarkable_Day','Month','Province','TrafficType'], axis=1)
categorical_data_test

Unnamed: 0,VisitorType,Weekend
0,Returning_Visitor,no
1,Returning_Visitor,no
2,Returning_Visitor,no
3,New_Visitor,1
4,Returning_Visitor,no
...,...,...
995,Returning_Visitor,no
996,Returning_Visitor,no
997,Returning_Visitor,1
998,Returning_Visitor,1


In [260]:
numeric_data_test = df1_test.drop(['VisitorType','Weekend'],axis = 1)
numeric_data_test

Unnamed: 0,Item_page,Item_page_Duration,Checkout_page,Checkout_page_Duration,Bounce_Rates,Exit_Rates,Page_Values,Remarkable_Day,Month,Province,TrafficType
0,12,569.825000,127,6065.016218,0.006716,0.009265,0.152167,0.0,8.0,2,4
1,0,0.000000,25,770.028571,0.011200,0.034933,0.000000,0.0,3.0,8,10
2,7,38.600000,70,1087.700000,0.000000,0.004861,0.000000,0.0,10.0,4,5
3,4,105.266667,35,655.684762,0.000000,0.005128,0.000000,0.0,11.0,7,2
4,0,0.000000,3,63.000000,0.000000,0.066667,0.000000,0.8,5.0,9,1
...,...,...,...,...,...,...,...,...,...,...,...
995,0,0.000000,2,4.000000,0.000000,0.100000,0.000000,0.8,5.0,4,1
996,1,43.000000,36,2362.500000,0.000000,0.016190,0.000000,0.6,5.0,1,4
997,0,0.000000,25,1017.966667,0.008333,0.017361,0.000000,0.0,8.0,4,1
998,0,0.000000,18,587.733333,0.000000,0.022222,32.523808,0.0,11.0,7,4


In [261]:
categorical_data_test["VisitorType"] = categorical_data_test["VisitorType"].cat.codes
categorical_data_test["Weekend"] = categorical_data_test["Weekend"].cat.codes


In [262]:
dataset_new_test=categorical_data_test.join(numeric_data_test)
dataset_new_test

Unnamed: 0,VisitorType,Weekend,Item_page,Item_page_Duration,Checkout_page,Checkout_page_Duration,Bounce_Rates,Exit_Rates,Page_Values,Remarkable_Day,Month,Province,TrafficType
0,2,1,12,569.825000,127,6065.016218,0.006716,0.009265,0.152167,0.0,8.0,2,4
1,2,1,0,0.000000,25,770.028571,0.011200,0.034933,0.000000,0.0,3.0,8,10
2,2,1,7,38.600000,70,1087.700000,0.000000,0.004861,0.000000,0.0,10.0,4,5
3,0,0,4,105.266667,35,655.684762,0.000000,0.005128,0.000000,0.0,11.0,7,2
4,2,1,0,0.000000,3,63.000000,0.000000,0.066667,0.000000,0.8,5.0,9,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2,1,0,0.000000,2,4.000000,0.000000,0.100000,0.000000,0.8,5.0,4,1
996,2,1,1,43.000000,36,2362.500000,0.000000,0.016190,0.000000,0.6,5.0,1,4
997,2,0,0,0.000000,25,1017.966667,0.008333,0.017361,0.000000,0.0,8.0,4,1
998,2,0,0,0.000000,18,587.733333,0.000000,0.022222,32.523808,0.0,11.0,7,4


In [263]:
dataset_new_test.isnull().sum()

VisitorType                0
Weekend                    0
Item_page                  0
Item_page_Duration         0
Checkout_page              0
Checkout_page_Duration     0
Bounce_Rates               0
Exit_Rates                 0
Page_Values                0
Remarkable_Day             0
Month                     25
Province                   0
TrafficType                0
dtype: int64

In [1]:
dataset_new_test['Month'] = dataset_new_test['Month'].fillna(random.randint(1, 12))
dataset_new_test.isnull().sum()

NameError: name 'dataset_new_test' is not defined

In [265]:
# from sklearn.preprocessing import MinMaxScaler
# import numpy as np
  
# # copy the data
# df_sklearn_test = dataset_new_test.copy()
  
# # apply normalization techniques
# column1 = 'Item_page'
# column2 = 'Item_page_Duration'
# column3 = 'Checkout_page'
# column4 = 'Checkout_page_Duration'
# column5 = 'Page_Values'

# df_sklearn_test[column1] = MinMaxScaler().fit_transform(np.array(df_sklearn_test[column1]).reshape(-1,1))
# df_sklearn_test[column2] = MinMaxScaler().fit_transform(np.array(df_sklearn_test[column2]).reshape(-1,1))
# df_sklearn_test[column3] = MinMaxScaler().fit_transform(np.array(df_sklearn_test[column3]).reshape(-1,1))
# df_sklearn_test[column4] = MinMaxScaler().fit_transform(np.array(df_sklearn_test[column4]).reshape(-1,1))
# df_sklearn_test[column5] = MinMaxScaler().fit_transform(np.array(df_sklearn_test[column5]).reshape(-1,1))
# # view normalized data  
# display(df_sklearn_test)

In [266]:
predicted_target=model.predict(dataset_new_test) #getting predictions from the model
print(predicted_target)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [267]:
output = pd.DataFrame({'ID': ids, 'Income': predicted_target})

In [268]:
output

Unnamed: 0,ID,Income
0,1,0.0
1,2,0.0
2,3,0.0
3,4,0.0
4,5,0.0
...,...,...
995,996,0.0
996,997,0.0
997,998,0.0
998,999,0.0


In [269]:
output.to_csv("HellRaisers1.csv")