In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the advertising data; a literal '?' in any field is parsed as missing (NaN).
df = pd.read_csv(
    'advertising.csv',
    na_values='?',
)
# Preview the first three rows.
df.head(3)

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,0
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,0


### Ad Topic Line, City, Country
As shown below, very few rows share the same city, country, or ad topic line, so these features have little predictive power. We therefore inspect them briefly and then drop them.

In [3]:
# Replace the original headers with shorter names (one per column, in file order).
df.columns = [
    'Daily_Time_Spent_on_Site',
    'Age',
    'Area Income',
    'Daily_Internet_Usage',
    'Ad_Topic_Line',
    'City',
    'Male',
    'Country',
    'Timestamp',
    'Clicked_on_Ad',
]

# Column groups: numeric columns (including the target) vs. text-valued columns.
numeric_cols = [
    'Daily_Time_Spent_on_Site',
    'Age',
    'Area Income',
    'Daily_Internet_Usage',
    'Male',
    'Clicked_on_Ad',
]
class_cols = ['Ad_Topic_Line', 'City', 'Country']

# count / unique / top / freq for each text-valued column.
df[class_cols].describe()

Unnamed: 0,Ad_Topic_Line,City,Country
count,1000,1000,1000
unique,1000,969,237
top,Ameliorated upward-trending definition,Williamsport,France
freq,1,3,9


In [4]:
# Tabulate how many rows each country has, then show the ten largest counts.
country_counts = pd.crosstab(index=df['Country'], columns='count')
country_counts.sort_values(by='count', ascending=False).head(10)

col_0,count
Country,Unnamed: 1_level_1
France,9
Czech Republic,9
Afghanistan,8
Australia,8
Turkey,8
South Africa,8
Senegal,8
Peru,8
Micronesia,8
Greece,8


In [5]:
# Cross-tabulate country against the click label and list the ten countries
# with the most clicks (label 1).
# `by` and `ascending` are passed by keyword: pandas 2.0 made every
# sort_values argument after `by` keyword-only, so the former positional
# call `sort_values(1, 0, ...)` raises TypeError there. axis=0 is the default.
clicks_by_country = pd.crosstab(df['Country'], df['Clicked_on_Ad'])
clicks_by_country.sort_values(by=1, ascending=False).head(10)

Clicked_on_Ad,0,1
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Australia,1,7
Turkey,1,7
Ethiopia,0,7
Liberia,2,6
South Africa,2,6
Liechtenstein,0,6
Senegal,3,5
Peru,3,5
Mayotte,1,5
Hungary,1,5


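We can see that users from Australia, Turkey, and Ethiopia clicked on the ad most often (7 clicks each). Note that no country has more than 9 rows, so these counts are too small to generalize from.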

In [6]:
# Remove the three high-cardinality text columns from the working frame.
df = df.drop(columns=['Ad_Topic_Line', 'City', 'Country'])

### Timestamp
Since the timestamp is different in every row, we decompose it into month, day, hour, and day of week. The day of week is encoded with Monday = 0 and Sunday = 6.

In [7]:
# Parse the timestamp strings into datetime values so calendar parts can be read off.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# Reuse a single .dt accessor for all derived columns.
stamp = df['Timestamp'].dt

df['Month'] = stamp.month          # calendar month
df['Day'] = stamp.day              # day of the month
df['Hour'] = stamp.hour            # hour of the day
df["Weekday"] = stamp.dayofweek    # Monday = 0 ... Sunday = 6

In [8]:
# Show the first six rows to confirm the new calendar columns.
df.head(n=6)

Unnamed: 0,Daily_Time_Spent_on_Site,Age,Area Income,Daily_Internet_Usage,Male,Timestamp,Clicked_on_Ad,Month,Day,Hour,Weekday
0,68.95,35,61833.9,256.09,0,2016-03-27 00:53:11,0,3,27,0,6
1,80.23,31,68441.85,193.77,1,2016-04-04 01:39:02,0,4,4,1,0
2,69.47,26,59785.94,236.5,0,2016-03-13 20:35:42,0,3,13,20,6
3,74.15,29,54806.18,245.89,1,2016-01-10 02:31:19,0,1,10,2,6
4,68.37,35,73889.99,225.58,0,2016-06-03 03:36:18,0,6,3,3,4
5,59.99,23,59761.56,226.74,1,2016-05-19 14:30:17,0,5,19,14,3


In [9]:
# The raw timestamp is no longer needed once its parts have been extracted.
df = df.drop(columns='Timestamp')

### check for missing values

In [10]:
# Number of missing entries per column.
df.isna().sum()

Daily_Time_Spent_on_Site    0
Age                         0
Area Income                 0
Daily_Internet_Usage        0
Male                        0
Clicked_on_Ad               0
Month                       0
Day                         0
Hour                        0
Weekday                     0
dtype: int64

No values are missing.

### Split the dataset into features and target

In [11]:
# Target: the click label. Features: every remaining column.
y = df['Clicked_on_Ad']
x = df.drop(columns='Clicked_on_Ad')

In [12]:
from sklearn.preprocessing import StandardScaler
# Standardize every feature column.
# Several columns are integer-typed (e.g. Age, Male, Month), so cast to float
# explicitly; the scaler then receives a uniform float input instead of
# converting dtypes implicitly.
# fit_transform returns a NumPy array, so after this cell `x` is indexed
# positionally (x[rows, :]) rather than by column name.
sc = StandardScaler()
x = sc.fit_transform(x.astype(float))

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


### Using K-fold cross-validation to train and test

In [13]:
import  sklearn.model_selection 
from sklearn.linear_model import LinearRegression

# Linear model whose continuous output is thresholded at 0.5 to get a class label.
lr = LinearRegression()

# K-fold setup. A fixed random_state makes the shuffled split, and therefore
# the reported accuracy, reproducible across kernel restarts.
nfold = 10
kf = sklearn.model_selection.KFold(n_splits=nfold, shuffle=True, random_state=42)

# Plain ndarray of the labels so the fold indices are applied positionally,
# independent of whatever index labels `y` carries.
y_values = np.asarray(y)

# NOTE(review): x was standardized on the full dataset before this loop, so
# held-out rows influence the scaling statistics — consider fitting the scaler
# on each training fold instead.

# Per-fold accuracy on the held-out rows.
acc = []

# kf.split(x) yields one (train indices, test indices) pair per fold.
for Itr, Its in kf.split(x):
    xtr, ytr = x[Itr, :], y_values[Itr]
    xts, yts = x[Its, :], y_values[Its]

    # Fit on the training fold, then predict the held-out fold.
    lr.fit(xtr, ytr)
    # Predictions >= 0.5 become class 1, everything else class 0.
    yhat = (lr.predict(xts) >= 0.5).astype(int)

    # Fraction of held-out rows whose predicted label matches the true one.
    acc.append(np.mean(yts == yhat))


In [16]:
# Average accuracy across all folds.
np.asarray(acc).mean()

0.9580000000000002