In [1]:
import numpy as np
import pandas as pd 

# Import the libraries used throughout the notebook
import pandas as pd
import plotly.express as px

import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import warnings                       
# Suppress every Python warning for the rest of the session
# (note: this also hides deprecation notices from the libraries above)
warnings.filterwarnings("ignore")
# Render matplotlib figures inline in the notebook output
%matplotlib inline  
# Seaborn look: white background with grid lines, notebook-scaled elements
sns.set_style('whitegrid')
sns.set_context('notebook')

## Exploring our dataset

In [2]:
# Load the advertising dataset from the local data directory
df = pd.read_csv('./data/advertising.csv')
# Bare last expression so the notebook renders the frame
df


Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.90,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,0
2,69.47,26,59785.94,236.50,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,0
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,2016-01-10 02:31:19,0
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,2016-06-03 03:36:18,0
...,...,...,...,...,...,...,...,...,...,...
995,72.97,30,71384.57,208.58,Fundamental modular algorithm,Duffystad,1,Lebanon,2016-02-11 21:49:00,1
996,51.30,45,67782.17,134.42,Grass-roots cohesive monitoring,New Darlene,1,Bosnia and Herzegovina,2016-04-22 02:07:01,1
997,51.63,51,42415.72,120.37,Expanded intangible solution,South Jessica,1,Mongolia,2016-02-01 17:24:57,1
998,55.55,19,41920.79,187.95,Proactive bandwidth-monitored policy,West Steven,0,Guatemala,2016-03-24 02:35:54,0


In [3]:
# Print column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Daily Time Spent on Site  1000 non-null   float64
 1   Age                       1000 non-null   int64  
 2   Area Income               1000 non-null   float64
 3   Daily Internet Usage      1000 non-null   float64
 4   Ad Topic Line             1000 non-null   object 
 5   City                      1000 non-null   object 
 6   Male                      1000 non-null   int64  
 7   Country                   1000 non-null   object 
 8   Timestamp                 1000 non-null   object 
 9   Clicked on Ad             1000 non-null   int64  
dtypes: float64(3), int64(3), object(4)
memory usage: 78.2+ KB


In [4]:
# Dimensions of the frame as (rows, columns)
df.shape

(1000, 10)

In [5]:
# Number of rows that exactly repeat an earlier row
df.duplicated().sum()

0

In [6]:
# Column labels of the frame
df.columns

Index(['Daily Time Spent on Site', 'Age', 'Area Income',
       'Daily Internet Usage', 'Ad Topic Line', 'City', 'Male', 'Country',
       'Timestamp', 'Clicked on Ad'],
      dtype='object')

In [7]:
# Missing-value count per column
df.isna().sum()

Daily Time Spent on Site    0
Age                         0
Area Income                 0
Daily Internet Usage        0
Ad Topic Line               0
City                        0
Male                        0
Country                     0
Timestamp                   0
Clicked on Ad               0
dtype: int64

In [8]:
# Summary statistics of the numeric columns, one row per column
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Daily Time Spent on Site,1000.0,65.0002,15.853615,32.6,51.36,68.215,78.5475,91.43
Age,1000.0,36.009,8.785562,19.0,29.0,35.0,42.0,61.0
Area Income,1000.0,55000.00008,13414.634022,13996.5,47031.8025,57012.3,65470.635,79484.8
Daily Internet Usage,1000.0,180.0001,43.902339,104.78,138.83,183.13,218.7925,269.96
Male,1000.0,0.481,0.499889,0.0,0.0,0.0,1.0,1.0
Clicked on Ad,1000.0,0.5,0.50025,0.0,0.0,0.5,1.0,1.0


In [9]:
# Label each column as categorical (object dtype) or numeric and tally both kinds

categoric = 0
numeric = 0
for column in df.columns:
    is_object = df[column].dtype == 'object'
    if not is_object:
        # Anything that is not object dtype is counted as numeric
        numeric += 1
        print(f'{column}: Numeric')
        continue
    categoric += 1
    print(f'{column}:  Categorical')

print(f'Total categorical: {categoric}')
print(f'Total Numeric: {numeric}')


Daily Time Spent on Site: Numeric
Age: Numeric
Area Income: Numeric
Daily Internet Usage: Numeric
Ad Topic Line:  Categorical
City:  Categorical
Male: Numeric
Country:  Categorical
Timestamp:  Categorical
Clicked on Ad: Numeric
Total categorical: 4
Total Numeric: 6


In [10]:
# For every object-dtype column, print its name followed by its number of distinct values
obj_column = df.dtypes[df.dtypes == 'object'].index
for column_name in obj_column:
    distinct_values = df[column_name].unique()
    print(column_name)
    print(len(distinct_values))
    print()

Ad Topic Line
1000

City
969

Country
237

Timestamp
1000



In [11]:
# Start from every object-dtype column
object_columns = df.select_dtypes(include=['object']).columns
qual_cols = set(object_columns)
print(f'Qualitative Variables: {qual_cols}')

# Timestamp is stored as text but holds date-times, so exclude it from the qualitative set
qual_cols = qual_cols.difference({'Timestamp'})
print(f'Qualitative Variables: {qual_cols}')



Qualitative Variables: {'Timestamp', 'City', 'Country', 'Ad Topic Line'}
Qualitative Variables: {'City', 'Country', 'Ad Topic Line'}


In [12]:
# Every column not in the qualitative set is treated as quantitative
all_columns = set(df.columns)
quant_cols = all_columns.difference(qual_cols)
print(f'Quantitative Variables: {quant_cols}')

Quantitative Variables: {'Timestamp', 'Age', 'Male', 'Clicked on Ad', 'Daily Internet Usage', 'Daily Time Spent on Site', 'Area Income'}


In [13]:
# Derive calendar features from the raw Timestamp column, then drop it.
# The guard keeps this cell idempotent: once Timestamp has been dropped, re-running
# the cell leaves df unchanged instead of raising KeyError on the missing column.
if 'Timestamp' in df.columns:
    # Parse the text timestamps so the vectorized .dt accessor can be used
    timestamp = pd.to_datetime(df['Timestamp'])
    df = (
        df.assign(
            Month=timestamp.dt.month,
            Day=timestamp.dt.day,
            Hour=timestamp.dt.hour,
            Weekday=timestamp.dt.dayofweek,  # Monday = 0 ... Sunday = 6
            Date=timestamp.dt.date,          # calendar date without the time of day
        )
        # The derived columns make the raw timestamp redundant
        .drop(columns=['Timestamp'])
    )
df

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Clicked on Ad,Month,Day,Hour,Weekday,Date
0,68.95,35,61833.90,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,0,3,27,0,6,2016-03-27
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,0,4,4,1,0,2016-04-04
2,69.47,26,59785.94,236.50,Organic bottom-line service-desk,Davidton,0,San Marino,0,3,13,20,6,2016-03-13
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,0,1,10,2,6,2016-01-10
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,0,6,3,3,4,2016-06-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,72.97,30,71384.57,208.58,Fundamental modular algorithm,Duffystad,1,Lebanon,1,2,11,21,3,2016-02-11
996,51.30,45,67782.17,134.42,Grass-roots cohesive monitoring,New Darlene,1,Bosnia and Herzegovina,1,4,22,2,4,2016-04-22
997,51.63,51,42415.72,120.37,Expanded intangible solution,South Jessica,1,Mongolia,1,2,1,17,0,2016-02-01
998,55.55,19,41920.79,187.95,Proactive bandwidth-monitored policy,West Steven,0,Guatemala,0,3,24,2,3,2016-03-24


## Visualizing our dataset

In [18]:
# Distribution of daily time spent on site, coloured by 'Clicked on Ad':
# 0 refers to a user who didn't click the advertisement, 1 to a user who did.
px.histogram(
    data_frame=df,
    x='Daily Time Spent on Site',
    color='Clicked on Ad',
    template="none",
    color_discrete_sequence=px.colors.sequential.Darkmint,
)

In [19]:
# Distribution of daily time spent on site, coloured by the 'Male' indicator column

px.histogram(
    data_frame=df,
    x='Daily Time Spent on Site',
    color='Male',
    template="none",
    color_discrete_sequence=px.colors.sequential.Darkmint,
)



In [20]:
# Top 20 cities ranked by mean daily time spent on site.
# groupby already names the index 'City', so reset_index() alone yields the
# two-column frame (City, Daily Time Spent on Site) that the bar chart needs.
city_dailytime = (
    df.groupby('City')['Daily Time Spent on Site']
    .mean()
    .sort_values(ascending=False)
    .head(20)
    .reset_index()
)
# The figure is the cell's last expression, so it is what the notebook renders
px.bar(
    city_dailytime,
    x='City',
    y='Daily Time Spent on Site',
    template="none",
    color_discrete_sequence=px.colors.sequential.Redor,
)