In [None]:
# Last amended: 11th July, 2020
# My folder: /home/ashok/Documents/advertising
# VM: lubuntu_machinelearning_I
# Objective:
#           Work with mixed datatypes: numeric, categorical and text
#           Process each datatype differently,
#           using Pipelines and a ColumnTransformer

In [90]:
# 1.0 Call libraries
%reset -f
import numpy as np
import pandas as pd
# 1.1
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
# 1.2
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# 1.3
from sklearn.tree import DecisionTreeClassifier
import os

In [108]:
# 1.4 Display output not only of last command but all commands in a cell
#     (IPython's default shows only the value of a cell's last expression)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [91]:
# 2.0 Move into the folder that holds the data file.
#     Paths used on other machines are kept here for reference:
#os.chdir("D:\\data\\OneDrive\\Documents\\advertising")
#os.chdir("/home/ashok/Documents/advertising")
data_dir = "e:\\OneDrive\\Documents\\advertising"
os.chdir(data_dir)

# 2.1 Load the CSV; the 'Timestamp' column is parsed
#     into datetime values while the file is read
ad = pd.read_csv("ad_mis.csv", parse_dates = ['Timestamp'])

# 2.2 Look at the first rows to decide which
#     columns need to be amended
ad.head(3)

Unnamed: 0.1,Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad,age_cat,areaIncome_cat
0,0,68.95,35.0,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,no,middle,high
1,1,,31.0,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,no,young,high
2,2,69.47,26.0,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,no,young,high


In [92]:
# 2.3 Count the missing values in each column
ad.isna().sum()

Unnamed: 0                    0
Daily Time Spent on Site    100
Age                         100
Area Income                 100
Daily Internet Usage        100
Ad Topic Line                 0
City                          0
Male                          0
Country                       0
Timestamp                     0
Clicked on Ad                 0
age_cat                     100
areaIncome_cat              100
dtype: int64

In [152]:
# 2.4 Normalise column names: remove spaces and convert to lower case
ad.columns = [ name.lower().replace(" ", "") for name in ad.columns ]

In [94]:
# 2.5 Remove the columns that are not needed any further
unwanted_cols = ['unnamed:0', 'city', 'country']
ad.drop(columns = unwanted_cols, inplace = True)

In [96]:
# 2.6 Separate X and y
#     pop() removes the target column from 'ad' and returns it
y = ad.pop("clickedonad")
y
#     X is a second name for the same DataFrame object as 'ad' (not a copy),
#     so columns added to or dropped from 'ad' afterwards are also seen via X
X = ad

0       no
1       no
2       no
3       no
4       no
      ... 
995    yes
996    yes
997    yes
998     no
999    yes
Name: clickedonad, Length: 1000, dtype: object

In [98]:
# 3.0 Derive hour of day, weekday number and month number from the timestamp.
#     Columns are added in place on 'ad'.
ts = ad['timestamp'].dt
ad['hourOfDay']   = ts.hour
ad['weekday']     = ts.weekday
# NOTE(review): the column is named 'quarter' but is filled with the
#               month number (.dt.month) -- confirm which one was intended
ad['quarter']     = ts.month

# 3.1 Check the new columns
ad.head(2)


Unnamed: 0,dailytimespentonsite,age,areaincome,dailyinternetusage,adtopicline,male,timestamp,age_cat,areaincome_cat,hourOfDay,weekday,quarter
0,68.95,35.0,61833.9,256.09,Cloned 5thgeneration orchestration,0,2016-03-27 00:53:11,middle,high,0,6,3
1,,31.0,68441.85,193.77,Monitored national standardization,1,2016-04-04 01:39:02,young,high,1,0,4


In [99]:
    
# 3.2 Bin the hour of day into named parts of the day.
#     One label per interval between consecutive edges;
#     for example hours 0 to 6 become "earlymorning"
hour_edges  = [-1,6,12,17,20,22,24]
hour_labels = ["earlymorning", "morning", "afternoon", "evening", "night","latenight"]
ad["hour"] = pd.cut(ad['hourOfDay'], bins = hour_edges, labels = hour_labels)


# 3.3 Replace weekday numbers by weekday names using Series.map().
#     The lookup is built from the list position: 0 -> Monday ... 6 -> Sunday
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
             'Friday', 'Saturday', 'Sunday']
ad['weekday'] = ad['weekday'].map(dict(enumerate(day_names)))

# 3.4 Inspect the result
ad.head()


0    Sunday
1    Monday
2    Sunday
3    Sunday
4    Friday
Name: weekday, dtype: object

In [102]:
# 3.5 Drop the raw timestamp and the intermediate hour-of-day column
helper_cols = ['timestamp', 'hourOfDay']
ad.drop(columns = helper_cols, inplace = True)

In [104]:
# 4.0 Column dtypes
#     X is the same DataFrame object as 'ad', so the derived
#     columns show up here as well
X.dtypes

dailytimespentonsite     float64
age                      float64
areaincome               float64
dailyinternetusage       float64
adtopicline               object
male                       int64
age_cat                   object
areaincome_cat            object
weekday                   object
quarter                    int64
hour                    category
dtype: object

In [154]:
# 4.1 Separate numeric and categorical columns
#     Select every numeric dtype rather than only float64/int64, so the
#     result does not depend on the integer width of the derived columns
num_cols = X.select_dtypes(include = 'number').columns
num_cols
#     'quarter' is treated as categorical. Remove it by name, not by
#     position: slicing off the last entry would silently drop a different
#     column if the column order or the dtypes ever change.
#     (Index.drop raises KeyError if 'quarter' is not among the columns.)
num_cols = num_cols.drop('quarter')

Index(['dailytimespentonsite', 'age', 'areaincome', 'dailyinternetusage',
       'male', 'quarter'],
      dtype='object')

In [120]:
# 4.2 Names of the object-dtype columns as a plain list,
#     with 'quarter' appended because it is treated as categorical
cat_cols = list(X.select_dtypes(include = 'object').columns)
cat_cols.append('quarter')
cat_cols

['adtopicline', 'age_cat', 'areaincome_cat', 'weekday', 'quarter']

In [121]:
# 4.3 Processing plan: one list of column names per kind of treatment
num_cols_mean     = ["dailytimespentonsite", "age"]        # numeric group 1
num_cols_median   = ["areaincome", "dailyinternetusage"]   # numeric group 2
cat_cols_mf       = ["age_cat"]                            # categorical group 1
cat_cols_constant = ["areaincome_cat"]                     # categorical group 2
text_col          = ["adtopicline"]                        # free-text column

In [122]:
# 4.4 Transformers: four imputers that fill missing values
#     (mean, median, most frequent value, constant "missing")
#     and a TF-IDF vectorizer that skips English stop words
si_mean     = SimpleImputer(strategy="mean")
si_median   = SimpleImputer(strategy="median")
si_mf       = SimpleImputer(strategy="most_frequent")
si_constant = SimpleImputer(strategy="constant", fill_value="missing")
tfidf       = TfidfVectorizer(stop_words="english")

In [130]:
# 5.0 Create pipelines to fill missing values
#     and to perform further processing

# Numeric columns: impute with the mean, then standardise
pipe_mean = Pipeline(
                 steps = [
                     ("si_mean", si_mean),
                     ("ss", StandardScaler())
                 ])

# Numeric columns: impute with the median, then standardise
pipe_median = Pipeline(
                 steps = [
                     ("si_median", si_median),
                     ("ss", StandardScaler())
                 ])

# Categorical columns: impute with the most frequent value, then one-hot encode.
# handle_unknown="ignore": a category that was not present when the encoder
# was fitted is encoded as all zeros at transform time instead of raising
# an error, so predicting on held-out rows cannot fail on an unseen level.
pipe_mf = Pipeline(
                 steps = [
                     ("si_mf", si_mf),
                     ("ohe", OneHotEncoder(handle_unknown = "ignore"))
                 ])

# Categorical columns: impute with the constant "missing", then one-hot encode
# (same handling of unseen categories as above)
pipe_const = Pipeline(
                 steps = [
                     ("si_constant", si_constant),
                     ("ohe", OneHotEncoder(handle_unknown = "ignore"))
                 ])



In [138]:
# 5.1 Collect all pipes in a column transformer.
#     Each entry is (name, transformer, columns it is applied to).

ct = ColumnTransformer(
        transformers = [
            ("mean",   pipe_mean,   num_cols_mean),
            ("median", pipe_median, num_cols_median),
            ("mf",     pipe_mf,     cat_cols_mf),
            ("const",  pipe_const,  cat_cols_constant),
            # No NaN in these two columns, so they are only one-hot encoded.
            # handle_unknown="ignore": a level not seen during fit is encoded
            # as all zeros at transform time instead of raising an error.
            ("ohe",    OneHotEncoder(handle_unknown = "ignore"), ["weekday", "hour"]),
            # The column is given as a plain string (not a list) so that the
            # vectorizer receives a 1-D sequence of documents
            ("tfidf",  tfidf,       "adtopicline")
        ],
        remainder = 'passthrough',   # columns not listed above are kept unchanged
        sparse_threshold = 0         # output is a dense matrix
     )

In [139]:
# 5.2 Does columntransformer work?
#     Trial run on the whole of X; the transformed data is kept in 'h'
h = ct.fit_transform(X)

In [140]:
# 5.3 It works
#     Shape of the transformed data, then its first three rows
h.shape
h[:3]


(1000, 356)

array([[ 0.26271778, -0.11684714,  0.51873285, ...,  0.        ,
         0.        ,  3.        ],
       [ 0.        , -0.59485816,  1.03042177, ...,  0.        ,
         1.        ,  4.        ],
       [ 0.29720437, -1.19237194,  0.36014835, ...,  0.        ,
         0.        ,  3.        ]])

In [141]:
# 6.0 Create final pipeline: column transformer followed by a decision tree.
#     random_state is fixed so that fitting on the same data
#     builds the same tree on every run
pipe = Pipeline([('ct', ct), ('dt', DecisionTreeClassifier(random_state = 42))])

In [148]:
# 7.0 Split data: 30% of the rows are held out for testing.
#     random_state is fixed so that the shuffle, and therefore the
#     train/test membership, is the same on every run
X_train,X_test,y_train,y_test = train_test_split(X, y,
                                                 test_size = 0.3,
                                                 shuffle = True,
                                                 random_state = 42)

In [149]:
# 7.1 Train estimator
#     Fits the whole pipeline (column transformer, then tree) on the training split
pipe.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('ct',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0, transformer_weights=None,
                                   transformers=[('mean',
                                                  Pipeline(memory=None,
                                                           steps=[('si_mean',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='mean',
                                                                                 verbose=0)),
        

In [150]:
# 7.2 Make prediction
#     Predicted labels for the held-out test rows
y_pred = pipe.predict(X_test)

In [151]:
# 7.3 Get performance: fraction of test rows whose prediction
#     equals the true label (one recorded run gave about 0.9167)
hits = (y_pred == y_test)
hits.sum() / len(hits)

0.9166666666666666

In [None]:
####################### I am done ##################