In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,ShuffleSplit,StratifiedShuffleSplit,KFold,cross_validate
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV

In [3]:
# Load the advertising dataset from a CSV file located in the working directory.
df= pd.read_csv('advertising.csv')

In [4]:
# (rows, columns) of the loaded frame.
df.shape

(1000, 10)

In [5]:
# Preview the first five rows to eyeball column contents and types.
df.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,27-03-2016 00:53,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,04-04-2016 01:39,0
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,13-03-2016 20:35,0
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,10-01-2016 02:31,0
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,03-06-2016 03:36,0


In [7]:
# Distinct values present in the 'Clicked on Ad' column.
df['Clicked on Ad'].unique()

array([0, 1])

In [8]:
#### Missing values: number of nulls in each column
df.isnull().sum()

Daily Time Spent on Site    0
Age                         0
Area Income                 0
Daily Internet Usage        0
Ad Topic Line               0
City                        0
Male                        0
Country                     0
Timestamp                   0
Clicked on Ad               0
dtype: int64

In [9]:
### Duplicates: number of rows that exactly repeat an earlier row
df.duplicated().sum()

np.int64(0)

In [10]:
# List the column labels so they can be referenced by exact name below.
df.columns

Index(['Daily Time Spent on Site', 'Age', 'Area Income',
       'Daily Internet Usage', 'Ad Topic Line', 'City', 'Male', 'Country',
       'Timestamp', 'Clicked on Ad'],
      dtype='object')

In [12]:
# Model inputs: the numeric columns only. The text-valued columns
# ('Ad Topic Line', 'City', 'Country', 'Timestamp') are left out.
features= df[['Daily Time Spent on Site', 'Age', 'Area Income',
       'Daily Internet Usage','Male']]
# Label column. Double brackets keep it as a single-column DataFrame (2-D),
# which is the shape the cells below were written against.
target=df[['Clicked on Ad']]

#### Train_test_split (Holdout Method)

In [14]:
# Holdout split: 75% of the rows for training, the rest for testing.
# Stratifying on the target keeps the class ratio the same in both partitions,
# and the fixed random_state makes the split reproducible.
x_train,x_test,y_train,y_test= train_test_split(features,target,train_size=0.75,random_state=100,stratify=target)
# Sanity-check the partition sizes.
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(750, 5)
(250, 5)
(750, 1)
(250, 1)


In [39]:
from sklearn.metrics import accuracy_score

In [40]:
# Baseline: plain logistic regression fitted on the holdout training partition.
log_model=LogisticRegression()
# The labels are held as a single-column frame; flatten them to the 1-D vector
# scikit-learn expects so that fit() does not raise a DataConversionWarning.
log_model.fit(x_train,np.ravel(y_train))
y_pred_train= log_model.predict(x_train)
# Accuracy on the same rows the model was trained on.
accuracy_score(y_train,y_pred_train)

0.9213333333333333

In [42]:
# Score the baseline model on the held-out test partition.
y_pred_test = log_model.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred_test)

0.928

In [46]:
# Logistic regression that picks its regularisation strength by internal
# cross-validation. The splitter is built inline (10 random 70/30 splits,
# seeded) because the notebook-level `shuffle_split` is only defined in a
# later cell, so referencing it here fails on a fresh top-to-bottom run.
log_cv=LogisticRegressionCV(cv=ShuffleSplit(n_splits=10,test_size=0.3,random_state=50))
# Flatten the single-column label frame to 1-D to avoid a DataConversionWarning.
log_cv.fit(x_train,np.ravel(y_train))
y_pred_train= log_cv.predict(x_train)
# Accuracy on the training partition.
accuracy_score(y_train,y_pred_train)

0.9693333333333334

In [47]:
# Score the cross-validated model on the held-out test partition.
y_pred_test = log_cv.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred_test)

0.972

#### Shuffle Split

In [16]:
# Splitter producing 10 independent random train/test partitions, each holding
# out 30% of the rows; seeded so the partitions are reproducible.
shuffle_split=ShuffleSplit(n_splits=10,test_size=0.3,random_state=50)

In [18]:
# Print the train and test row positions generated for every split.
# NOTE(review): this dumps full index arrays for all splits; consider printing
# only the array lengths or the first few indices.
for train_split, test_split in shuffle_split.split(features,target):
    print(train_split,test_split)

[788 579 554 882  44   8 494 935 914 111 374 513 814 216 627 553 739 940
 180 589 246 897 408 829 761 399 108 770 855  87 345 323 168 758 110 638
  94 945 828 902 503 789 259  23 600  85 532 628 114 608 319 611 347 542
  34 120 913 537 714 473 870 172 999 337 100 442 353 653 496 195 746 227
 468 201 287 500 718 821 470 805  52 445  54 667  81 737 885 568 766 942
 708 454 546 137 179 961 645 467 273 143 151 624 849 354 485 175 534 691
 317  24 670  80 294 682 324 145 853 971 641 927 103 329 593 249 911 793
 149 637  65 212 282 350 390 150 214 890   1 424 973 734 977 803 295 767
  14 204 182 181  89 830 817 148 515 389 361 968 979 510 141 975 631 896
 954 303 867 135  97 123 376 842  67  83 664 281 660 919 884 208 920 238
 840 387 570 444 476 552 223 895 800 652 738 602 705 969 904 957 298 170
 610 102 493 322 372 735 680 692 505 557 781 946 987 268 236 871 397 567
 930 702 138 438 639 432 950 422 769 136 865  15 290  88 307   6 856 607
 980 512 461 892   3 549 247 555 107 272 377 140 20

In [37]:
import warnings

# NOTE(review): this installs a global filter that hides every warning for the
# rest of the kernel session, not just this cell. Presumably added to quiet
# scikit-learn warnings; confirm which ones and narrow the filter.
warnings.filterwarnings('ignore')

# Evaluate a fresh logistic regression over the shuffle-split partitions,
# keeping both train and test accuracy for every split.
cross = cross_validate(
    LogisticRegression(),
    features,
    target,
    scoring=['accuracy'],
    cv=shuffle_split,
    return_train_score=True,
)

In [38]:
# Average the per-split accuracies (rounded to two decimals) and report them.
train_accuracy = cross['train_accuracy'].mean().round(2)
test_accuracy = cross['test_accuracy'].mean().round(2)
print(f'Train_accuracy: {train_accuracy} , Test_accuracy: {test_accuracy}')

Train_accuracy: 0.93 , Test_accuracy: 0.93


#### Stratified shuffle split

In [49]:
# Stratified variant of the shuffle splitter: 5 seeded random partitions with
# 30% of the rows held out in each.
strat_shuffle=StratifiedShuffleSplit(n_splits=5,test_size=0.3,random_state=50)

In [50]:
# Evaluate a fresh logistic regression over the stratified shuffle splits,
# keeping both train and test accuracy for every split.
cross = cross_validate(
    LogisticRegression(),
    features,
    target,
    scoring=['accuracy'],
    cv=strat_shuffle,
    return_train_score=True,
)

In [51]:
# Average the per-split accuracies (rounded to two decimals) and report them.
train_accuracy = cross['train_accuracy'].mean().round(2)
test_accuracy = cross['test_accuracy'].mean().round(2)
print(f'Train_accuracy: {train_accuracy} , Test_accuracy: {test_accuracy}')

Train_accuracy: 0.94 , Test_accuracy: 0.93


#### KFold