<a href="https://colab.research.google.com/github/jyothirmai318/Data-Analysis/blob/main/Data_Analysis_Cross_Validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **ICE-9** Cross Validation

### Tutorial

#### Performing EDA on the dataset

In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, LeaveOneOut,\
RepeatedKFold, train_test_split
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Missing values in this CSV are written as ' ?' (note the leading space);
# na_values maps them to NaN so that isna() can count them.
df = pd.read_csv('income_evaluation.csv', na_values=' ?')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [None]:
df.isna().sum()

Unnamed: 0,0
age,0
workclass,1836
fnlwgt,0
education,0
education-num,0
marital-status,0
occupation,1843
relationship,0
race,0
sex,0


In [None]:
# Replace every NaN with the literal category 'missing'.
# Rebinding (rather than inplace=True) keeps the cell a plain assignment.
df = df.fillna('missing')

In [None]:
df.isna().sum()

Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
education-num,0
marital-status,0
occupation,0
relationship,0
race,0
sex,0


In [None]:
# Column names in this CSV carry a leading space, hence ' income' (not 'income').
y = df[' income']
X = df.drop(columns=' income')

In [None]:
X.shape

(32561, 14)

In [None]:
y.value_counts()

Unnamed: 0_level_0,count
income,Unnamed: 1_level_1
<=50K,24720
>50K,7841


### 1. KFold Cross Validation

In [None]:
kf = KFold(n_splits=5)

In [None]:
32561/5

6512.2

In [None]:
6512*4

26048

In [None]:
# Show which row positions fall into the train / test partition of each fold.
for fold_no, (train_set, test_set) in enumerate(kf.split(X=X), start=1):
    print("iteration ", fold_no)
    print(train_set, " having :", len(train_set))
    print(test_set, " having :", len(test_set))
    print("-------------------------")

iteration  1
[ 6513  6514  6515 ... 32558 32559 32560]  having : 26048
[   0    1    2 ... 6510 6511 6512]  having : 6513
-------------------------
iteration  2
[    0     1     2 ... 32558 32559 32560]  having : 26049
[ 6513  6514  6515 ... 13022 13023 13024]  having : 6512
-------------------------
iteration  3
[    0     1     2 ... 32558 32559 32560]  having : 26049
[13025 13026 13027 ... 19534 19535 19536]  having : 6512
-------------------------
iteration  4
[    0     1     2 ... 32558 32559 32560]  having : 26049
[19537 19538 19539 ... 26046 26047 26048]  having : 6512
-------------------------
iteration  5
[    0     1     2 ... 26046 26047 26048]  having : 26049
[26049 26050 26051 ... 32558 32559 32560]  having : 6512
-------------------------


In [None]:
num_cols = X.select_dtypes(include=np.number).columns
num_cols

Index(['age', ' fnlwgt', ' education-num', ' capital-gain', ' capital-loss',
       ' hours-per-week'],
      dtype='object')

In [None]:
cat_cols = X.select_dtypes(exclude=np.number).columns
cat_cols

Index([' workclass', ' education', ' marital-status', ' occupation',
       ' relationship', ' race', ' sex', ' native-country'],
      dtype='object')

In [None]:
# Preprocessing: scale the numeric columns and one-hot encode the categorical ones.
ct = ColumnTransformer([
    # RobustScaler centres/scales with the median and IQR, so outliers have less influence.
    ('rob', RobustScaler(), num_cols),
    # handle_unknown='ignore': a category not seen in the training fold is encoded
    # as all zeros at transform time instead of raising an error.
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), cat_cols)
])

In [None]:
# Chain preprocessing and model: each pipe.fit() re-fits the scaler/encoder on
# the rows it is given, so held-out folds never influence the preprocessing.
pipe = Pipeline([
    ('ct_step', ct),
    # Small forest (10 trees) with a fixed seed so repeated fits are reproducible.
    ('model', RandomForestClassifier(n_estimators=10, random_state=0))
])

In [None]:
X.loc[6513:].head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
6513,29,Private,280344,Bachelors,13,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,United-States
6514,45,Private,202496,Bachelors,13,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,37,United-States
6515,61,Self-emp-inc,134768,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,United-States
6516,40,Private,175686,Some-college,10,Divorced,Adm-clerical,Unmarried,White,Female,0,0,40,United-States
6517,24,Private,194748,HS-grad,9,Never-married,Transport-moving,Not-in-family,White,Female,0,0,49,United-States


In [None]:
# Manual 5-fold CV: fit on the training folds, score (accuracy) on the held-out fold.
# kf.split yields *positional* row indices, so rows are selected with .iloc;
# label-based .loc / y[...] lookups would only be correct while the frame
# carries a default RangeIndex.
scores = []
for i, (train_set, test_set) in enumerate(kf.split(X), start=1):
    pipe.fit(X.iloc[train_set], y.iloc[train_set])
    sco = pipe.score(X.iloc[test_set], y.iloc[test_set])
    scores.append(sco)
    print("iteration ", i)

iteration  1
iteration  2
iteration  3
iteration  4
iteration  5


In [None]:
np.array(scores)

array([0.84784278, 0.84520885, 0.84613022, 0.84858722, 0.85165848])

In [None]:
np.array(scores).mean()

0.8478855085142512

In [None]:
np.array(scores).std()

0.0022349531977626388

### 2. Stratified KFold Cross Validation

In [None]:
y.value_counts()

Unnamed: 0_level_0,count
income,Unnamed: 1_level_1
<=50K,24720
>50K,7841


In [None]:
7841/5

1568.2

In [None]:
24720/5

4944.0

In [None]:
4944*4

19776

In [None]:
skf = StratifiedKFold(n_splits=5)

In [None]:
# Manual stratified 5-fold CV: every fold keeps the class ratio of y.
# skf.split yields *positional* row indices, so rows are selected with .iloc;
# label-based .loc / y[...] lookups would only be correct while the frame
# carries a default RangeIndex.
scores_skf = []
for i, (train_set, test_set) in enumerate(skf.split(X, y), start=1):
    pipe.fit(X.iloc[train_set], y.iloc[train_set])
    sco = pipe.score(X.iloc[test_set], y.iloc[test_set])
    scores_skf.append(sco)
    print("iteration ", i)

iteration  1
iteration  2
iteration  3
iteration  4
iteration  5


In [None]:
scores_skf

[0.8473821587594043,
 0.8432125307125307,
 0.8421375921375921,
 0.8425982800982801,
 0.8536547911547911]

In [None]:
# Print each stratified split together with the class counts on both sides,
# to show that the class ratio is preserved in every fold.
# The indices from skf.split are positional, hence y.iloc[...] rather than y[...].
for i, (train_set, test_set) in enumerate(skf.split(X=X, y=y), start=1):
    print("iteration ", i)
    print(train_set, " having :", len(train_set))
    print(test_set, " having :", len(test_set))
    print()
    print("y train counts: \n", y.iloc[train_set].value_counts())
    print("y test counts: \n", y.iloc[test_set].value_counts())
    print("-------------------------")

iteration  1
[ 6499  6500  6512 ... 32558 32559 32560]  having : 26048
[   0    1    2 ... 6514 6515 6516]  having : 6513

y train counts: 
  income
<=50K    19776
>50K      6272
Name: count, dtype: int64
y test counts: 
  income
<=50K    4944
>50K     1569
Name: count, dtype: int64
-------------------------
iteration  2
[    0     1     2 ... 32558 32559 32560]  having : 26049
[ 6499  6500  6512 ... 13121 13123 13125]  having : 6512

y train counts: 
  income
<=50K    19776
>50K      6273
Name: count, dtype: int64
y test counts: 
  income
<=50K    4944
>50K     1568
Name: count, dtype: int64
-------------------------
iteration  3
[    0     1     2 ... 32558 32559 32560]  having : 26049
[12997 12999 13000 ... 19727 19729 19733]  having : 6512

y train counts: 
  income
<=50K    19776
>50K      6273
Name: count, dtype: int64
y test counts: 
  income
<=50K    4944
>50K     1568
Name: count, dtype: int64
-------------------------
iteration  4
[    0     1     2 ... 32558 32559 32560]  ha

In [None]:
# NOTE: with an integer cv and a classifier, cross_val_score uses StratifiedKFold
# (not plain KFold) — which is why these scores equal scores_skf from the manual
# stratified loop, despite the variable name.
result_kf = cross_val_score(estimator=pipe, X=X, y=y, scoring='accuracy', cv=5)

In [None]:
result_kf

array([0.84738216, 0.84321253, 0.84213759, 0.84259828, 0.85365479])

### 3. Leave-One-Out Cross Validation

In [None]:
# Timing baseline: plain 10-fold CV on the full dataset, for comparison with
# the leave-one-out run further down.
start = time.time()
result_kf10 = cross_val_score(pipe, X, y, scoring='accuracy', cv=KFold(n_splits=10))
elapsed = time.time() - start
print("time taken: ", elapsed)

time taken:  7.767460584640503


In [None]:
result_kf10

array([0.83880872, 0.85165848, 0.84981572, 0.84367322, 0.85135135,
       0.84613022, 0.84520885, 0.84797297, 0.8544226 , 0.84459459])

In [None]:
# Leave-one-out CV fits one model per sample, so it is run here on the first
# 100 rows only; each fold's accuracy is therefore either 0 or 1.
start = time.time()
result_loocv = cross_val_score(estimator=pipe, X=X.head(100), y=y.head(100),
                               scoring='accuracy', cv=LeaveOneOut())
print("time taken: ", time.time()-start)

time taken:  3.749863624572754


In [None]:
# Number of 100-row chunks in the full dataset (X.shape[0] == 32561), used
# below for a rough extrapolation of the leave-one-out runtime.
32561/100

325.31

In [None]:
325*8

2600

In [None]:
325*8/60

43.333333333333336

In [None]:
result_loocv

array([1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 1.,
       1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1.,
       1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0.,
       0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
       1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1.])

In [None]:
result_loocv.mean()

0.8

### 4. Repeated KFold Cross Validation

In [None]:
# 5-fold CV repeated 5 times with a different shuffle each time (25 fits).
# random_state pins the shuffles so the 25 scores are reproducible across runs.
start = time.time()
result_rkf = cross_val_score(estimator=pipe, X=X, y=y, scoring='accuracy',
                              cv=RepeatedKFold(n_splits=5, n_repeats=5, random_state=0))
print("time taken: ", time.time()-start)

time taken:  15.593996047973633


In [None]:
result_rkf

array([0.84707508, 0.85227273, 0.85058354, 0.83983415, 0.85227273,
       0.84584677, 0.85257985, 0.83891278, 0.84781941, 0.85242629,
       0.84277599, 0.84689803, 0.84889435, 0.84597666, 0.85012285,
       0.85275603, 0.84075553, 0.85257985, 0.85227273, 0.83968059,
       0.84814985, 0.84628378, 0.85135135, 0.84735872, 0.8495086 ])

### 5. Performing cross validation on a training set, and then verifying our results on a separate test set to generalize our results.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
start = time.time()
result_tts = cross_val_score(estimator=pipe, X=X_train, y=y_train,
                              scoring='accuracy', cv=KFold(n_splits=5))
print("time taken: ", time.time()-start)

time taken:  3.413156270980835


In [None]:
result_tts

array([0.85547025, 0.84702495, 0.84184261, 0.84661163, 0.84968324])

In [None]:
pipe.fit(X_train, y_train)

In [None]:
pipe.score(X_test, y_test)

0.8446184553968985

In [None]:
cross_val_score(estimator=pipe, X=X_train, y=y_train,
                scoring='accuracy', cv=KFold(n_splits=5))

array([0.85547025, 0.84702495, 0.84184261, 0.84661163, 0.84968324])

In [None]:
from sklearn.metrics import get_scorer_names

scoring_metrics = sorted(get_scorer_names())
print(scoring_metrics)


['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'completeness_score', 'd2_absolute_error_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted', 'matthews_corrcoef', 'max_error', 'mutual_info_score', 'neg_brier_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_gamma_deviance', 'neg_mean_poisson_deviance', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'neg_negative_likelihood_ratio', 'neg_root_mean_squared_error', 'neg_root_mean_squared_log_error', 'normalized_mutual_info_score', 'positive_likelihood_ratio', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'rand_score', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'r

### Write your observations of the above tutorial here. **(5%)**

### **Task-1 (40%)**
##### Perform cross validation on the tutorial given, using the dataset attached. After you have successfully executed it and fixed any issues with the current version, use a dataset of your choice and execute it again.

In [None]:
df = pd.read_csv('customer.csv', na_values='NaN')
df.head()

Unnamed: 0,Age,Customer_Type,Annual_Income,Preferred_Category,Purchase_Frequency,Marital_Status,Occupation,Household_Role,Ethnicity,Gender,Online_Spend,InStore_Spend,Weekly_Visits,Country,Loyalty_Status
0,62,VIP,77600,Toys,15,Divorced,Business Owner,Self,Pacific Islander,Female,17432,8442,4,Brazil,Platinum
1,18,Repeat,47784,Home Decor,19,Single,Sales Executive,Head,Pacific Islander,Male,15552,5898,1,United States,Bronze
2,21,New,64239,Clothing,12,Single,Software Engineer,Parent,Asian,Female,18988,2622,8,India,Platinum
3,21,New,118229,Sports,10,Widowed,Product Manager,Parent,Caucasian,Male,5447,1565,10,Canada,Bronze
4,57,Returning,59335,Home Decor,15,Widowed,Graphic Designer,Spouse,Pacific Islander,Male,12423,139,12,South Africa,Silver


In [None]:
df.isna().sum()

Unnamed: 0,0
Age,0
Customer_Type,0
Annual_Income,0
Preferred_Category,0
Purchase_Frequency,0
Marital_Status,0
Occupation,0
Household_Role,0
Ethnicity,0
Gender,0


In [None]:
df.fillna('missing', inplace=True)

In [None]:
df.isna().sum()

Unnamed: 0,0
Age,0
Customer_Type,0
Annual_Income,0
Preferred_Category,0
Purchase_Frequency,0
Marital_Status,0
Occupation,0
Household_Role,0
Ethnicity,0
Gender,0


In [None]:
X = df.drop('Loyalty_Status', axis=1)
y = df['Loyalty_Status']

In [None]:
X.shape

(200, 14)

In [None]:
200/5

40.0

In [None]:
40*4

160

In [None]:
y.value_counts()

Unnamed: 0_level_0,count
Loyalty_Status,Unnamed: 1_level_1
Gold,61
Platinum,48
Silver,47
Bronze,44


In [None]:
kf = KFold(n_splits=5)

In [None]:
# Show which row positions fall into the train / test partition of each fold.
for fold_no, (train_set, test_set) in enumerate(kf.split(X=X), start=1):
    print("iteration ", fold_no)
    print(train_set, " having :", len(train_set))
    print(test_set, " having :", len(test_set))
    print("-------------------------")

iteration  1
[ 40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57
  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75
  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93
  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111
 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183
 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199]  having : 160
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39]  having : 40
-------------------------
iteration  2
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  

In [None]:
num_cols = X.select_dtypes(include=np.number).columns
num_cols

Index(['Age', 'Annual_Income', 'Purchase_Frequency', 'Online_Spend',
       'InStore_Spend', 'Weekly_Visits'],
      dtype='object')

In [None]:
cat_cols = X.select_dtypes(exclude=np.number).columns
cat_cols

Index(['Customer_Type', 'Preferred_Category', 'Marital_Status', 'Occupation',
       'Household_Role', 'Ethnicity', 'Gender', 'Country'],
      dtype='object')

In [None]:
ct = ColumnTransformer([
    ('rob', RobustScaler(), num_cols),
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), cat_cols)
])

In [None]:
pipe = Pipeline([
    ('ct_step', ct),
    ('model', RandomForestClassifier(n_estimators=10, random_state=0))
])

In [None]:
X.loc[40:].head()

Unnamed: 0,Age,Customer_Type,Annual_Income,Preferred_Category,Purchase_Frequency,Marital_Status,Occupation,Household_Role,Ethnicity,Gender,Online_Spend,InStore_Spend,Weekly_Visits,Country
40,19,New,31834,Beauty,11,Divorced,Consultant,Head,Asian,Female,19607,2109,10,Germany
41,27,Returning,102865,Sports,13,Single,Consultant,Child,Hispanic,Male,12301,7156,14,Brazil
42,50,Returning,82570,Toys,9,Widowed,Business Owner,Head,Hispanic,Male,4190,7355,14,Australia
43,49,VIP,93929,Clothing,2,Widowed,Consultant,Child,Pacific Islander,Male,3815,3575,1,India
44,28,VIP,76620,Sports,6,Divorced,Business Owner,Head,Middle Eastern,Female,18804,4893,9,Germany


In [None]:
# Manual 5-fold CV on the customer data: fit on the training folds, score
# (accuracy) on the held-out fold.
# kf.split yields *positional* row indices, so rows are selected with .iloc;
# label-based .loc / y[...] lookups would only be correct while the frame
# carries a default RangeIndex.
scores = []
for i, (train_set, test_set) in enumerate(kf.split(X), start=1):
    pipe.fit(X.iloc[train_set], y.iloc[train_set])
    sco = pipe.score(X.iloc[test_set], y.iloc[test_set])
    scores.append(sco)
    print("iteration ", i)

iteration  1
iteration  2
iteration  3
iteration  4
iteration  5


In [None]:
np.array(scores)



array([0.25 , 0.25 , 0.175, 0.4  , 0.3  ])

In [None]:
np.array(scores).mean()


0.275

In [None]:

np.array(scores).std()

0.07416198487095664

Stratified KFold Cross Validation


In [None]:
y.value_counts()

Unnamed: 0_level_0,count
Loyalty_Status,Unnamed: 1_level_1
Gold,61
Platinum,48
Silver,47
Bronze,44


In [None]:
skf = StratifiedKFold(n_splits=5)

In [None]:
# Manual stratified 5-fold CV on the customer data.
# skf.split yields *positional* row indices, so rows are selected with .iloc;
# label-based .loc / y[...] lookups would only be correct while the frame
# carries a default RangeIndex.
scores_skf = []
for i, (train_set, test_set) in enumerate(skf.split(X, y), start=1):
    pipe.fit(X.iloc[train_set], y.iloc[train_set])
    sco = pipe.score(X.iloc[test_set], y.iloc[test_set])
    scores_skf.append(sco)
    print("iteration ", i)

iteration  1
iteration  2
iteration  3
iteration  4
iteration  5


In [None]:
scores_skf

[0.275, 0.125, 0.2, 0.15, 0.375]

In [None]:
# Print each stratified split together with the class counts on both sides,
# to show that the class ratio is preserved in every fold.
# The indices from skf.split are positional, hence y.iloc[...] rather than y[...].
for i, (train_set, test_set) in enumerate(skf.split(X=X, y=y), start=1):
    print("iteration ", i)
    print(train_set, " having :", len(train_set))
    print(test_set, " having :", len(test_set))
    print()
    print("y train counts: \n", y.iloc[train_set].value_counts())
    print("y test counts: \n", y.iloc[test_set].value_counts())
    print("-------------------------")

iteration  1
[ 27  31  35  37  41  43  44  45  46  47  48  49  50  52  53  55  56  57
  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75
  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93
  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111
 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183
 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199]  having : 160
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 28 29 30 32 33 34 36 38 39 40 42 51 54]  having : 40

y train counts: 
 Loyalty_Status
Gold        49
Platinum    38
Silver      38
Bronze      35
Name: count, dtype: int64
y test counts: 
 Loyalty_Status
Gold        12
Platinum    10
Bronz

In [None]:
result_kf = cross_val_score(estimator=pipe, X=X, y=y, scoring='accuracy', cv=5)

In [None]:
result_kf

array([0.275, 0.125, 0.2  , 0.15 , 0.375])

Leave One Out Cross Validation

In [None]:
# Timing baseline: plain 10-fold CV on the customer data, for comparison with
# the leave-one-out run further down.
start = time.time()
result_kf10 = cross_val_score(pipe, X, y, scoring='accuracy', cv=KFold(n_splits=10))
elapsed = time.time() - start
print("time taken: ", elapsed)

time taken:  0.5304703712463379


In [None]:
result_kf10

array([0.35, 0.25, 0.3 , 0.15, 0.3 , 0.05, 0.25, 0.2 , 0.25, 0.25])

In [None]:
start = time.time()
result_loocv = cross_val_score(estimator=pipe, X=X.head(100), y=y.head(100),
                               scoring='accuracy', cv=LeaveOneOut())
print("time taken: ", time.time()-start)

time taken:  3.803877115249634


In [None]:
result_loocv

array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0.,
       1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0.])

In [None]:
result_loocv.mean()

0.18

Repeated KFold Cross Validation

In [None]:
# 5-fold CV repeated 5 times with a different shuffle each time (25 fits).
# random_state pins the shuffles so the 25 scores are reproducible across runs.
start = time.time()
result_rkf = cross_val_score(estimator=pipe, X=X, y=y, scoring='accuracy',
                              cv=RepeatedKFold(n_splits=5, n_repeats=5, random_state=0))
print("time taken: ", time.time()-start)

time taken:  1.1097216606140137


In [None]:
result_rkf

array([0.275, 0.25 , 0.15 , 0.225, 0.225, 0.225, 0.125, 0.15 , 0.15 ,
       0.25 , 0.325, 0.25 , 0.275, 0.275, 0.175, 0.225, 0.175, 0.225,
       0.225, 0.25 , 0.3  , 0.35 , 0.3  , 0.275, 0.2  ])

Performing cross validation on a training set, and then verifying our results on a separate test set to generalize our results.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
start = time.time()
result_tts = cross_val_score(estimator=pipe, X=X_train, y=y_train,
                              scoring='accuracy', cv=KFold(n_splits=5))
print("time taken: ", time.time()-start)

time taken:  0.2985076904296875


In [None]:
result_tts

array([0.1875 , 0.1875 , 0.34375, 0.375  , 0.15625])

In [None]:
# Fit on the full training split and report accuracy on the held-out test split.
# The score is printed explicitly: a notebook cell only displays its *last*
# expression, so a bare pipe.score(...) in the middle of the cell is discarded.
pipe.fit(X_train, y_train)
print("test accuracy: ", pipe.score(X_test, y_test))
# Cross-validated accuracy on the training split, for comparison with the above.
cross_val_score(estimator=pipe, X=X_train, y=y_train,
                scoring='accuracy', cv=KFold(n_splits=5))

array([0.1875 , 0.1875 , 0.34375, 0.375  , 0.15625])

In [None]:
from sklearn.metrics import get_scorer_names

scoring_metrics = sorted(get_scorer_names())
print(scoring_metrics)

['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'completeness_score', 'd2_absolute_error_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted', 'matthews_corrcoef', 'max_error', 'mutual_info_score', 'neg_brier_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_gamma_deviance', 'neg_mean_poisson_deviance', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'neg_negative_likelihood_ratio', 'neg_root_mean_squared_error', 'neg_root_mean_squared_log_error', 'normalized_mutual_info_score', 'positive_likelihood_ratio', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'rand_score', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'r

### Tutorial

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
iris=load_iris()
X=iris.data
Y=iris.target
# print("Size of Dataset {}".format(len(X)))
# logreg=LogisticRegression()
# x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.3,random_state=42)
# logreg.fit(x_train,y_train)
# predict=logreg.predict(x_test)
# print("Accuracy score on training set is {}".format(accuracy_score(logreg.predict(x_train),y_train)))
# print("Accuracy score on test set is {}".format(accuracy_score(predict,y_test)))

#### Monte Carlo Cross Validation

In [None]:
from sklearn.model_selection import ShuffleSplit,cross_val_score
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

In [None]:
logreg=LogisticRegression()
# Monte Carlo CV: 10 independent random splits, each training on 50% of the rows
# and testing on 30% (the remaining 20% are unused in that split).
# random_state makes the random splits — and hence the scores — reproducible.
shuffle_split=ShuffleSplit(test_size=0.3,train_size=0.5,n_splits=10,random_state=0)

In [None]:
scores=cross_val_score(logreg,iris.data,iris.target,cv=shuffle_split)

In [None]:
# "\n" (previously a bare "n") puts the score array on its own line.
print("cross Validation scores:\n {}".format(scores))
print("Average Cross Validation score :{}".format(scores.mean()))

cross Validation scores:n [0.97777778 0.95555556 0.97777778 0.93333333 0.95555556 1.
 0.97777778 0.88888889 1.         0.93333333]
Average Cross Validation score :0.96


#### Time Series Cross Validation

In [None]:
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
# Toy data: six observations, assumed to be listed in time order.
# NOTE: this rebinds the notebook-level X and y used by earlier sections.
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4, 5, 6])
# Default constructor: the printed repr shows n_splits=5.
time_series = TimeSeriesSplit()
print(time_series)
# Each split trains on a growing prefix of the data and tests on the sample
# that immediately follows it, so the model never sees "future" rows.
for train_index, test_index in time_series.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Materialise the split; these are overwritten on every iteration.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None)
TRAIN: [0] TEST: [1]
TRAIN: [0 1] TEST: [2]
TRAIN: [0 1 2] TEST: [3]
TRAIN: [0 1 2 3] TEST: [4]
TRAIN: [0 1 2 3 4] TEST: [5]


### **Task-2 (25%)**
### Question-2(a) **(15%)**
#### Explain the types of cross validation techniques (7 types). Also write the pros and cons of each validation technique in detail.

### Question-2(b) **(10%)**
#### Provide some examples of real-world problems that require cross validation techniques.

### **Task-3 (30%)**
### Question-3(a)
Consider a dataset with 4 samples. Perform KFold, repeated KFold and LOO cross validation.

In [None]:
data =[
    {'Employee ID': 101, 'Name': 'A', 'Department': 'HR', 'Salary': 55000},
    {'Employee ID': 102, 'Name': 'B', 'Department': 'Marketing', 'Salary': 68000},
    {'Employee ID': 103, 'Name': 'C', 'Department': 'IT', 'Salary': 75000},
    {'Employee ID': 104, 'Name': 'D', 'Department': 'Finance', 'Salary': 82000}]
df = pd.DataFrame(data)
print(df)
X = df.drop('Salary', axis=1)
y = df['Salary']

   Employee ID Name Department  Salary
0          101    A         HR   55000
1          102    B  Marketing   68000
2          103    C         IT   75000
3          104    D    Finance   82000


In [None]:
# K-Fold cross validation: 2 shuffled folds (fixed seed) over the 4 samples.
kf = KFold(n_splits=2, shuffle=True, random_state=0)
i = 1
for train_set, test_set in kf.split(X=X):
    print("iteration ", i)
    print(train_set, " having :" , len(train_set))
    print(test_set, " having :" , len(test_set))
    print("-------------------------")
    i += 1

iteration  1
[0 1]  having : 2
[2 3]  having : 2
-------------------------
iteration  2
[2 3]  having : 2
[0 1]  having : 2
-------------------------


In [None]:
X.shape

(4, 3)

In [None]:
num_cols = X.select_dtypes(include=np.number).columns
num_cols

Index(['Employee ID'], dtype='object')

In [None]:
cat_cols = X.select_dtypes(exclude=np.number).columns
cat_cols

Index(['Name', 'Department'], dtype='object')

In [None]:
from sklearn.tree import DecisionTreeRegressor

ct = ColumnTransformer([
    ('rob', RobustScaler(), num_cols),
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), cat_cols)
])
pipe = Pipeline([
    ('ct_step', ct),
    ('model', DecisionTreeRegressor(random_state=0))
])

In [None]:
X.loc[2:].head()

Unnamed: 0,Employee ID,Name,Department
2,103,C,IT
3,104,D,Finance


In [None]:
# Manual 2-fold CV for the regressor; pipe.score returns R^2 for a regressor.
# kf.split yields *positional* row indices, so rows are selected with .iloc;
# label-based .loc would only be correct while the frame carries a default
# RangeIndex.
scores = []
for i, (train_set, test_set) in enumerate(kf.split(X), start=1):
    pipe.fit(X.iloc[train_set], y.iloc[train_set])
    sco = pipe.score(X.iloc[test_set], y.iloc[test_set])
    scores.append(sco)
    print("iteration ", i)

iteration  1
iteration  2


In [None]:
print(np.array(scores))
print(np.array(scores).mean())
print(np.array(scores).std())


[-45.08163265  -4.31360947]
-24.697621060258424
20.384011592802803


In [None]:
# 2-fold KFold baseline scored with negative MSE (the leave-one-out run itself
# is in a later cell).
# NOTE(review): the name result_kf10 is carried over from the 10-fold tutorial
# cell; only 2 splits are used here.
start = time.time()
result_kf10 = cross_val_score(estimator=pipe, X=X, y=y, scoring='neg_mean_squared_error', cv=KFold(n_splits=2))
result_kf10
print("time taken: ", time.time()-start)

time taken:  0.05945730209350586


In [None]:
result_kf10

array([-2.245e+08, -5.645e+08])

In [None]:
# Leave-one-out CV: one fold per sample, scored with negative MSE.
# head(100) is a no-op here because the frame has only 4 rows.
start = time.time()
result_loocv = cross_val_score(estimator=pipe, X=X.head(100), y=y.head(100),
                               scoring='neg_mean_squared_error', cv=LeaveOneOut())
print("time taken: ", time.time()-start)

time taken:  0.15284252166748047


In [None]:
result_loocv

array([-1.69e+08, -4.90e+07, -4.90e+07, -1.96e+08])

In [None]:
result_loocv.mean()

-115750000.0

In [None]:
# Repeated KFold cross validation: 2 folds x 5 repeats = 10 scores (negative MSE).
# random_state pins the shuffles so the scores are reproducible across runs.
start = time.time()
result_rkf = cross_val_score(estimator=pipe, X=X, y=y, scoring='neg_mean_squared_error',
                              cv=RepeatedKFold(n_splits=2, n_repeats=5, random_state=0))
print("time taken: ", time.time()-start)

time taken:  0.2764122486114502


In [None]:
result_rkf

array([-1.825e+08, -2.845e+08, -1.825e+08, -2.845e+08, -5.645e+08,
       -2.245e+08, -4.490e+08, -1.090e+08, -1.825e+08, -2.845e+08])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
start = time.time()
result_tts = cross_val_score(estimator=pipe, X=X_train, y=y_train,
                              scoring='neg_mean_squared_error', cv=KFold(n_splits=2))
print("time taken: ", time.time()-start)

time taken:  0.06418800354003906


In [None]:
result_tts

array([-4.49e+08, -1.69e+08])

In [None]:
pipe.fit(X_train, y_train)

In [None]:
# With 4 rows and test_size=0.2 the test split holds a single sample, for which
# R^2 (the regressor's default .score) is undefined and evaluates to nan.
# Report the squared error instead, matching the neg_mean_squared_error metric
# used in the cross-validation cells.
from sklearn.metrics import mean_squared_error

mean_squared_error(y_test, pipe.predict(X_test))



nan

In [None]:
cross_val_score(estimator=pipe, X=X_train, y=y_train,
                scoring='neg_mean_squared_error', cv=KFold(n_splits=2))

array([-4.49e+08, -1.69e+08])

### Question-3(b)
Consider a dataset with 6 samples and perform a 3-split time-series cross validation.

In [None]:
# Six toy employee records, assumed to be listed in time order for this exercise.
data = [
    {'Employee ID': 101, 'Name': 'F', 'Department': 'HR', 'Salary': 55000},
    {'Employee ID': 102, 'Name': 'A', 'Department': 'Marketing', 'Salary': 68000},
    {'Employee ID': 103, 'Name': 'B', 'Department': 'IT', 'Salary': 75000},
    {'Employee ID': 104, 'Name': 'C', 'Department': 'Finance', 'Salary': 82000},
    {'Employee ID': 105, 'Name': 'D', 'Department': 'Finance', 'Salary': 62000},
    {'Employee ID': 106, 'Name': 'E', 'Department': 'Finance', 'Salary': 56000}
    ]
df = pd.DataFrame(data)

X = df[['Employee ID', 'Name', 'Department']]
y = df['Salary']

# 3 expanding-window splits: each one trains on all earlier rows and tests on
# the row that follows them.
tscv = TimeSeriesSplit(n_splits=3)

for i, (train_index, test_index) in enumerate(tscv.split(X), 1):
    # Label each split so the three blocks of output can be told apart.
    print("Split", i)
    print("Train indices:", train_index)
    print("Train set:\n", X.iloc[train_index])
    print("Test indices:", test_index)
    print("Test set:\n", X.iloc[test_index])

Train indices: [0 1 2]
Train set:
    Employee ID Name Department
0          101    F         HR
1          102    A  Marketing
2          103    B         IT
Test indices: [3]
Test set:
    Employee ID Name Department
3          104    C    Finance
Train indices: [0 1 2 3]
Train set:
    Employee ID Name Department
0          101    F         HR
1          102    A  Marketing
2          103    B         IT
3          104    C    Finance
Test indices: [4]
Test set:
    Employee ID Name Department
4          105    D    Finance
Train indices: [0 1 2 3 4]
Train set:
    Employee ID Name Department
0          101    F         HR
1          102    A  Marketing
2          103    B         IT
3          104    C    Finance
4          105    D    Finance
Test indices: [5]
Test set:
    Employee ID Name Department
5          106    E    Finance
