In [None]:
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd 
from tqdm import tqdm



import os
# Print the full path of every file found under the competition input folder.
for folder, _subdirs, names in os.walk('/kaggle/input/playground-series-s3e3'):
    for name in names:
        print(os.path.join(folder, name))



In [None]:
# Read the two competition tables.
train = pd.read_csv('/kaggle/input/playground-series-s3e3/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s3e3/test.csv')
# Stack them so that later encoding is applied to both tables at once.
# ignore_index gives every row a unique label; without it both files contribute
# their own 0..n-1 labels and any label-based alignment on df becomes ambiguous.
df = pd.concat([train, test], ignore_index = True)


In [None]:
# Count rows that exactly repeat an earlier row of the combined table.
duplicate_mask = df.duplicated()
duplicate_mask.sum()

In [None]:
# Print the frequency table of every column, one after another.
for column_name, column_values in df.items():
    print(column_values.value_counts())

In [None]:
# Summary statistics of the numeric columns; the bare last expression is displayed.
summary = df.describe()
summary

In [None]:
# One-hot encode the non-numeric columns of the combined table.
df = pd.get_dummies(data = df)

In [None]:
# Rows with a known Attrition value are the labelled set; the rest are to be predicted.
has_label = df['Attrition'].notna()
# Explicit copies: both frames are modified further down, and modifying a bare
# slice of df triggers pandas' SettingWithCopyWarning.
train = df.loc[has_label].copy()
test = df.loc[~has_label].copy()

In [None]:
# Keep the target as a one-column frame and remove it from the feature table.
y = train[['Attrition']]
# Reassign instead of dropping in place: train is a slice of df, and in-place
# mutation of a slice raises SettingWithCopyWarning.
train = train.drop(columns = 'Attrition')

In [None]:
# Hold out 35% of the labelled rows for validation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(train, y, test_size = 0.35, random_state = 100)
trainx, testx, trainy, testy = split_parts


In [None]:
# Continuous-valued columns that are pulled out of both splits for separate handling.
continuous_cols = [
    'Age', 'DailyRate', 'DistanceFromHome', 'HourlyRate', 'MonthlyIncome',
    'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'TotalWorkingYears',
    'TrainingTimesLastYear', 'YearsAtCompany', 'YearsInCurrentRole',
    'YearsSinceLastPromotion', 'YearsWithCurrManager',
]
trainxcon = trainx[continuous_cols]
testxcon = testx[continuous_cols]


In [None]:
# Remove the continuous columns from both splits. trainxcon / testxcon already
# hold exactly those columns, so their labels are reused instead of repeating
# the 14-name list. Reassigning (rather than inplace = True) avoids mutating the
# frames returned by train_test_split, which can raise SettingWithCopyWarning.
trainx = trainx.drop(columns = trainxcon.columns)
testx = testx.drop(columns = testxcon.columns)

In [None]:
# Learn the scaling statistics from the training split only, then apply the same
# transformation to both splits, restoring column names and row labels.
ss = StandardScaler().fit(trainxcon)
trainxcon = pd.DataFrame(
    ss.transform(trainxcon),
    columns = trainxcon.columns,
    index = trainx.index,
)
testxcon = pd.DataFrame(
    ss.transform(testxcon),
    columns = testxcon.columns,
    index = testx.index,
)


In [None]:
# Re-attach the scaled continuous columns to each split, matching rows by label.
trainx = pd.merge(trainx, trainxcon, how = 'inner', left_index = True, right_index = True)
testx = pd.merge(testx, testxcon, how = 'inner', left_index = True, right_index = True)

In [None]:
# Move 'id' from the columns into the row index of both splits.
trainx = trainx.set_index('id')
testx = testx.set_index('id')

In [None]:
# Fit a logistic regression on the training split and display its mean accuracy
# on the held-out split. The target is flattened to 1-D for the estimator.
model = LogisticRegression(max_iter = 100000).fit(trainx, trainy.to_numpy().ravel())
model.score(testx, testy)


## The best model

In [None]:
# Randomised hyper-parameter search for a single decision tree.
model = DecisionTreeClassifier(random_state = 100)
param = {
    # An integer min_samples_split below 2 is rejected by scikit-learn, so start at 2.
    'min_samples_split': np.arange(2, 150),
    'max_depth': np.arange(1, 150),
    'criterion': ['entropy', 'gini'],
}
# Run ONE search over all candidates. Calling fit() repeatedly with n_iter = 1
# restarts the search each time, so best_params_ would describe only the last
# random draw rather than the best of all draws. verbose = 1 reports progress
# in place of the manual progress-bar loop; random_state makes the draw repeatable.
rdecision = RandomizedSearchCV(model, param, scoring = 'accuracy', cv = 50,
                               random_state = 100, verbose = 1)
# Flatten the one-column target frame to 1-D, as the other searches do.
rdecision.fit(trainx, trainy.values.ravel())
print(rdecision.best_params_)
        

In [None]:
# Fit a decision tree with fixed hyper-parameters and display hold-out accuracy.
tree_settings = {
    'min_samples_split': 57,
    'max_depth': 136,
    'criterion': 'entropy',
    'random_state': 100,
}
model = DecisionTreeClassifier(**tree_settings)
model.fit(trainx, trainy)
model.score(testx, testy)

In [None]:
# Randomised hyper-parameter search for a random forest.
model = RandomForestClassifier(random_state = 100)
param = {
    # An integer min_samples_split below 2 is rejected by scikit-learn, so start at 2.
    'min_samples_split': np.arange(2, 100),
    'max_depth': np.arange(1, 100),
    'criterion': ['entropy', 'gini'],
    'n_estimators': np.arange(1, 200),
}
# Run ONE search over all candidates. Calling fit() repeatedly with n_iter = 1
# restarts the search each time, so best_params_ would describe only the last
# random draw rather than the best of all draws. verbose = 1 reports progress
# in place of the manual progress-bar loop; random_state makes the draw repeatable.
rrandom = RandomizedSearchCV(model, param, cv = 50, scoring = 'accuracy',
                             random_state = 100, verbose = 1)
rrandom.fit(trainx, trainy.values.ravel())
print(rrandom.best_params_)


In [None]:
# Fit a random forest with fixed hyper-parameters and display hold-out accuracy.
forest_settings = {
    'min_samples_split': 30,
    'max_depth': 43,
    'criterion': 'entropy',
    'n_estimators': 8,
    'random_state': 100,
}
model = RandomForestClassifier(**forest_settings)
model.fit(trainx, trainy.to_numpy().ravel())
model.score(testx, testy)

In [None]:
# Randomised hyper-parameter search for an extra-trees ensemble.
model = ExtraTreesClassifier(random_state = 100)
param = {
    # An integer min_samples_split below 2 is rejected by scikit-learn, so start at 2.
    'min_samples_split': np.arange(2, 100),
    'max_depth': np.arange(1, 100),
    'criterion': ['entropy', 'gini'],
    'n_estimators': np.arange(1, 200),
}
# Run ONE search over all candidates. Calling fit() repeatedly with n_iter = 1
# restarts the search each time, so best_params_ would describe only the last
# random draw rather than the best of all draws. verbose = 1 reports progress
# in place of the manual progress-bar loop; random_state makes the draw repeatable.
rextra = RandomizedSearchCV(model, param, cv = 50, scoring = 'accuracy',
                            random_state = 100, verbose = 1)
rextra.fit(trainx, trainy.values.ravel())
print(rextra.best_params_)



In [None]:
# Fit an extra-trees ensemble with fixed hyper-parameters and display hold-out accuracy.
extra_settings = {
    'min_samples_split': 79,
    'max_depth': 47,
    'criterion': 'gini',
    'n_estimators': 107,
    'random_state': 100,
}
model = ExtraTreesClassifier(**extra_settings)
model.fit(trainx, trainy.to_numpy().ravel())
model.score(testx, testy)

In [None]:
# Randomised hyper-parameter search for a bagging ensemble.
model = BaggingClassifier(random_state = 100)
param = {
    # An integer max_features must lie between 1 and the number of columns, so
    # derive the range from the data instead of hard-coding 0..55 (0 is invalid,
    # and any value above the real column count makes the fit fail).
    'max_features': np.arange(1, trainx.shape[1] + 1),
    'n_estimators': np.arange(1, 200),
}
# Run ONE search over all candidates. Calling fit() repeatedly with n_iter = 1
# restarts the search each time, so best_params_ would describe only the last
# random draw rather than the best of all draws. verbose = 1 reports progress
# in place of the manual progress-bar loop; random_state makes the draw repeatable.
rbagging = RandomizedSearchCV(model, param, cv = 50, scoring = 'accuracy',
                              random_state = 100, verbose = 1)
rbagging.fit(trainx, trainy.values.ravel())
print(rbagging.best_params_)


In [None]:
# Fit a bagging ensemble with fixed hyper-parameters and display hold-out accuracy.
bagging_settings = {'max_features': 13, 'n_estimators': 46, 'random_state': 100}
model = BaggingClassifier(**bagging_settings)
model.fit(trainx, trainy.to_numpy().ravel())
model.score(testx, testy)

In [None]:
# Randomised hyper-parameter search for AdaBoost.
model = AdaBoostClassifier(random_state = 100)
param = {
    'learning_rate': np.logspace(-6, 0, 100),
    'n_estimators': np.arange(1, 100),
}
# Run ONE search over all candidates. Calling fit() repeatedly with n_iter = 1
# restarts the search each time, so best_params_ would describe only the last
# random draw rather than the best of all draws. verbose = 1 reports progress
# in place of the manual progress-bar loop; random_state makes the draw repeatable.
rada = RandomizedSearchCV(model, param, cv = 50, scoring = 'accuracy',
                          random_state = 100, verbose = 1)
rada.fit(trainx, trainy.values.ravel())
print(rada.best_params_)

In [None]:
# Fit AdaBoost with fixed hyper-parameters and display hold-out accuracy.
# random_state = 100 is set, as on the other seeded models in this notebook,
# so that re-running the cell reproduces the same score.
model = AdaBoostClassifier(learning_rate = 0.49770235643321137, n_estimators = 62, random_state = 100)
model.fit(trainx, trainy.values.ravel())
model.score(testx, testy)

In [None]:
# Randomised hyper-parameter search for a support-vector classifier.
model = SVC()
param = {
    'C': np.linspace(1, 20, 20),
    'gamma': np.linspace(1, 100, 20),
    'kernel': ['rbf', 'sigmoid', 'linear'],
}
# Run ONE search over all candidates. Calling fit() repeatedly with n_iter = 1
# restarts the search each time, so best_params_ would describe only the last
# random draw rather than the best of all draws. verbose = 1 reports progress
# in place of the manual progress-bar loop; random_state makes the draw repeatable.
# scoring is spelled out to match the other searches in this notebook.
rsvc = RandomizedSearchCV(model, param, cv = 50, scoring = 'accuracy',
                          random_state = 100, verbose = 1)
rsvc.fit(trainx, trainy.values.ravel())
print(rsvc.best_params_)

In [None]:
# Fit a support-vector classifier with fixed gamma and C and display hold-out accuracy.
svc_settings = {'gamma': 0.01, 'C': 10}
model = SVC(**svc_settings)
model.fit(trainx, trainy.to_numpy().ravel())
model.score(testx, testy)

In [None]:
# Randomised hyper-parameter search for a gradient-boosted tree classifier.
model = XGBClassifier(random_state = 100)
param = {
    'booster': ['gbtree', 'dart'],
    'eta': np.linspace(0.01, 0.3, 10),
    'gamma': np.arange(1, 100),
    'max_depth': np.arange(1, 10),
    # No 'objective' entry: the previous grid sampled regression losses
    # ('reg:squarederror', 'reg:squaredlogerror'), which do not fit a classifier.
    # Leaving it unset lets XGBClassifier use its own classification objective.
    'alpha': np.arange(1, 50),
    'reg_lambda': np.linspace(0, 1, 10),
    'colsample_bytree': np.linspace(0.5, 1, 5),
    'min_child_weight': np.arange(0, 10),
    'n_estimators': np.arange(1, 100),
}
# Run ONE search over all candidates. Calling fit() repeatedly with n_iter = 1
# restarts the search each time, so best_params_ would describe only the last
# random draw rather than the best of all draws. verbose = 1 reports progress
# in place of the manual progress-bar loop; random_state makes the draw repeatable.
rxgb = RandomizedSearchCV(model, param, cv = 50, scoring = 'accuracy',
                          random_state = 100, verbose = 1)
# Flatten the one-column target frame to 1-D, as the other searches do.
rxgb.fit(trainx, trainy.values.ravel())
print(rxgb.best_params_)

In [None]:
# Fit a boosted-tree classifier with fixed hyper-parameters and display hold-out accuracy.
# NOTE(review): 'reg:squarederror' is a regression loss; for a classifier a
# classification objective is the usual choice — confirm this is intentional.
xgb_settings = {
    'booster': 'gbtree',
    'eta': 0.1711111111,
    'gamma': 53,
    'max_depth': 1,
    'objective': 'reg:squarederror',
    'reg_lambda': 0.77777777,
    'n_estimators': 85,
    'min_child_weight': 4,
    'alpha': 12,
    'colsample_bytree': 1,
    'random_state': 100,
}
model = XGBClassifier(**xgb_settings)
model.fit(trainx, trainy)
model.score(testx, testy)

In [None]:
# The rows in `test` were selected because their Attrition is missing, so the
# column holds nothing but NaN. It is dropped without being copied into `y`:
# doing so would replace the training target held in `y` with an all-NaN frame.
# Reassigning (rather than inplace = True) avoids mutating a slice of df.
test = test.drop(columns = 'Attrition')

In [None]:
# Apply to the unlabelled rows the same preparation used for the training split.
# trainxcon holds exactly the continuous columns the scaler was fitted on, so
# its labels are reused instead of repeating the 14-name list twice.
continuous_cols = list(trainxcon.columns)
# Scale with the already-fitted scaler (transform only, no refit).
testcon = pd.DataFrame(ss.transform(test[continuous_cols]), columns = continuous_cols, index = test.index)
# Swap the raw continuous columns for the scaled ones and index the rows by 'id'.
# Each step returns a new frame; in-place edits on a slice of df raise
# SettingWithCopyWarning.
test = test.drop(columns = continuous_cols)
test = test.merge(testcon, left_index = True, right_index = True)
test = test.set_index('id')

In [None]:
# Predict with the most recently fitted model and keep the row ids as the index.
predicted_labels = model.predict(test)
predy = pd.DataFrame(data = predicted_labels, index = test.index)