In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,StandardScaler
le=LabelEncoder()
scaler=StandardScaler()
from sklearn.linear_model import LogisticRegression
log=LogisticRegression()
from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier()
from sklearn.metrics import confusion_matrix,accuracy_score
from imblearn.over_sampling import RandomOverSampler
import warnings
warnings.filterwarnings('ignore')

# Data Importing

In [10]:
data=pd.read_csv("C:\\Users\\Karan\\Desktop\\DBS Casual Assessments\\BankCustomers.csv").iloc[:,1:]
data

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


# Data Pre-Processing

In [11]:
data['Exited'].value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [12]:
data.isnull().sum()

CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [None]:
#Checking the Duplicate values on CustomerId columns

In [13]:
data[data.duplicated(subset="CustomerId")==True]

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited


## Deleting CustomerId and Surname columns

In [14]:
data.drop(columns=["CustomerId","Surname"],axis=1,inplace=True)

In [7]:
data

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [15]:
data["Geography"].value_counts()

France     5014
Germany    2509
Spain      2477
Name: Geography, dtype: int64

In [16]:
x1=pd.get_dummies(data["Geography"])
x1
x2=pd.get_dummies(data["Gender"])
x2
data=pd.concat([data,x1,x2],axis=1)

In [17]:
data.drop(columns=["Geography","Gender"],axis=1,inplace=True)

In [None]:
data

In [18]:
for i in ['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary']:
    data[i]=scaler.fit_transform(data[[i]])

In [19]:
X=data.drop(columns=["Exited"],axis=1)
X
y=data["Exited"]
y

0       1
1       0
2       1
3       0
4       0
       ..
9995    0
9996    0
9997    1
9998    1
9999    0
Name: Exited, Length: 10000, dtype: int64

# Over Sampling

In [22]:
ros=RandomOverSampler(sampling_strategy='minority')
ros

# Train_test_split

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [25]:
X_resample,y_resample=ros.fit_resample(X_train,y_train)
y_resample.value_counts()

1    5547
0    5547
Name: Exited, dtype: int64

# Machine Learning Model

## LogisticRegression

In [26]:
log=log.fit(X_resample,y_resample)
log

In [27]:
log_pred=log.predict(X_test)
log_pred

array([1, 0, 1, ..., 0, 0, 0], dtype=int64)

## Checking the accuracy of Logistic Regression

In [29]:
log_accu=accuracy_score(y_test,log_pred)
log_accu
#confusion_matrix(y_test,log_pred)

0.7093333333333334

## Checking the accuracy of RandomForestClassifier

In [30]:
rfc.fit(X_resample,y_resample)
rfc_pred=rfc.predict(X_test)

In [31]:
rfc_accu=accuracy_score(y_test,rfc_pred)
print(rfc_accu)
confusion_matrix(y_test,rfc_pred)

0.861


array([[2273,  143],
       [ 274,  310]], dtype=int64)

# Cross-Validation

In [32]:
from sklearn.model_selection import cross_val_score

In [34]:
print(np.mean(cross_val_score(log,X,y,cv=15,scoring='accuracy')))

0.8094004049026537


In [35]:
np.mean(cross_val_score(rfc,X,y,cv=15,scoring='accuracy'))

0.960881639420205

# Featur Selection

In [36]:
rfc.feature_importances_

array([0.13637453, 0.25506688, 0.08179332, 0.14221926, 0.126769  ,
       0.01865204, 0.03432119, 0.14023403, 0.01173584, 0.02261375,
       0.00941621, 0.01006581, 0.01073812])

In [37]:
names=X.columns
names
x1=pd.DataFrame(names,columns=['Names'])
x2=pd.DataFrame(rfc.feature_importances_,columns=["Scores"])
x2
pd.concat([x1,x2],axis=1).sort_values("Scores")

Unnamed: 0,Names,Scores
10,Spain,0.009416
11,Female,0.010066
12,Male,0.010738
8,France,0.011736
5,HasCrCard,0.018652
9,Germany,0.022614
6,IsActiveMember,0.034321
2,Tenure,0.081793
4,NumOfProducts,0.126769
0,CreditScore,0.136375


# OLM Model

In [38]:
import statsmodels.api as sms
X_olm=sms.add_constant(X_resample)
model=sms.GLM(y_resample,X_resample,family=sms.families.Binomial())
res=model.fit()
res.summary()

0,1,2,3
Dep. Variable:,Exited,No. Observations:,11094.0
Model:,GLM,Df Residuals:,11082.0
Model Family:,Binomial,Df Model:,11.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-6416.6
Date:,"Mon, 31 Oct 2022",Deviance:,12833.0
Time:,10:52:46,Pearson chi2:,11500.0
No. Iterations:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
CreditScore,-0.0622,0.021,-2.912,0.004,-0.104,-0.020
Age,0.8248,0.024,34.480,0.000,0.778,0.872
Tenure,-0.0773,0.021,-3.613,0.000,-0.119,-0.035
Balance,0.1730,0.024,7.206,0.000,0.126,0.220
NumOfProducts,-0.0843,0.032,-2.599,0.009,-0.148,-0.021
HasCrCard,-0.1085,0.047,-2.324,0.020,-0.200,-0.017
IsActiveMember,-0.8931,0.044,-20.510,0.000,-0.978,-0.808
EstimatedSalary,-0.0111,0.021,-0.517,0.605,-0.053,0.031
France,-0.1072,0.037,-2.925,0.003,-0.179,-0.035


In [None]:
x1=list(dict(res.pvalues).keys())
x2=np.round(list(dict(res.pvalues).values()),3)
x11=pd.DataFrame(x1,columns=["Names"])
x22=pd.DataFrame(x2,columns=["Scores"])
final_score=pd.concat([x11,x22],axis=1).sort_values("Scores")
list(final_score[final_score["Scores"]<=0.05]["Names"])


In [None]:
X_new_reshape=X_resample[['Age',
 'Balance',
 'IsActiveMember',
 'Germany',
 'Female',
 'France',
 'CreditScore',
 'EstimatedSalary',
 'NumOfProducts',
 'Spain']]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_new_reshape, y_resample, test_size=0.3, random_state=42)

In [None]:
rfc.fit(X_train,y_train)

In [None]:
rfc_pred_new=rfc.predict(X_test)

In [None]:
accuracy_score(y_test,rfc_pred_new)

In [None]:
X.shape

In [40]:
X_resample.shape

(11094, 13)

# New Exercise

In [62]:
bank=pd.read_csv("C:/Users/Karan/Desktop/Data/BankCustomers.csv").iloc[:,3:]
bank

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [67]:
x1=pd.get_dummies(bank["Geography"],drop_first=True)
x2=pd.get_dummies(bank["Gender"],drop_first=True)
#bank=pd.concat([bank,x1,x2],axis=1)

KeyError: 'Geography'

In [84]:
bank

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Germany,Spain,Male
0,-0.326221,0.293517,2,-1.225848,1,1,1,0.021886,1,0,0,0
1,-0.440036,0.198164,1,0.117350,1,0,1,0.216534,0,0,1,0
2,-1.536794,0.293517,8,1.333053,3,1,0,0.240687,1,0,0,0
3,0.501521,0.007457,1,-1.225848,2,0,0,-0.108918,0,0,0,0
4,2.063884,0.388871,2,0.785728,1,1,1,-0.365276,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1.246488,0.007457,5,-1.225848,2,1,0,-0.066419,0,0,0,1
9996,-1.391939,-0.373958,10,-0.306379,1,1,1,0.027988,0,0,0,1
9997,0.604988,-0.278604,7,-1.225848,1,0,1,-1.008643,1,0,0,0
9998,1.256835,0.293517,3,-0.022608,2,1,0,-0.125231,1,1,0,1


In [70]:
#bank.drop(columns=["Geography","Gender"],inplace=True)
for i in ['CreditScore', 'Age', 'Balance', 'EstimatedSalary']:
    bank[i]=scaler.fit_transform(bank[[i]])

In [74]:
bank

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Germany,Spain,Male
0,-0.326221,0.293517,2,-1.225848,1,1,1,0.021886,1,0,0,0
1,-0.440036,0.198164,1,0.117350,1,0,1,0.216534,0,0,1,0
2,-1.536794,0.293517,8,1.333053,3,1,0,0.240687,1,0,0,0
3,0.501521,0.007457,1,-1.225848,2,0,0,-0.108918,0,0,0,0
4,2.063884,0.388871,2,0.785728,1,1,1,-0.365276,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1.246488,0.007457,5,-1.225848,2,1,0,-0.066419,0,0,0,1
9996,-1.391939,-0.373958,10,-0.306379,1,1,1,0.027988,0,0,0,1
9997,0.604988,-0.278604,7,-1.225848,1,0,1,-1.008643,1,0,0,0
9998,1.256835,0.293517,3,-0.022608,2,1,0,-0.125231,1,1,0,1


In [77]:
X=bank.drop(columns=["Exited"])
X
y=bank["Exited"]
y

0       1
1       0
2       1
3       0
4       0
       ..
9995    0
9996    0
9997    1
9998    1
9999    0
Name: Exited, Length: 10000, dtype: int64

In [83]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [85]:
ros=RandomOverSampler(sampling_strategy='minority')
X_train_reshape,y_train_reshape=ros.fit_resample(X_train,y_train)

In [100]:
import statsmodels.api as sms
X_reshape=sms.add_constant(X_train_reshape)
model=sms.GLM(y_train_reshape,X_train_reshape,family=sms.families.Binomial())
res=model.fit()
res.summary()

0,1,2,3
Dep. Variable:,Exited,No. Observations:,11094.0
Model:,GLM,Df Residuals:,11083.0
Model Family:,Binomial,Df Model:,10.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-6429.6
Date:,"Mon, 31 Oct 2022",Deviance:,12859.0
Time:,11:23:54,Pearson chi2:,11500.0
No. Iterations:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
CreditScore,-0.0883,0.021,-4.149,0.000,-0.130,-0.047
Age,0.8222,0.024,34.731,0.000,0.776,0.869
Tenure,-0.0022,0.007,-0.330,0.741,-0.015,0.011
Balance,0.1983,0.024,8.263,0.000,0.151,0.245
NumOfProducts,0.0849,0.026,3.232,0.001,0.033,0.136
HasCrCard,0.0330,0.042,0.779,0.436,-0.050,0.116
IsActiveMember,-0.8751,0.043,-20.516,0.000,-0.959,-0.791
EstimatedSalary,0.0296,0.021,1.384,0.166,-0.012,0.072
Germany,0.8140,0.053,15.251,0.000,0.709,0.919


In [None]:
'Age','Balance','NumOfProducts','IsActiveMember','Germany','Spain','Male'

In [90]:
rfc.fit(X_train_reshape,y_train_reshape)
rfc_pred=rfc.predict(X_test)
rfc_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [96]:
accuracy_score(y_test,rfc_pred)

0.8603333333333333

In [99]:
x1=pd.DataFrame(rfc.feature_importances_,columns=['Scores'])
x2=pd.DataFrame(X.columns,columns=["Names"])
pd.concat([x1,x2],axis=1).sort_values('Scores',ascending=False)

Unnamed: 0,Scores,Names
1,0.258443,Age
3,0.141216,Balance
7,0.137325,EstimatedSalary
0,0.135001,CreditScore
4,0.128784,NumOfProducts
2,0.078695,Tenure
6,0.038947,IsActiveMember
8,0.028269,Germany
10,0.020911,Male
5,0.017866,HasCrCard


# Naive Bayes Algorithm and other algorith in one code

In [202]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
le=LabelEncoder()
scaler=StandardScaler()
norm=MinMaxScaler()
norm=MinMaxScaler()
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
ros=RandomOverSampler(sampling_strategy='minority')
from sklearn.naive_bayes import GaussianNB, CategoricalNB, MultinomialNB
gnb=GaussianNB()
from sklearn.svm import SVC
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [177]:
bank=pd.read_csv("C:/Users/Karan/Desktop/Data/BankCustomers.csv").iloc[:,3:]
bank

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [178]:
X=bank.drop(columns=["Exited"])
X
y=bank["Exited"]
y

0       1
1       0
2       1
3       0
4       0
       ..
9995    0
9996    0
9997    1
9998    1
9999    0
Name: Exited, Length: 10000, dtype: int64

In [179]:
bank.columns

Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited'],
      dtype='object')

In [565]:
def machine_learning(X,y):
    object_columns=X.dtypes[X.dtypes==np.object].index
    integer_columns=X.dtypes[(X.dtypes==np.int64) | (X.dtypes==np.float64)].index
    #Standardization and Normalization
    x1=input("Which Transfromation Technique you want to use, Stan/Norm::::")
    if x1=="Stan":
        for i in integer_columns:
            X[i]=scaler.fit_transform(X[[i]])
    else:
        for i in integer_columns:
            X[i]=norm.fit_transform(X[[i]])
    #Label Encoder and Dummy Variable
    xx=input("Do you want to proceed with Label Encoder? Y or N::::")
    if xx=="Y":
        for i in object_columns:
            X[i]=le.fit_transform(X[i])
    else:
        for ii in object_columns:
            x1=pd.get_dummies(X[ii],drop_first=True)
            X=pd.concat([X,x1],axis=1)
    X.drop(columns=object_columns,inplace=True)    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    X_train_reshape,y_train_reshape=ros.fit_resample(X_train,y_train)
    #Logistic Regression - Test Data
    log.fit(X_train_reshape,y_train_reshape)
    log_pred=log.predict(X_test)
    log_accuracy=accuracy_score(y_test,log_pred)
    print()
    print("Accuracy of Logistic Regression on Test Data is ",np.round(log_accuracy,3))
    #Logistic Regression - Training Data
    log.fit(X_test,y_test)
    log_pred1=log.predict(X_train_reshape)
    log_accuracy_train=accuracy_score(y_train_reshape,log_pred1)
    print("Accuracy of Logistic Regression on Training Data is ",np.round(log_accuracy_train,3),"\n")
    #RandomForestClassification on Test Data
    rfc.fit(X_train_reshape,y_train_reshape)
    rfc_pred=rfc.predict(X_test)
    rfc_accuracy=accuracy_score(y_test,rfc_pred)
    print("Accuracy of RandomForestClassification on Test Data is ",np.round(rfc_accuracy,3))
    #RandomForestClassfication on Train Data
    rfc.fit(X_test,y_test)
    rfc_pred1=rfc.predict(X_train_reshape)
    rfc_accuracy1=accuracy_score(y_train_reshape,rfc_pred1)
    print("Accuracy of RandomForestClassification on Training Data is ",np.round(rfc_accuracy1,3),"\n")
    #NaiveBayes Classification on Test Data
    gnb.fit(X_train_reshape,y_train_reshape)
    gnb_pred=gnb.predict(X_test)
    gnb_accuracy=accuracy_score(y_test,gnb_pred)
    print("Accuracy of NaiveBayesClassification on Test Data is ",np.round(gnb_accuracy,3))
    #NaiveBayes Classification on Test Data
    gnb.fit(X_test,y_test)
    gnb_pred1=gnb.predict(X_train_reshape)
    gnb_accuracy1=accuracy_score(y_train_reshape,gnb_pred1)
    print("Accuracy of NaiveBayesClassification on Test Data is ",np.round(gnb_accuracy1,3))

In [566]:
machine_learning(X,y)

Which Transfromation Technique you want to use, Stan/Norm::::Stan
Do you want to proceed with Label Encoder? Y or N::::Y

Accuracy of Logistic Regression on Test Data is  0.7
Accuracy of Logistic Regression on Training Data is  0.507 

Accuracy of RandomForestClassification on Test Data is  0.772
Accuracy of RandomForestClassification on Training Data is  0.61 

Accuracy of NaiveBayesClassification on Test Data is  0.706
Accuracy of NaiveBayesClassification on Test Data is  0.521


In [219]:
machine_learning(X,y)

Which Transfromation Technique you want to use, Stan/Norm::::Norm

 Accuracy of Logistic Regression on Test Data is  0.696
Accuracy of Logistic Regression on Training Data is  0.562 

Accuracy of RandomForestClassification on Test Data is  0.856
Accuracy of RandomForestClassification on Training Data is  0.699 

Accuracy of NaiveBayesClassification on Test Data is  0.743
Accuracy of NaiveBayesClassification on Test Data is  0.6


In [211]:
acc=[]
ker=['linear','poly','rbf','sigmoid']
for i in ker:
    mc_scc=[]
    for j in range(50):
        model=SVC(kernel=i,C=1.1,)
        model.fit(X_train_reshape,y_train_reshape)
        svc_pred=model.predict(X_test)
        accu=accuracy_score(y_test,svc_pred)
        mc_scc.append(accu)
    acc.append(np.mean(mc_scc))
print("Accuracy of SVC on Test data is ",acc)

Accuracy of SVC on Test data is  [0.7116666666666667, 0.6966666666666665, 0.7773333333333332, 0.5376666666666666]


##### From the machine_learning function, I am selecting RandomForestClassifier

In [232]:
rfc.fit(X,y)
pred_new=rfc.predict(X.tail(20))
pred_new
accuracy_score(y.tail(20),pred_new)

1.0

In [233]:
pred_new

array([0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0],
      dtype=int64)

In [235]:
y.tail(10)

9990    0
9991    1
9992    0
9993    0
9994    0
9995    0
9996    0
9997    1
9998    1
9999    0
Name: Exited, dtype: int64

In [327]:
x1=pd.DataFrame(rfc.feature_importances_,columns=["Scores"])
x2=pd.DataFrame(X.columns,columns=["Names"])
pd.concat([x1,x2],axis=1).sort_values('Scores',ascending=False).head()["Names"].values

array(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure'],
      dtype=object)

In [332]:
df=pd.read_csv('C:\\Users\\Karan\\Desktop\\DBS Casual Assessments\\BankCustomers.csv').iloc[:,3:]
df=df[['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure',"Exited"]]
df

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Exited
0,619,France,Female,42,2,1
1,608,Spain,Female,41,1,0
2,502,France,Female,42,8,1
3,699,France,Female,39,1,0
4,850,Spain,Female,43,2,0
...,...,...,...,...,...,...
9995,771,France,Male,39,5,0
9996,516,France,Male,35,10,0
9997,709,France,Female,36,7,1
9998,772,Germany,Male,42,3,1


In [341]:
X=df.drop(columns=["Exited"])
X
y=df["Exited"]
y

0       1
1       0
2       1
3       0
4       0
       ..
9995    0
9996    0
9997    1
9998    1
9999    0
Name: Exited, Length: 10000, dtype: int64

In [350]:
machine_learning(X,y)

Which Transfromation Technique you want to use, Stan/Norm::::Stan

 Accuracy of Logistic Regression on Test Data is  0.701
Accuracy of Logistic Regression on Training Data is  0.505 

Accuracy of RandomForestClassification on Test Data is  0.775
Accuracy of RandomForestClassification on Training Data is  0.605 

Accuracy of NaiveBayesClassification on Test Data is  0.706
Accuracy of NaiveBayesClassification on Test Data is  0.518


In [351]:
machine_learning(X,y)

Which Transfromation Technique you want to use, Stan/Norm::::Norm

 Accuracy of Logistic Regression on Test Data is  0.699
Accuracy of Logistic Regression on Training Data is  0.5 

Accuracy of RandomForestClassification on Test Data is  0.769
Accuracy of RandomForestClassification on Training Data is  0.606 

Accuracy of NaiveBayesClassification on Test Data is  0.709
Accuracy of NaiveBayesClassification on Test Data is  0.52


In [396]:
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   CreditScore  10000 non-null  int64 
 1   Geography    10000 non-null  object
 2   Gender       10000 non-null  object
 3   Age          10000 non-null  int64 
 4   Tenure       10000 non-null  int64 
 5   Exited       10000 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 468.9+ KB


In [415]:
df.dtypes.values

array([dtype('int64'), dtype('O'), dtype('O'), dtype('int64'),
       dtype('int64'), dtype('int64')], dtype=object)

In [418]:
df.dtypes.values[1]==np.object0

True

In [421]:
for i in df.dtypes.values:
    print(i)

int64
object
object
int64
int64
int64


In [455]:
object_columns=df.dtypes[df.dtypes==np.object].index
object_columns
integer_columns=df.dtypes[(df.dtypes==np.int64) | (df.dtypes==np.float64)].index
integer_columns

Index(['CreditScore', 'Age', 'Tenure', 'Exited'], dtype='object')

In [534]:
brain=pd.read_csv("C:\\Users\\Karan\\Desktop\\DBS Casual Assessments\\brain_stroke.csv")
brain.head()
brain.shape

(4981, 11)

In [462]:
brain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4981 entries, 0 to 4980
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4981 non-null   object 
 1   age                4981 non-null   float64
 2   hypertension       4981 non-null   int64  
 3   heart_disease      4981 non-null   int64  
 4   ever_married       4981 non-null   object 
 5   work_type          4981 non-null   object 
 6   Residence_type     4981 non-null   object 
 7   avg_glucose_level  4981 non-null   float64
 8   bmi                4981 non-null   float64
 9   smoking_status     4981 non-null   object 
 10  stroke             4981 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 428.2+ KB


In [470]:
integer_columns=brain.dtypes[(brain.dtypes==np.int64) | (brain.dtypes==np.float64)].index
integer_columns

Index(['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi',
       'stroke'],
      dtype='object')

In [560]:
object_columns=brain.dtypes[brain.dtypes==np.object].index
object_columns

Index(['gender', 'ever_married', 'work_type', 'Residence_type',
       'smoking_status'],
      dtype='object')

In [561]:
brain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4981 entries, 0 to 4980
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4981 non-null   object 
 1   age                4981 non-null   float64
 2   hypertension       4981 non-null   int64  
 3   heart_disease      4981 non-null   int64  
 4   ever_married       4981 non-null   object 
 5   work_type          4981 non-null   object 
 6   Residence_type     4981 non-null   object 
 7   avg_glucose_level  4981 non-null   float64
 8   bmi                4981 non-null   float64
 9   smoking_status     4981 non-null   object 
 10  stroke             4981 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 428.2+ KB


In [562]:
def categories(X):
    xx=input("Do you want to proceed with Label Encoder? Y or N")
    if xx=="Y":
        for i in object_columns:
            X[i]=le.fit_transform(X[i])
    else:
        for ii in object_columns:
            x1=pd.get_dummies(X[ii],drop_first=True)
            X=pd.concat([X,x1],axis=1)
        X.drop(columns=object_columns,inplace=True)
    return X

In [563]:
damage=brain.copy()
damage

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
4976,Male,41.0,0,0,No,Private,Rural,70.15,29.8,formerly smoked,0
4977,Male,40.0,0,0,Yes,Private,Urban,191.15,31.1,smokes,0
4978,Female,45.0,1,0,Yes,Govt_job,Rural,95.02,31.8,smokes,0
4979,Male,40.0,0,0,Yes,Private,Rural,83.94,30.0,smokes,0


In [564]:
categories(damage)

Do you want to proceed with Label Encoder? Y or NN


Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,Male,Yes,Private,Self-employed,children,Urban,formerly smoked,never smoked,smokes
0,67.0,0,1,228.69,36.6,1,1,1,1,0,0,1,1,0,0
1,80.0,0,1,105.92,32.5,1,1,1,1,0,0,0,0,1,0
2,49.0,0,0,171.23,34.4,1,0,1,1,0,0,1,0,0,1
3,79.0,1,0,174.12,24.0,1,0,1,0,1,0,0,0,1,0
4,81.0,0,0,186.21,29.0,1,1,1,1,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4976,41.0,0,0,70.15,29.8,0,1,0,1,0,0,0,1,0,0
4977,40.0,0,0,191.15,31.1,0,1,1,1,0,0,1,0,0,1
4978,45.0,1,0,95.02,31.8,0,0,1,0,0,0,0,0,0,1
4979,40.0,0,0,83.94,30.0,0,1,1,1,0,0,0,0,0,1


In [528]:
brain.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [554]:
damage=brain.copy()
damage.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [547]:
for ii in object_columns:
    x1=pd.get_dummies(damage[ii],drop_first=True)
    damage=pd.concat([damage,x1],axis=1)
damage.drop(columns=object_columns,inplace=True)

In [515]:
categories(brain)

Do you want to proceed with Label Encoder? Y or NY


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,1,1,228.69,36.6,1,1
1,1,80.0,0,1,1,1,0,105.92,32.5,2,1
2,0,49.0,0,0,1,1,1,171.23,34.4,3,1
3,0,79.0,1,0,1,2,0,174.12,24.0,2,1
4,1,81.0,0,0,1,1,1,186.21,29.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
4976,1,41.0,0,0,0,1,0,70.15,29.8,1,0
4977,1,40.0,0,0,1,1,1,191.15,31.1,3,0
4978,0,45.0,1,0,1,0,0,95.02,31.8,3,0
4979,1,40.0,0,0,1,1,0,83.94,30.0,3,0


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,1,1,228.69,36.6,1,1
1,1,80.0,0,1,1,1,0,105.92,32.5,2,1
2,0,49.0,0,0,1,1,1,171.23,34.4,3,1
3,0,79.0,1,0,1,2,0,174.12,24.0,2,1
4,1,81.0,0,0,1,1,1,186.21,29.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
4976,1,41.0,0,0,0,1,0,70.15,29.8,1,0
4977,1,40.0,0,0,1,1,1,191.15,31.1,3,0
4978,0,45.0,1,0,1,0,0,95.02,31.8,3,0
4979,1,40.0,0,0,1,1,0,83.94,30.0,3,0


In [315]:
age=int(input("Please provide your age from the range of 18-92 "))
sal=float(input('Please provide your salary'))
credit=int(input('Please provide your Credit Score from the range of 350-850'))
bal=float(input('Please provide your Balance: '))
numof=int("please provide how many number of product you hold ")



In [317]:
df.describe()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


# THANK YOU!!