In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn import tree

In [18]:
# Read the fraud-check records from disk and preview the first five rows
df = pd.read_csv(filepath_or_buffer="Fraud_check.csv")
df.head(n=5)

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


# **EDA**


In [19]:
# Count of null entries in each column
df.isnull().sum(axis=0)

Undergrad          0
Marital.Status     0
Taxable.Income     0
City.Population    0
Work.Experience    0
Urban              0
dtype: int64

In [20]:
# Swap the dots in the column names for underscores
column_renames = {
    "Marital.Status": "Marital_Status",
    "Taxable.Income": "Taxable_Income",
    "City.Population": "City_Population",
    "Work.Experience": "Work_Experience",
}
df = df.rename(columns=column_renames)

In [21]:
def _income_to_status(income):
    """Label an income of 30000 or less as 'Risky', anything above as 'Good'."""
    if income <= 30000:
        return 'Risky'
    return 'Good'

# Target column derived from the taxable income
df['status'] = df['Taxable_Income'].apply(_income_to_status)
df

Unnamed: 0,Undergrad,Marital_Status,Taxable_Income,City_Population,Work_Experience,Urban,status
0,NO,Single,68833,50047,10,YES,Good
1,YES,Divorced,33700,134075,18,YES,Good
2,NO,Married,36925,160205,30,YES,Good
3,YES,Single,50190,193264,15,YES,Good
4,NO,Married,81002,27533,28,NO,Good
...,...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES,Good
596,YES,Divorced,69967,55369,2,YES,Good
597,NO,Divorced,47334,154058,0,YES,Good
598,YES,Married,98592,180083,17,NO,Good


In [22]:
# Label encoding (not encryption): one LabelEncoder instance, refit on each categorical column in the next cell
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [23]:
# Replace every categorical column with integer codes.
# The shared encoder is refit per column, so after this cell it only remembers
# the classes of the last column in the list ("status").
for col in ["Undergrad", "Urban", "Marital_Status", "status"]:
    df[col] = le.fit_transform(df[col])
df

Unnamed: 0,Undergrad,Marital_Status,Taxable_Income,City_Population,Work_Experience,Urban,status
0,0,2,68833,50047,10,1,0
1,1,0,33700,134075,18,1,0
2,0,1,36925,160205,30,1,0
3,1,2,50190,193264,15,1,0
4,0,1,81002,27533,28,0,0
...,...,...,...,...,...,...,...
595,1,0,76340,39492,7,1,0
596,1,0,69967,55369,2,1,0
597,0,0,47334,154058,0,1,0
598,1,1,98592,180083,17,0,0


In [24]:
# `status` is computed directly from Taxable_Income (<= 30000 -> Risky), so the
# income column must not remain among the features.
# Rebinding (instead of inplace=True) with errors="ignore" keeps this cell safe
# to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=["Taxable_Income"], errors="ignore")

In [25]:
# Target = last column of the frame; features = every column before it
y = df[df.columns[-1]]
x = df[df.columns[:-1]]

# **Ensembling technique-Random forest**

In [26]:
#random forest

from sklearn.ensemble import RandomForestClassifier

# 300 entropy-criterion trees, each split drawing from 3 candidate features.
# random_state fixes the bootstrap samples and per-split feature draws so the
# cross-validated score is the same on every Restart & Run All.
rf = RandomForestClassifier(
    criterion="entropy",
    n_estimators=300,
    max_features=3,
    random_state=7,
)

In [27]:
from sklearn.model_selection import cross_val_score
# Per-fold scores of the random forest over 10 cross-validation folds
res3 = cross_val_score(estimator=rf, X=x, y=y, cv=10)

In [28]:
# Average of the random-forest fold scores
rf_mean_score = res3.mean()
print(rf_mean_score)

0.75


# **Bagging**

In [29]:
from sklearn.model_selection import KFold

In [30]:
# Shuffled 10-fold splitter with a fixed seed
kf = KFold(
    n_splits=10,
    shuffle=True,
    random_state=7,
)

In [31]:
from sklearn.linear_model import LogisticRegression
# Base learner for bagging. max_iter matches the logistic model used later in
# this notebook (500); the library default may stop before converging on the
# unscaled features (e.g. City_Population).
m1 = LogisticRegression(max_iter=500)

In [32]:
from sklearn.ensemble import BaggingClassifier

# Bag 20 copies of the logistic base learner.
# The base learner is passed positionally: the keyword was `base_estimator` in
# older scikit-learn and is `estimator` in current releases (the old keyword
# has been removed), while the first positional slot works on both.
model1 = BaggingClassifier(m1, n_estimators=20, random_state=7)

In [33]:
# Score the bagged model on each split produced by the shuffled KFold splitter
res = cross_val_score(estimator=model1, X=x, y=y, cv=kf)



In [34]:
# Per-fold scores of the bagged model (one value per KFold split)
print(res)

[0.78333333 0.73333333 0.86666667 0.91666667 0.75       0.76666667
 0.83333333 0.66666667 0.83333333 0.78333333]


In [35]:
# Average of the bagging fold scores
bagging_mean_score = res.mean()
print(bagging_mean_score)

0.7933333333333332


# **Boosting**

In [36]:
from sklearn.ensemble import AdaBoostClassifier

In [37]:
# AdaBoost with 100 boosting rounds and a fixed seed
ad = AdaBoostClassifier(
    random_state=30,
    n_estimators=100,
)

In [38]:
# Per-fold AdaBoost scores; note this run uses 3 folds, unlike the 10 used for the other models
res4 = cross_val_score(estimator=ad, X=x, y=y, cv=3)

In [39]:
# Average of the AdaBoost fold scores
adaboost_mean_score = res4.mean()
print(adaboost_mean_score)

0.7766666666666667


# **Voting ensemble**

In [40]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

In [41]:
# Start with an empty list of (name, model) pairs for the ensemble
estimators = list()

In [42]:
# First ensemble member: logistic regression allowed up to 500 solver iterations
m1 = LogisticRegression(max_iter=500)
estimators += [("logistic", m1)]

In [43]:
# Second ensemble member: a depth-3 Gini decision tree.
# random_state makes tie-breaking between equally good splits repeatable, so
# the ensemble's cross-validated score does not drift between runs.
m2 = DecisionTreeClassifier(criterion="gini", max_depth=3, random_state=7)
estimators.append(("CART", m2))

In [44]:
# Third ensemble member: support vector classifier with default settings
m3 = SVC()
estimators += [("SVM", m3)]

In [45]:
# Display the (name, model) pairs collected for the ensemble
estimators

[('logistic', LogisticRegression(max_iter=500)),
 ('CART', DecisionTreeClassifier(max_depth=3)),
 ('SVM', SVC())]

In [46]:
# Combine the collected (name, model) pairs into a single voting classifier
ens_model = VotingClassifier(estimators=estimators)

In [47]:
# Per-fold scores of the voting ensemble over 10 folds (rebinds `res`)
res = cross_val_score(estimator=ens_model, X=x, y=y, cv=10)

In [48]:
# Average of the voting-ensemble fold scores
voting_mean_score = res.mean()
print(voting_mean_score)

0.7933333333333333
