#### BoxPlot

In [18]:
tr <- read.csv('train.csv')
png(file = "boxplot.png")
boxplot(LoanAmount ~ Credit_History,data = tr,xlab = "Credit History", ylab = "Loan Amount", main='Loan Data')
dev.off()

#### Histogram

In [20]:
tr <- read.csv('train.csv')
income <-  tr[,c('ApplicantIncome')]

# Give the chart file a name.
png(file = "histogram.png")

# Create the histogram.
hist(income,xlab = "Income",col = "yellow",border = "blue")
dev.off()

#### Scatter Plot

In [24]:
tr <- read.csv('train.csv')
png(file = "Scatter.png")
plot(tr$ApplicantIncome,tr$LoanAmount,
     main="Relationship Between ApplicantIncome and LoanAmount",
     xlab="ApplicantIncome",
     ylab="LoanAmount")
dev.off()

#### Strip Chart

In [25]:
tr <- read.csv('train.csv')
set.seed(100)
x = list("ApplicantIncome" = tr$ApplicantIncome[0:50] , "CoapplicantIncome" = tr$CoapplicantIncome[0:50])
png("Stripchart.png")
stripchart(x,
           main = "Comparision between ApplicantIncome and CoapplicantIncome",
           method = "jitter",
           col = c("orange","green"),
           pch = 1)
dev.off()

### Data Pre-Processing

#### Missing Values

In [146]:
tr <- read.csv('train.csv')
# tr <- read.csv(file="train.csv", na.strings=c("", "NA"), header=TRUE)
# colnames(tr) <- c("Loan_ID","Gender","Married","Dependents","Education","Self_Employed","ApplicantIncome","CoapplicantIncome","LoanAmount","Loan_Amount_Term","Credit_History","Property_Area","Loan_Status")

write.csv(sapply(tr,function(x)sum(is.na(x))),file="output.txt")

#### Deleting Missing Rows

In [147]:
tr <- read.csv('train.csv')
write.csv(tr[complete.cases(tr) , ],file="output_01.csv")

#### Change Column Value

In [148]:
tr <- read.csv('train.csv')
library(plyr)
tr$Dependents <- revalue(tr$Dependents, c("3+"="3"))
write.csv(tr,file = "output_02.csv")

### Logistic Regression Model

#### Correlation

In [149]:
tr <- read.csv("loan.csv")
library(plyr)

tr <- tr[complete.cases(tr),]
v <- tr[,c("Dependents","ApplicantIncome","CoapplicantIncome","LoanAmount","Loan_Amount_Term","Credit_History")]


#colnames(diabetes) <- c("Pregnancy","Glucose","BP","Thick","Insulin","BMI","Pedigree","Age","Isdiabetes")

#sapply(diabetes,function(x)sum(is.na(x)))

#summary(diabetes)
#str(diabetes)

#table(diabetes$Isdiabetes)

write.csv(cor(v[2:6]),file="output_06.csv")

#### Multi-collinearity

In [150]:
tr <- read.csv("loan.csv")
#colnames(tr) <- c("Loan_ID","Gender","Married","Dependents","Education","Self_Employed","ApplicantIncome","CoapplicantIncome","LoanAmount","Loan_Amount_Term","Credit_History","Property_Area","Loan_Status")
#tr<- tr[,3:10]

model1 <- glm(Target ~.,data = tr, family = binomial(link="logit"))

library(car)
write.csv(vif(model1),file = "Output_07.csv")


#### AIC/BIC of a glm model.

In [151]:
#) AIC/BIC of a glm model.
tr <- read.csv("train.csv",stringsAsFactors=TRUE)
library(plyr)
tr$Dependents <- revalue(tr$Dependents, c("3+"="3"))
colnames(tr) <- c("Loan_ID","Gender","Married","Dependents","Education","Self_Employed","ApplicantIncome","CoapplicantIncome","LoanAmount","Loan_Amount_Term","Credit_History","Property_Area","Loan_Status")

model1 <- glm(tr$Loan_Status ~.,data = tr, family = binomial(link="logit"))

a<-c("AIC :",AIC(model1),"\nBIC :",BIC(model1))
x = c(a )
write.csv(toString(x),file="output_08.csv")

"glm.fit: algorithm did not converge"


#### Residuals

In [152]:
library(ggplot2)
library(lattice)
library(caret)
library(e1071)

        tr <- read.csv("train.csv",stringsAsFactors=TRUE)
library(plyr)
tr$Dependents <- revalue(tr$Dependents, c("3+"="3"))
colnames(tr) <- c("Loan_ID","Gender","Married","Dependents","Education","Self_Employed","ApplicantIncome","CoapplicantIncome","LoanAmount","Loan_Amount_Term","Credit_History","Property_Area","Loan_Status")
tr <- tr[complete.cases(tr),]
model1 <- glm(tr$Loan_Status ~.,data = tr, family = binomial(link="logit"))

library(dplyr)

p <-(predict(model1,select=-tr$Loan_Status,type="response"))
p[p<=0.5] <- 0
p[p>0.5] <- 1

png(file="residuals.png")

plot(model1$residuals,xlab="Index",ylab = "Residuals")

dev.off()

"glm.fit: algorithm did not converge"


### Decision Tree

#### Construct a decision tree

In [153]:
library(caret)
library(rpart)
set.seed(9999)

l <- read.csv("loan.csv")
split=0.80
trainIndex <- createDataPartition(l$Target, p=split, list=FALSE)
data_train <- l[ trainIndex,]
data_test <- l[-trainIndex,]

model <- rpart(Target~., data=data_train )

x_test <- data_test[,1:10]

y_test <- data_test[,11]

p <- predict(model, x_test)
p[p<=0.5] <- 0
p[p>0.5] <- 1


write.csv(p,file="output_12.csv")

#### Confusion Matrix - Decision Tree

In [154]:
library(caret)
library(rpart)
library(e1071)
set.seed(9999)

l <- read.csv("loan.csv")
split=0.80
trainIndex <- createDataPartition(l$Target, p=split, list=FALSE)
data_train <- l[ trainIndex,]
data_test <- l[-trainIndex,]

model <- rpart(Target~., data=data_train )

x_test <- data_test[,1:10]

y_test <- data_test[,11]

p <- predict(model, x_test)
p[p<=0.5] <- 0
p[p>0.5] <- 1

z<-confusionMatrix(factor(p,level=0:1), factor(y_test,level=0:1))
#cat("Accuracy :",format(confusionMatrix(p, y_test)$overall[1],digit=2))

write.csv(confusionMatrix(factor(p,level=0:1), factor(y_test,level=0:1))$table,file="output_15.csv")

#### Decision Tree

In [155]:
library(caret)
library(rpart)
png(file = "output_13.png")
set.seed(9999)
i <- read.csv("loan.csv")
split=0.80
trainIndex <- createDataPartition(i$Target, p=split, list=FALSE)
data_train <- i[ trainIndex,]
data_test <- i[-trainIndex,]

model <- rpart(data_train$Target~., data=data_train)

plot(model)
text(model, pretty=0)
dev.off()


#### Atribute Evaluation

In [156]:
library(CORElearn)
library(caret)
library(rpart)

set.seed(9999)
i <- read.csv("loan.csv")
split=0.80
trainIndex <- createDataPartition(i$Target, p=split, list=FALSE)
data_train <- i[ trainIndex,]
data_test <- i[-trainIndex,]

model <- rpart(data_train$Target~., data=data_train)
IG.CORElearn <- attrEval(Target ~ ., data=i,  estimator = "InfGain")
write.csv(IG.CORElearn,file = "output_14.csv")

Changing dependent variable to factor with levels: 0 1 


"Possibly this is an error caused by regression formula and classification attribute estimator or vice versa."


#### Accuracy-Decision Tree

In [157]:
library(caret)
library(rpart)
set.seed(9999)

l <- read.csv("loan.csv")
split=0.80
trainIndex <- createDataPartition(l$Target, p=split, list=FALSE)
data_train <- l[ trainIndex,]
data_test <- l[-trainIndex,]

model <- rpart(Target~., data=data_train )

x_test <- data_test[,1:10]

y_test <- data_test[,11]

p <- predict(model, x_test)
p[p<=0.5] <- 0
p[p>0.5] <- 1

z<-confusionMatrix(factor(p,level=0:1), factor(y_test,level=0:1))
#cat("Accuracy :",format(confusionMatrix(p, y_test)$overall[1],digit=2))

write.csv(z$overall[1],file="output_16.csv")

### Ramdom Forest

#### Random Forest

In [158]:
library(caret)
library(rpart)
library(randomForest)
set.seed(9999)

l <- read.csv("loan.csv")
split=0.80
trainIndex <- createDataPartition(l$Target, p=split, list=FALSE)
data_train <- l[ trainIndex,]
data_test <- l[-trainIndex,]

fit.forest2 <- randomForest(Target ~ ., data=data_train,importance=TRUE)
fit.forest2
write.csv(importance(fit.forest2,type = 2),file="output_17.csv")

"The response has five or fewer unique values.  Are you sure you want to do regression?"



Call:
 randomForest(formula = Target ~ ., data = data_train, importance = TRUE) 
               Type of random forest: regression
                     Number of trees: 500
No. of variables tried at each split: 3

          Mean of squared residuals: 0.1425512
                    % Var explained: 27.84

#### Confusion Matrix

In [159]:
library(caret)
library(rpart)
library(randomForest)
set.seed(9999)

l <- read.csv("loan.csv")
split=0.80
trainIndex <- createDataPartition(l$Target, p=split, list=FALSE)
data_train <- l[ trainIndex,]
data_test <- l[-trainIndex,]

fit.forest2 <- randomForest(Target ~ ., data=data_train,importance=TRUE)

forest.p <- predict(fit.forest2,data_train)
forest.p[forest.p<=0.5] <- 0
forest.p[forest.p>0.5] <- 1
forest.pt <- table(data_train$Target , forest.p , dnn = c("Actual","Predicted"))

write.csv(forest.pt,file="output_18.csv")

"The response has five or fewer unique values.  Are you sure you want to do regression?"


#### Accuracy

In [160]:
library(caret)
library(rpart)
library(randomForest)
set.seed(9999)

l <- read.csv("loan.csv")
split=0.80
trainIndex <- createDataPartition(l$Target, p=split, list=FALSE)
data_train <- l[ trainIndex,]
data_test <- l[-trainIndex,]

fit.forest2 <- randomForest(Target ~ ., data=data_train,importance=TRUE)

forest.p <- predict(fit.forest2,data_train)
forest.p[forest.p<=0.5] <- 0
forest.p[forest.p>0.5] <- 1
forest.pt <- table(data_train$Target , forest.p , dnn = c("Actual","Predicted"))

write.csv(confusionMatrix(forest.pt)$overall[1],file="output_19.csv")

"The response has five or fewer unique values.  Are you sure you want to do regression?"
