In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection  import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score
import scipy
from scipy import stats
from statsmodels.api import add_constant
import statsmodels.discrete.discrete_model as sm
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.interpolate import interp1d
from sklearn.preprocessing import imputer

In [2]:
data=pd.read_csv("loans.csv")
data.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,annual_inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not_fully_paid
0,1,debt_consolidation,0.1189,829.1,85000.00038,19.48,737,5639.958333,28854,52.1,0.0,0.0,0.0,0
1,1,credit_card,0.1071,228.22,65000.00007,14.29,707,2760.0,33623,76.7,0.0,0.0,0.0,0
2,1,debt_consolidation,0.1357,366.86,31999.99994,11.63,682,4710.0,3511,25.6,1.0,0.0,0.0,0
3,1,debt_consolidation,0.1008,162.34,85000.00038,8.1,712,2699.958333,33667,73.2,1.0,0.0,0.0,0
4,1,credit_card,0.1426,102.92,80799.99964,14.97,667,4066.0,4740,39.5,0.0,1.0,0.0,0


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 14 columns):
credit.policy        9578 non-null int64
purpose              9578 non-null object
int.rate             9578 non-null float64
installment          9578 non-null float64
annual_inc           9574 non-null float64
dti                  9578 non-null float64
fico                 9578 non-null int64
days.with.cr.line    9549 non-null float64
revol.bal            9578 non-null int64
revol.util           9516 non-null float64
inq.last.6mths       9549 non-null float64
delinq.2yrs          9549 non-null float64
pub.rec              9549 non-null float64
not_fully_paid       9578 non-null int64
dtypes: float64(9), int64(4), object(1)
memory usage: 1.0+ MB


In [4]:
data.describe(include='all')

Unnamed: 0,credit.policy,purpose,int.rate,installment,annual_inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not_fully_paid
count,9578.0,9578,9578.0,9578.0,9574.0,9578.0,9578.0,9549.0,9578.0,9516.0,9549.0,9549.0,9549.0,9578.0
unique,,7,,,,,,,,,,,,
top,,debt_consolidation,,,,,,,,,,,,
freq,,3957,,,,,,,,,,,,
mean,0.80497,,0.12264,319.089413,68383.06,12.606679,710.846314,4562.026085,16913.96,46.865677,1.571578,0.163787,0.062101,0.160054
std,0.396245,,0.026847,207.071301,61222.32,6.88397,37.970537,2497.985733,33756.19,29.018642,2.198095,0.546712,0.262152,0.366676
min,0.0,,0.06,15.67,1896.0,0.0,612.0,178.958333,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,,0.1039,163.77,38500.0,7.2125,682.0,2820.0,3187.0,22.7,0.0,0.0,0.0,0.0
50%,1.0,,0.1221,268.95,55714.0,12.665,707.0,4139.958333,8596.0,46.4,1.0,0.0,0.0,0.0
75%,1.0,,0.1407,432.7625,80004.0,17.95,737.0,5730.0,18249.5,71.0,2.0,0.0,0.0,0.0


In [5]:
data.isnull().sum()

credit.policy         0
purpose               0
int.rate              0
installment           0
annual_inc            4
dti                   0
fico                  0
days.with.cr.line    29
revol.bal             0
revol.util           62
inq.last.6mths       29
delinq.2yrs          29
pub.rec              29
not_fully_paid        0
dtype: int64

In [6]:
data["annual_inc"].median()

55714.00004

In [7]:
data["annual_inc"].fillna(data["annual_inc"].median(), inplace = True) 

In [8]:
data.isnull().sum()

credit.policy         0
purpose               0
int.rate              0
installment           0
annual_inc            0
dti                   0
fico                  0
days.with.cr.line    29
revol.bal             0
revol.util           62
inq.last.6mths       29
delinq.2yrs          29
pub.rec              29
not_fully_paid        0
dtype: int64

In [9]:
data["days.with.cr.line"].fillna(data["days.with.cr.line"].median(), inplace = True) 

In [10]:
data.isnull().sum()

credit.policy         0
purpose               0
int.rate              0
installment           0
annual_inc            0
dti                   0
fico                  0
days.with.cr.line     0
revol.bal             0
revol.util           62
inq.last.6mths       29
delinq.2yrs          29
pub.rec              29
not_fully_paid        0
dtype: int64

In [11]:
data["revol.util"].fillna(data["revol.util"].median(), inplace = True) 

In [12]:
data.isnull().sum()

credit.policy         0
purpose               0
int.rate              0
installment           0
annual_inc            0
dti                   0
fico                  0
days.with.cr.line     0
revol.bal             0
revol.util            0
inq.last.6mths       29
delinq.2yrs          29
pub.rec              29
not_fully_paid        0
dtype: int64

In [13]:
data["inq.last.6mths"].fillna(data["inq.last.6mths"].median(), inplace = True) 


In [14]:
data.isnull().sum()

credit.policy         0
purpose               0
int.rate              0
installment           0
annual_inc            0
dti                   0
fico                  0
days.with.cr.line     0
revol.bal             0
revol.util            0
inq.last.6mths        0
delinq.2yrs          29
pub.rec              29
not_fully_paid        0
dtype: int64

In [15]:
data["delinq.2yrs"].fillna(data["delinq.2yrs"].median(), inplace = True) 

In [16]:
data.isnull().sum()

credit.policy         0
purpose               0
int.rate              0
installment           0
annual_inc            0
dti                   0
fico                  0
days.with.cr.line     0
revol.bal             0
revol.util            0
inq.last.6mths        0
delinq.2yrs           0
pub.rec              29
not_fully_paid        0
dtype: int64

In [17]:
data["pub.rec"].fillna(data["pub.rec"].median(), inplace = True) 

In [18]:
data.isnull().sum()

credit.policy        0
purpose              0
int.rate             0
installment          0
annual_inc           0
dti                  0
fico                 0
days.with.cr.line    0
revol.bal            0
revol.util           0
inq.last.6mths       0
delinq.2yrs          0
pub.rec              0
not_fully_paid       0
dtype: int64

In [19]:
df=pd.get_dummies(data,drop_first=True) # drop first Categorical data and create dummy variable
df.head()

Unnamed: 0,credit.policy,int.rate,installment,annual_inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not_fully_paid,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
0,1,0.1189,829.1,85000.00038,19.48,737,5639.958333,28854,52.1,0.0,0.0,0.0,0,0,1,0,0,0,0
1,1,0.1071,228.22,65000.00007,14.29,707,2760.0,33623,76.7,0.0,0.0,0.0,0,1,0,0,0,0,0
2,1,0.1357,366.86,31999.99994,11.63,682,4710.0,3511,25.6,1.0,0.0,0.0,0,0,1,0,0,0,0
3,1,0.1008,162.34,85000.00038,8.1,712,2699.958333,33667,73.2,1.0,0.0,0.0,0,0,1,0,0,0,0
4,1,0.1426,102.92,80799.99964,14.97,667,4066.0,4740,39.5,0.0,1.0,0.0,0,1,0,0,0,0,0


In [20]:
df.shape

(9578, 19)

In [21]:
data.shape

(9578, 14)

In [22]:
x=df.iloc[:,[0,2,4,5,7,9,10,11,13,14,15,16,17,18]]
y=df["not_fully_paid"]


In [23]:
type(y)

pandas.core.series.Series

In [24]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=1) # test size is for proportion
lg=LogisticRegression()   # logistic Regression
lg.fit(x_train,y_train)      
pred_train=lg.predict(x_train)      # predict
cm=pd.crosstab(y_train,pred_train)
cm

col_0,0,1
not_fully_paid,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5631,17
1,1046,10


In [25]:
pred1=lg.predict(x_test)
pred1


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [26]:
predprob=lg.predict_proba(x_test) # predict the probability instead of 1's and 0's directly 

predprob1=predprob[:,1]
x=[]
for i in predprob1:
    if i>0.4:                      # Create a threshold 
        x.append(1)
    else:
        x.append(0)
    

In [28]:
pd.crosstab(y_train,np.array(x))  #convert it into array

ValueError: Shape of passed values is (2, 2874), indices imply (2, 6704)

In [None]:
predprob[:,1]

In [None]:
type(pred1)

In [None]:
acc=(5631+10)/x_train.shape[0]   # accuracy
acc


In [None]:
cm1=pd.crosstab(y_test,pred1)
type(cm1)
cm1

In [None]:
acc=(2393+11)/x_test.shape[0]   # accuracy
acc


In [None]:
lg.score(x_test,y_test)

In [None]:
lg.score(x_train,y_train)

In [None]:

X2 = add_constant(x_train)

logit=sm.Logit(y_train,X2)


In [None]:
result=logit.fit()

In [None]:
print(result.summary())

In [None]:
df.head()

In [None]:
df.groupby("not_fully_paid")["int.rate","pub.rec"].agg([np.mean,np.std])

In [None]:
df["not_fully_paid"].value_counts()

# Day 4

In [None]:
from sklearn import metrics

In [None]:
fpr,tpr,threshold=metrics.roc_curve(y_test,predprob[:,1])   # fpr -False posititve Ratio, tpr- True positive ratio
plt.plot(fpr,tpr,label="ROC curve",color="b")
plt.axes().set_aspect("equal")
plt.xlim([-0.05,1.05])  # if we donot mention this the graph will start from 0 and end in 1
plt.ylim([-0.05,1.05])  # if we donot mention this the graph will start from 0 and end in 1

# Titanic Data


In [None]:
df1=pd.read_csv("Titanic_train.csv")
df2=pd.read_csv("Titanic_test.csv")


In [None]:
df2["Survived"]=-111

In [None]:
df1.head()

In [None]:
df2.head()

In [None]:
df=pd.concat([df1,df2],axis=0,sort=False)

In [None]:
df.head()

In [None]:
df.describe(include='all')

In [None]:
df.isnull().sum()

In [None]:
df3=df[df["Fare"].isna()]
df3


In [None]:
df.Sex.value_counts()

In [None]:
df.groupby(['Pclass', 'Embarked']).Fare.median()

In [None]:
median_fare = df.loc[(df.Pclass == 3) & (df.Embarked == 'S'),'Fare'].median()
print(median_fare)

In [None]:
df["Fare"].fillna(median_fare,inplace=True)

In [None]:
df.head()

In [None]:
df.isnull().sum()

In [None]:
df[df["Embarked"].isna()]

In [None]:
df[df["Cabin"]=="B28"]

In [None]:
a=df[df["Pclass"]==1]
a[a["Fare"]==80].Embarked.mode()

In [None]:
df["Embarked"].fillna("C",inplace=True)

In [None]:
df[df.Age.isnull()]

In [None]:
def GetTitle(name):
    first_name_with_title = name.split(',')[1]
    title = first_name_with_title.split('.')[0]
    title = title.strip().lower()
    return title

In [None]:
df.Name.map(lambda x : GetTitle(x))

In [None]:
df['Title'] =  df.Name.map(lambda x : GetTitle(x))
df.Title

In [None]:
title_age_median = df.groupby('Title').Age.transform('median')
df.Age.fillna(title_age_median , inplace=True)

In [None]:
del(df["Cabin"])

In [None]:
df.info()

In [None]:
df1.shape

In [None]:
df2.shape


In [None]:
del(df["Name"])

In [None]:
del(df["Title"])

In [None]:
del(df["Ticket"])

In [None]:
newdata=pd.get_dummies(df,drop_first=True) # drop first Categorical data and create dummy variable
newdata.head()

In [None]:
dfn=newdata

In [None]:
dfn.shape

In [None]:
train=dfn.head(891)
test=dfn.tail(418)



In [None]:
x=train.iloc[:,[2,3,4,6,7]]
y=train["Survived"]


In [None]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=1)

In [None]:
lg=LogisticRegression()   # logistic Regression
lg.fit(x_train,y_train)      
pred_train=lg.predict(x_train)      # predict
cm=pd.crosstab(y_train,pred_train)
cm

In [None]:
pred1=lg.predict(x_test)
pred1


In [None]:
cm=pd.crosstab(y_test,pred1)
cm

In [None]:
lg.score(x_test,y_test)

In [None]:
acc=(135+74)/x_test.shape[0]
acc

In [None]:
x_train=train.iloc[:,[2,3,4,6,7]]
x_test=test.iloc[:,[2,3,4,6,7]]
y_train=train["Survived"]
y_test=test["Survived"]

In [None]:
x_train

In [None]:
lg=LogisticRegression()   # logistic Regression
lg.fit(x_train,y_train)      
 

In [None]:
survived_pred=lg.predict(x_test)
survived_pred.sum()

In [None]:
test["Survived"]=survived_pred

In [None]:
test.head()

In [None]:
test_file=test[["PassengerId","Survived"]]

In [None]:
test_file.to_csv("Titanic.csv",index=False)

# Day 4

In [None]:
df=pd.read_csv("parole1.1.csv")

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.isnull().sum()

In [None]:
def get(x):
    if x==1:
        y="Male"
    else:
        y="Female"
    return y

In [None]:
y=df.male.map(lambda x: get(x))    
df["Gender"]=y

In [None]:
df.head(10)

In [None]:
df.state.nunique()

In [None]:
def get(x):
    if x==1:
        y="State1"
    elif x==2:
        y="State2"
    elif x==3:
        y="State3"
    elif x==4:
        y="State4"
    return y

In [None]:
y=df.state.map(lambda x: get(x))    
df["state"]=y

In [None]:
df.tail(10)

In [None]:
del(df["male"])

In [None]:
df.race.nunique()

In [None]:
def get(x):
    if x==1:
        y="Race1"
    else:
        y="Race2"
    
    return y

In [None]:
y=df.race.map(lambda x: get(x))    
df["race"]=y

In [None]:
df.head()

In [None]:
df.crime.nunique()

In [None]:
def get(x):
    if x==1:
        y="Crime1"
    elif x==2:
        y="Crime2"
    elif x==3:
        y="Crime3"
    elif x==4:
        y="Crime4"
    return y

In [None]:
y=df.crime.map(lambda x: get(x))    
df["crime"]=y

In [None]:
df=pd.get_dummies(df,drop_first=True) # drop first Categorical data and create dummy variable
df.head()
df.shape


In [None]:
df.head()

In [None]:
x=df.iloc[:,[1,2,3,5,6,7,8,9,10,11,12]]
y=df["violator"]

In [None]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=1) # test size is for proportion , if we dont provide test_size then by default it will take 0.25
lg=LogisticRegression()   # logistic Regression
lg.fit(x_train,y_train)      
pred_train=lg.predict(x_train)      # predict
cm=pd.crosstab(y_train,pred_train)
cm

In [None]:
pred_test=lg.predict(x_test)  

In [None]:
cm=pd.crosstab(y_test,pred_test)
cm


In [None]:
lg.score(x_test,y_test)

In [None]:
predprob=lg.predict_proba(x_test)

In [None]:
from sklearn import metrics

In [None]:

fpr,tpr,threshold=metrics.roc_curve(y_test,predprob[:,1])   # fpr -False posititve Ratio, tpr- True positive ratio
plt.plot(fpr,tpr,label="ROC curve",color="b")
plt.axes().set_aspect("equal")
plt.xlim([-0.05,1.05])  # if we donot mention this the graph will start from 0 and end in 1
plt.ylim([-0.05,1.05])  # if we donot mention this the graph will start from 0 and end in 1

In [None]:
# Area Under the curve
metrics.auc(fpr,tpr)

In [None]:
tp=cm[1][1]
tn=cm[0][0]
fn=cm[0][1]
fp=cm[1][0]

accu=((tn+tp)/(tp+tn+fp+fn))
error=((fn+fp)/(tp+tn+fp+fn))
sensi=(tp/(tp+fn))
preci=(tp/(tp+fp))
speci=(tn/(tn+fp))

print("Accuracy:",accu)
print("Error:",error)
print("Sensitivity:",sensi)
print("Precision:",preci)
print("Specificity:",speci)

In [None]:
predprob1=predprob[:,1]
x=[]
for i in predprob1:
    if i>0.4:                      # Create a threshold 
        x.append(1)
    else:
        x.append(0)

In [None]:
# Area Under the curve
metrics.auc(fpr,tpr)