In [None]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

# libraries used for graphs and visualization
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns

# libraries used for missing value 
import missingno as msno 

#libraries used for preprocessing and modeling
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.svm import SVC


from sklearn.metrics import accuracy_score, precision_score, roc_curve, f1_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV





In [None]:
# Load the raw Lending Club data from a CSV file in the working directory.
df = pd.read_csv('Lending_Club_Data.csv')

In [None]:
# Summary statistics for the columns of the raw frame.
df.describe()

In [None]:
# Work on an independent (deep) copy so the raw frame `df` is left untouched.
loan_df = df.copy(deep=True)

In [None]:
# Number of missing values per column.
loan_df.isnull().sum()

In [None]:
# missingno matrix plot of loan_df (visual overview of where values are missing).
msno.matrix(loan_df)

In [None]:
# missingno bar chart of loan_df (per-column view of missing data).
msno.bar(loan_df)

In [None]:
# Peek at the first five rows.
loan_df.head(n=5)

In [None]:
# Frequency of each employer title, most common first.
loan_df['emp_title'].value_counts()

In [None]:
# The five least frequent employer titles.
loan_df['emp_title'].value_counts().tail(5)

In [None]:
# Shape of the array of distinct employer titles (i.e. how many there are).
loan_df['emp_title'].unique().shape

In [None]:
import pandas as pd

# Toy example: build a two-column frame, then remove column 'B'.
data = dict(A=[1, 2, 3], B=[4, 5, 6])
df = pd.DataFrame(data)

# Column removal expressed through the axis argument.
df.drop('B', axis=1, inplace=True)
df


In [None]:
# Toy example: a frame in which two of the three rows contain a missing value.
data = dict(A=[1, 2, None], B=[4, None, 6])
df = pd.DataFrame(data)

# Keep only the rows that are complete (no NaN in any column).
df.dropna(axis=0, how='any', inplace=True)
df

In [None]:
import pandas as pd
import numpy as np

# Toy example: every one of the three columns contains exactly one missing value.
data = dict(A=[1, 2, None], B=[4, None, 6], C=[None, 8, 9])
df = pd.DataFrame(data)

# Show the frame before any column is removed.
print("Original DataFrame:")
print(df)

# Remove every column that holds at least one NaN (here: all of them).
df.dropna(axis='columns', how='any', inplace=True)

# Show what is left afterwards.
print("\nDataFrame after dropping columns with NaN values:")
print(df)


In [None]:
import numpy as np

# Toy 5x5 frame holding 0..24, with letter row labels and letter column labels.
df = pd.DataFrame(
    np.arange(25).reshape(5, -1),
    index=['a', 'b', 'c', 'd', 'e'],
    columns=list('xyzab'),
)
df

In [None]:
# Label-based slicing (both ends inclusive): rows 'c'..'d', columns from the first up to 'z'.
df.loc[slice('c', 'd'), slice(None, 'z')]

In [None]:
# Position-based selection: all rows of the column at integer position 3.
df.iloc[:, 3] 

In [None]:
# emp_title is free text, so the data won't be meaningful as-is and any relationship
# we observed might be due to confounding relationships.
# A more advanced implementation might group these job descriptions into categories and/or
# examine whether Lending Club's model looks at (annual_inc + emp_title) versus just annual_inc.
# `columns=` is used instead of a positional axis argument, which pandas 2.x no longer accepts.
loan_df.drop(columns=['emp_title'], inplace=True)

In [None]:
# Drop columns with misleading values, more than 50% missing values,
# identity values, or encrypted values.
# Each label is listed once (the original list repeated 'collections_12_mths_ex_med').
columns_to_drop = [
    'mths_since_last_delinq',
    'mths_since_last_record',
    'collections_12_mths_ex_med',
    'Notes',
    'purpose',
    'earliest_cr_line',
    'Id',
    'initial_list_status',
    'zip_code',
]
loan_df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Distribution of the employment-length values before cleaning.
loan_df['emp_length'].value_counts()

In [None]:
# Convert emp_length to numbers (anything unparseable becomes NaN), then fill
# the gaps with the column median (the fill value is a median, despite the name `avg_value`).
emp_length_numeric = pd.to_numeric(loan_df['emp_length'], errors='coerce')
avg_value = emp_length_numeric.median()
loan_df['emp_length'] = emp_length_numeric.fillna(avg_value)
loan_df

In [None]:
# Re-check the number of missing values per column.
loan_df.isna().sum()

In [None]:
# Note: "VERIFIED - income" and "VERIFIED - income source" could potentially be the same criterion.
loan_df['verification_status'].value_counts()

In [None]:
# Fill missing values in every numeric column with that column's median.
# Uses the public select_dtypes API instead of the private _get_numeric_data,
# and fills all columns in one vectorised call rather than looping.
numeric_cols = loan_df.select_dtypes(include='number').columns
loan_df[numeric_cols] = loan_df[numeric_cols].fillna(loan_df[numeric_cols].median())

In [None]:
# 3x3 integer matrix holding 1..9 in row-major order.
a = np.arange(1, 10).reshape(3, 3)

In [None]:
# Upper triangle of `a` including the main diagonal (k defaults to 0).
tri_upper_diag = np.triu(a)
tri_upper_diag

In [None]:
# Lower triangle of `a` including the main diagonal (k defaults to 0).
tri_lower_diag = np.tril(a)
tri_lower_diag

# EDA on Lending Club data


In [None]:
# Pairwise correlation of the numeric (and boolean) columns.
# The columns are selected explicitly because DataFrame.corr() on a frame that
# still contains text columns raises on pandas >= 2.0 instead of skipping them.
cor = loan_df.select_dtypes(include=['number', 'bool']).corr()
cor

In [None]:
# Highly correlated attribute pairs (|r| > 0.55).
# Only the numeric/boolean columns are correlated (text columns make corr() raise on pandas >= 2.0).
cor = loan_df.select_dtypes(include=['number', 'bool']).corr()

# Keep only the strict lower triangle so that each pair is listed once and the
# trivial self-correlations (always 1.0) on the diagonal are not reported.
lower_triangle = np.tril(np.ones(cor.shape, dtype=bool), k=-1)
cor = cor.where(lower_triangle).stack()
cor[cor.abs() > 0.55]

In [None]:
# Remove the total_acc column from loan_df in place.
loan_df.drop(columns='total_acc', inplace=True)

In [None]:
# Shape of the array of distinct pymnt_plan values.
loan_df['pymnt_plan'].unique().shape

In [None]:
# How often each pymnt_plan value occurs.
loan_df['pymnt_plan'].value_counts()

In [None]:
# Dropping pymnt_plan: it is an identity attribute.
# `columns=` replaces the positional axis argument, which pandas 2.x no longer accepts.
loan_df.drop(columns=['pymnt_plan'], inplace=True)

In [None]:
# Summary of the object-dtype (text) columns only.
loan_df.describe(include=['object'])

In [None]:
# Number of columns remaining in loan_df.
loan_df.shape[1]

In [None]:
# Correlation matrix of the numeric (and boolean) columns.
# Text columns are excluded explicitly: DataFrame.corr() raises on them on pandas >= 2.0.
loan_cor = loan_df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Heat map of the correlation matrix.
# Mask the upper triangle (diagonal included) so each pair is drawn only once.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
mask = np.zeros_like(loan_cor, dtype=bool)
mask[np.triu_indices_from(mask)] = True

f, ax = plt.subplots(figsize=(10, 9))

# Diverging palette so positive and negative correlations get distinct hues.
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw onto the axes created above instead of relying on the implicit current axes.
sns.heatmap(loan_cor, mask=mask,
            vmax=.3,
            cmap=cmap,
            square=True,
            linewidths=.5,
            cbar_kws={"shrink": .5},
            ax=ax)

In [None]:
# Loan purpose: count of loans per purpose category, drawn as a horizontal bar chart.
purpose = loan_df['purpose_cat'].value_counts()
purpose.plot(kind='barh', figsize=(10, 5), color="blue")

In [None]:
# Loans per purpose category, split by the is_bad flag.
dist = sns.countplot(x='purpose_cat', hue='is_bad', data=loan_df)
# Tilt the category labels so that long names stay readable.
dist.set_xticklabels(dist.get_xticklabels(), rotation=40, ha="right")
dist.set_title('Loan distribution')
dist.set_xlabel('purpose')
dist.set_ylabel('status')

In [None]:
# Loans per verification status, split by the is_bad flag.
dist = sns.countplot(data=loan_df, hue='is_bad', x='verification_status')
# Tilt the category labels so that long names stay readable.
dist.set_xticklabels(dist.get_xticklabels(), rotation=40, ha="right")
plt.title('Loan distribution')
# The x axis shows verification_status; the original label 'purpose' was copied
# from the previous plot and did not match the data.
plt.xlabel('verification status')
plt.ylabel('status')

In [None]:
# Split the loans into the two is_bad classes (1 = bad, 0 = not bad).
bad_flag = loan_df['is_bad']
is_bad_loan = loan_df.loc[bad_flag == 1]
not_bad_loan = loan_df.loc[bad_flag == 0]

In [None]:
# Sizes of the two classes and of the full target column.
# Uses the frames actually defined earlier in the notebook (is_bad_loan / not_bad_loan);
# the *_data names referenced before were never defined and raised NameError.
print(not_bad_loan.shape)
print(is_bad_loan.shape)
print(loan_df['is_bad'].shape)

In [None]:
# First five bad loans.
is_bad_loan.head(n=5)

In [None]:
# Annual income vs. debt-to-income ratio for the bad loans.
# Both axes come from is_bad_loan (the y axis previously used the undefined name is_bad_data).
plt.scatter(x=is_bad_loan['annual_inc'], y=is_bad_loan['debt_to_income'])

In [None]:
# Annual income vs. debt-to-income ratio for the loans that are not bad.
# Uses not_bad_loan, the frame defined earlier (not_bad_data was never defined).
plt.scatter(x=not_bad_loan['annual_inc'], y=not_bad_loan['debt_to_income'])

In [None]:
# Annual income vs. debt-to-income ratio, coloured by the is_bad flag.
# `height` replaces the `size` keyword, which seaborn renamed in 0.9 and later removed.
# The legend title typo ('Staus') is corrected.
# NOTE(review): the legend labels are passed in a fixed order; confirm that
# 'Approved'/'Rejected' really correspond to is_bad == 0 / 1.
sns.FacetGrid(loan_df, hue="is_bad", height=5) \
   .map(plt.scatter, "annual_inc", "debt_to_income") \
   .add_legend(title='Status', labels=['Approved', 'Rejected'])

In [None]:
# Current state of the frame (first five rows).
loan_df.head(n=5)

In [None]:
# How often each home_ownership value occurs.
loan_df['home_ownership'].value_counts()

In [None]:
# How often each addr_state value occurs.
loan_df['addr_state'].value_counts()

In [None]:
# How often each policy_code value occurs.
loan_df['policy_code'].value_counts()

In [None]:
# Dropping policy_code: not relevant for this investigation.
# `columns=` replaces the positional axis argument, which pandas 2.x no longer accepts.
loan_df.drop(columns=['policy_code'], inplace=True)

In [None]:
# Dropping addr_state: not relevant for this investigation.
# `columns=` replaces the positional axis argument, which pandas 2.x no longer accepts.
loan_df.drop(columns=['addr_state'], inplace=True)

In [None]:
# Dealing with categorical attributes.
# One-hot encode each categorical feature in turn; drop_first=True omits the
# first level of each feature from the generated dummy columns.
for categorical_col in ("purpose_cat", "home_ownership", "verification_status"):
    loan_df = pd.get_dummies(loan_df, columns=[categorical_col], drop_first=True)

In [None]:
# Frame after one-hot encoding (first five rows).
loan_df.head(n=5)

In [None]:
# Names of all columns, and of the independent (feature) columns.
my_list = list(loan_df)
# Exclude the target by name rather than by position: slicing with [1:] only
# works while 'is_bad' happens to be the first column, and would otherwise
# leak the target into the features.
Independent = [col for col in my_list if col != 'is_bad']

In [None]:
# Feature matrix: all rows, only the independent columns.
X = loan_df.loc[:, Independent]

In [None]:
# Name of the dependent (target) column, kept in a list.
depended = ['is_bad']

In [None]:
# Target as a single-column DataFrame (selected with a list of column names).
Y = loan_df.loc[:, depended]
Y.shape

In [None]:
# (rows, columns) of the feature matrix.
X.shape

In [None]:
# Final look at the prepared frame (first five rows).
loan_df.head(n=5)

# Modeling and Feature Selection

## Random Forest

In [None]:
# Split the dataset into a training set and a test set (70/30, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

# train_test_split hands back slices of X; take explicit copies so the scaled
# values can be written back without triggering SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Columns to standardise. `num` was never defined in the notebook (NameError);
# it is defined here as every numeric feature except the one-hot dummy columns
# produced by pd.get_dummies for the three categorical features.
# NOTE(review): confirm this matches the originally intended column list.
dummy_prefixes = ('purpose_cat_', 'home_ownership_', 'verification_status_')
num = [col for col in X_train.select_dtypes(include='number').columns
       if not col.startswith(dummy_prefixes)]

# Feature scaling: fit on the training data only, then apply the same
# transformation to the test data so no test information leaks into the scaler.
sc = StandardScaler()

X_train[num] = sc.fit_transform(X_train[num])
X_test[num] = sc.transform(X_test[num])

In [None]:
# (rows, columns) of the training features.
X_train.shape

In [None]:
# Shape of the training target.
y_train.shape

In [None]:
# Random forest with shallow trees; class 1 (bad loan) is weighted 5x relative to class 0.
# random_state is fixed (same seed as the train/test split) so that the fitted
# forest, and therefore every reported metric, is reproducible across runs.
rf_classifier = RandomForestClassifier(criterion='entropy',
                                       max_depth=4,
                                       max_features='log2',
                                       n_estimators=100,
                                       min_samples_split=6,
                                       class_weight={0: 1, 1: 5},
                                       random_state=0
                                       )

In [None]:
# Fit the forest. y_train is a single-column DataFrame; flatten it to the 1-D
# target scikit-learn expects (a column vector triggers a DataConversionWarning).
rf_classifier.fit(X_train, np.ravel(y_train))

In [None]:
# Random-forest predictions on the training features.
y_pred_train_rf= rf_classifier.predict(X_train)

In [None]:
# Random-forest metrics on the training set, printed one per line.
for metric_label, metric_fn in (('Precision', precision_score),
                                ('Accuracy', accuracy_score),
                                ('F1 Score', f1_score),
                                ('Recall', recall_score)):
    print(metric_label, metric_fn(y_train, y_pred_train_rf))

In [None]:
# Random-forest predictions on the test features.
y_pred_test_rf = rf_classifier.predict(X_test)

In [None]:
# Random-forest metrics on the test set, printed one per line.
for metric_label, metric_fn in (('Precision', precision_score),
                                ('Accuracy', accuracy_score),
                                ('F1 Score', f1_score),
                                ('Recall', recall_score)):
    print(metric_label, metric_fn(y_test, y_pred_test_rf))

In [None]:
# Confusion matrix of the random forest on the test set.
# Uses y_pred_test_rf, the predictions computed earlier (y_pred_test was never defined).
cm_rf = confusion_matrix(y_test, y_pred_test_rf)
print(cm_rf)

## SVM

In [None]:
# Support-vector classifier with probability estimates enabled (needed for predict_proba / ROC).
# The probability calibration shuffles data internally, so random_state is fixed
# to keep the results reproducible.
svm_classifier = SVC(probability=True, random_state=0)
# y_train is a single-column DataFrame; flatten it to the 1-D target scikit-learn expects.
svm_classifier.fit(X_train, np.ravel(y_train))

In [None]:
# SVM predictions on the training features.
y_pred_train_svm= svm_classifier.predict(X_train)

In [None]:
# SVM metrics on the training set, printed one per line.
for metric_label, metric_fn in (('Precision', precision_score),
                                ('Accuracy', accuracy_score),
                                ('F1 Score', f1_score),
                                ('Recall', recall_score)):
    print(metric_label, metric_fn(y_train, y_pred_train_svm))

In [None]:
# SVM predictions on the test features.
y_pred_test_svm = svm_classifier.predict(X_test)

In [None]:
# SVM metrics on the test set, printed one per line.
for metric_label, metric_fn in (('Precision', precision_score),
                                ('Accuracy', accuracy_score),
                                ('F1 Score', f1_score),
                                ('Recall', recall_score)):
    print(metric_label, metric_fn(y_test, y_pred_test_svm))

In [None]:
# Confusion matrix of the SVM on the test set.
cm_svm = confusion_matrix(y_true=y_test, y_pred=y_pred_test_svm)
print(cm_svm)

In [None]:
import pandas as pd

# Random-forest feature importances, one row per training column, largest first.
feature_importances = pd.DataFrame(
    {'importance': rf_classifier.feature_importances_},
    index=X_train.columns,
)
feature_importances = feature_importances.sort_values(by='importance', ascending=False)

In [None]:
# Bar plot of the ten most important features.
top_features = feature_importances.head(10)
sns.barplot(x=top_features['importance'], y=top_features.index)

In [None]:
# Class-probability estimates on the test set for both models.
# The DataFrame is passed as-is (not `.values`): both models were fitted on
# DataFrames, so stripping the column names makes scikit-learn warn about
# missing feature names and is inconsistent with the predict() calls.
y_scores_sm = svm_classifier.predict_proba(X_test)
y_scores_rf = rf_classifier.predict_proba(X_test)

In [None]:
# ROC curve points on the test set; column 1 of predict_proba is the score for class 1.
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_true=y_test, y_score=y_scores_rf[:, 1])
fpr_svm, tpr_svm, thresholds_svm = roc_curve(y_true=y_test, y_score=y_scores_sm[:, 1])

In [None]:
# ROC curves of both models on the test set, with the chance diagonal for reference.
plt.figure(figsize=(10, 10))

for curve_fpr, curve_tpr, curve_label in ((fpr_rf, tpr_rf, 'Random Forest'),
                                          (fpr_svm, tpr_svm, 'SVM')):
    plt.plot(curve_fpr, curve_tpr, label=curve_label)

# Dashed diagonal from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()

In [None]:
# Class-probability estimates on the training set for both models.
# The DataFrame is passed as-is (not `.values`): both models were fitted on
# DataFrames, so stripping the column names makes scikit-learn warn about
# missing feature names and is inconsistent with the predict() calls.
y_scores_sm_train = svm_classifier.predict_proba(X_train)
y_scores_rf_train = rf_classifier.predict_proba(X_train)

In [None]:
# ROC curve points on the training set; column 1 of predict_proba is the score for class 1.
fpr_rf_train, tpr_rf_train, thresholds_rf_train = roc_curve(
    y_true=y_train, y_score=y_scores_rf_train[:, 1])
fpr_svm_train, tpr_svm_train, thresholds_svm_train = roc_curve(
    y_true=y_train, y_score=y_scores_sm_train[:, 1])

In [None]:
# ROC curves of both models on the training set, with the chance diagonal for reference.
plt.figure(figsize=(10, 10))

for curve_fpr, curve_tpr, curve_label in ((fpr_rf_train, tpr_rf_train, 'Random Forest'),
                                          (fpr_svm_train, tpr_svm_train, 'SVM')):
    plt.plot(curve_fpr, curve_tpr, label=curve_label)

# Dashed diagonal from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()

# Saving a Model

In [None]:
import pickle

# Bundle the fitted scaler and both classifiers under descriptive keys so they
# can be serialised together.
dict_objects = {}
dict_objects['Standard Scaling'] = sc
dict_objects['Random Forest Classifier'] = rf_classifier
dict_objects['SVM Classifier'] = svm_classifier

In [None]:
# Open the pickle target for binary writing; the handle is used by the next cell.
filename = 'ModelPickles.pkl'
outfile = open(filename, mode='wb')

In [None]:
# Serialise the model bundle. Using the already-open file object as a context
# manager guarantees the handle is closed even if pickle.dump raises.
with outfile:
    pickle.dump(dict_objects, outfile)

In [None]:
import googleapiclient.discovery

def predict_json(project, model, instances, version=None):
    """Request online predictions from a deployed AI Platform Prediction model.

    Args:
        project (str): project in which the AI Platform Prediction model is deployed.
        model (str): name of the model.
        instances ([[float]]): input instances; each instance is a list of floats.
        version (str, optional): model version to target. When None, no version
            segment is added to the resource name.

    Returns:
        The value stored under 'predictions' in the service response.

    Raises:
        RuntimeError: if the response contains an 'error' entry.
    """
    # Resource name of the model, optionally narrowed down to one version.
    resource_name = f'projects/{project}/models/{model}'
    if version is not None:
        resource_name = f'{resource_name}/versions/{version}'

    # Build the AI Platform Prediction service client.
    # Authentication relies on the environment variable
    # GOOGLE_APPLICATION_CREDENTIALS=<path_to_service_account_file>
    ml_service = googleapiclient.discovery.build('ml', 'v1')

    request = ml_service.projects().predict(
        name=resource_name,
        body={'instances': instances},
    )
    response = request.execute()

    # Surface service-side failures as an exception instead of returning them.
    if 'error' in response:
        raise RuntimeError(response['error'])

    return response['predictions']

In [None]:
import json
import os  # required for os.environ below; it was never imported (NameError)

# Point the Google client library at the service-account key file.
# The value is a placeholder path - replace it with a real key file and keep
# that file out of version control.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'xxxxxxxxxxxxx.json'

# A one-row slice (20:21) keeps the 2-D (1, n_features) layout, so no reshape is
# needed; the earlier `test.shape` / `test.reshape(1, -1)` statements discarded
# their results and had no effect.
test = X_test.iloc[20:21].values
instances = test.tolist()
predict_json('mwpmltr', 'lending_club_model', instances)