In [3]:
import pandas as pd
import sweetviz as sv
import numpy as np
from sklearn.model_selection import train_test_split
# read_date

# Raise pandas' column display limit to 100 so wide frames are shown in full
# when a cell renders them (e.g. via `.head()`).
pd.set_option('display.max_columns', 100)

def read_csv(path="loan.csv", max_missing_ratio=0.5, max_dominant_ratio=0.9):
    """Load the loan CSV and drop columns that carry little information.

    Parameters
    ----------
    path : str or path-like, default "loan.csv"
        CSV file to read. The ``issue_d`` column is parsed as datetime.
    max_missing_ratio : float, default 0.5
        A column is kept only if at least ``(1 - max_missing_ratio)`` of its
        rows are non-null.
    max_dominant_ratio : float, default 0.9
        A column is dropped when its single most frequent (non-null) value
        covers more than this fraction of all rows.

    Returns
    -------
    pandas.DataFrame
        The loaded frame without the sparse / near-constant columns.
    """
    df = pd.read_csv(path, parse_dates=['issue_d'])
    n_rows = len(df)

    # `thresh` is the minimum number of non-null values a column must have;
    # round up so it is passed as the integer pandas documents.
    min_non_null = int(np.ceil(n_rows * (1.0 - max_missing_ratio)))
    df = df.dropna(thresh=min_non_null, axis=1)

    # Collect near-constant columns first and drop them in one call instead of
    # re-copying the frame once per dropped column.
    dominant_cols = []
    for col in df.columns:
        counts = df[col].value_counts()
        # `counts` is empty for an empty frame or an all-NaN column; such a
        # column has no dominant value, so skip it rather than index into it.
        if not counts.empty and counts.iloc[0] > n_rows * max_dominant_ratio:
            dominant_cols.append(col)
    return df.drop(columns=dominant_cols)

def make_train_test(df, test_size=0.2, random_state=71):
    """Build a binary-labelled train/test split from the loan frame.

    Rows whose ``loan_status`` is ``'Current'`` are removed, the remaining
    statuses are mapped to ``Fully Paid -> 0`` / ``Charged Off -> 1``, and the
    result is split with ``train_test_split``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``loan_status`` column.
    test_size : float, default 0.2
        Fraction of rows placed in the test split.
    random_state : int or None, default 71
        Seed for the shuffle so that re-running the notebook reproduces the
        same split. Pass ``None`` for a different split on every call.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``, each with a fresh 0..n-1 index.
    """
    label_map = {"Fully Paid": 0, "Charged Off": 1}

    finished = df[df['loan_status'] != 'Current'].reset_index(drop=True)
    # NOTE: any status other than the two keys of `label_map` becomes NaN here.
    finished["loan_status"] = finished["loan_status"].map(label_map)

    X = finished.drop('loan_status', axis=1)
    y = finished['loan_status']
    splits = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Return re-indexed copies instead of resetting in place, which avoids
    # mutating the objects handed back by train_test_split.
    X_train, X_test, y_train, y_test = (part.reset_index(drop=True) for part in splits)
    return X_train, X_test, y_train, y_test

# Load the filtered loan frame and build the train/test split used by every
# later cell (X_train, X_test, y_train, y_test are notebook-level globals).
df = read_csv()
X_train, X_test, y_train, y_test = make_train_test(df)

# Optional exploratory report, left disabled:
# # check data by sweetviz
# my_report = sv.analyze(df)
# my_report.show_html() # Default arguments will generate to "SWEETVIZ_REPORT.html"

# Preview the raw training features.
X_train.head()

  df = read_csv()


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,open_acc,revol_bal,revol_util,total_acc,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,recoveries,last_pymnt_d,last_pymnt_amnt,last_credit_pull_d
0,482059,613124,4200,4200,4075.0,36 months,7.51%,130.66,A,A4,Pentegra Retirement Services,2 years,RENT,40000.0,Not Verified,Feb-10,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 02/02/10 > I could easily ...,major_purchase,Full Return plus Interest,105xx,NY,15.57,0,Sep-03,2,6,1801,62.10%,12,4703.910242,4563.91,4200.0,503.91,0.0,Feb-13,148.47,Sep-13
1,600497,770719,8000,8000,7996.782656,36 months,5.42%,241.28,A,A1,Kendall College/Laureate University,1 year,MORTGAGE,53000.0,Not Verified,Oct-10,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 10/21/10 > I will set up p...,credit_card,Debt FREE,605xx,IL,8.85,0,Oct-98,0,15,10327,20.10%,37,8687.009702,8683.26,8000.0,687.01,0.0,Nov-13,263.43,Nov-13
2,565437,727473,13750,13750,13725.0,36 months,10.75%,448.54,B,B2,resurrection healthcare,4 years,MORTGAGE,70000.0,Verified,Aug-10,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 08/16/10 > This loan will ...,debt_consolidation,debt consalidation,600xx,IL,3.69,0,Aug-04,3,14,10220,23.90%,18,16147.60121,16118.24,13750.0,2397.6,0.0,Sep-13,468.95,Aug-13
3,875270,1089782,8000,8000,8000.0,36 months,6.99%,246.99,A,A3,University of Akron,10+ years,MORTGAGE,45000.0,Not Verified,Sep-11,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 09/06/11 > new roof<br/>null,home_improvement,roof,443xx,OH,4.32,0,Sep-92,1,8,4574,22.10%,24,8891.25,8891.25,8000.0,891.25,0.0,Sep-14,251.71,Sep-14
4,682411,871617,15250,15250,15250.0,60 months,13.43%,350.36,C,C3,,10+ years,RENT,40000.0,Not Verified,Mar-11,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 03/14/11 > Loan will be us...,credit_card,Business Loan,916xx,CA,14.97,0,Sep-89,1,5,6963,35.70%,18,21020.79004,21020.79,15250.0,5770.79,0.0,Apr-16,349.55,Mar-16


In [4]:
class base():
    """Minimal interface shared by the feature blocks in this notebook.

    Subclasses implement ``transform``; ``fit`` defaults to calling
    ``transform`` on the same frame, which suits blocks with no learned state.
    """

    def fit(self, input_df):
        """Fit on ``input_df`` and return its transformed features."""
        return self.transform(input_df)

    def transform(self, input_df):
        """Return features for ``input_df``; must be overridden by subclasses.

        Raises
        ------
        NotImplementedError
            Always, in this base class.
        """
        raise NotImplementedError

In [5]:
class preprocess_features(base):
    """Convert the string-encoded ``term``, ``int_rate`` and ``revol_util``
    columns to numbers.

    ``transform`` returns a converted copy and leaves the caller's frame
    untouched. Columns that are already numeric are passed through unchanged,
    so applying the block twice is safe.
    """

    @staticmethod
    def _strip_and_cast(series, token, dtype):
        """Remove the literal ``token`` from each string and cast to ``dtype``."""
        if pd.api.types.is_numeric_dtype(series):
            # Already converted: the `.str` accessor would raise on numbers.
            return series
        # regex=False: `token` is a literal, not a pattern.
        return series.str.replace(token, '', regex=False).astype(dtype)

    def transform_term(self, input_df):
        """Return a one-column frame ``converted_term`` (int) built from ``term``."""
        output_df = pd.DataFrame()
        output_df['converted_term'] = self._strip_and_cast(input_df['term'], 'months', int)
        return output_df

    def transform_int_rate(self, input_df):
        """Return a one-column frame ``converted_int_rate`` (float) built from ``int_rate``."""
        output_df = pd.DataFrame()
        output_df['converted_int_rate'] = self._strip_and_cast(input_df['int_rate'], '%', float)
        return output_df

    def transform_revol_util(self, input_df):
        """Return a one-column frame ``converted_revol_util`` (float) built from ``revol_util``."""
        output_df = pd.DataFrame()
        output_df['converted_revol_util'] = self._strip_and_cast(input_df['revol_util'], '%', float)
        return output_df

    def transform(self, input_df):
        """Return a copy of ``input_df`` with the three columns converted in place of the originals."""
        # Work on a copy so the frame passed in by the caller is not mutated.
        output_df = input_df.copy()
        # Assign the Series (not the one-column helper frame) to each column.
        output_df["term"] = self.transform_term(input_df)["converted_term"]
        output_df["int_rate"] = self.transform_int_rate(input_df)["converted_int_rate"]
        output_df["revol_util"] = self.transform_revol_util(input_df)["converted_revol_util"]
        return output_df

# Apply the string-to-number preprocessing to both splits.
# `fit` is inherited from `base` and simply delegates to `transform`.
# NOTE(review): X_train / X_test are rebound to the returned frames, so
# re-running this cell feeds already-converted columns back into the block.
preprocess = preprocess_features()
X_train = preprocess.fit(X_train)
X_test = preprocess.transform(X_test)
X_train.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,open_acc,revol_bal,revol_util,total_acc,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,recoveries,last_pymnt_d,last_pymnt_amnt,last_credit_pull_d
0,482059,613124,4200,4200,4075.0,36,7.51,130.66,A,A4,Pentegra Retirement Services,2 years,RENT,40000.0,Not Verified,Feb-10,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 02/02/10 > I could easily ...,major_purchase,Full Return plus Interest,105xx,NY,15.57,0,Sep-03,2,6,1801,62.1,12,4703.910242,4563.91,4200.0,503.91,0.0,Feb-13,148.47,Sep-13
1,600497,770719,8000,8000,7996.782656,36,5.42,241.28,A,A1,Kendall College/Laureate University,1 year,MORTGAGE,53000.0,Not Verified,Oct-10,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 10/21/10 > I will set up p...,credit_card,Debt FREE,605xx,IL,8.85,0,Oct-98,0,15,10327,20.1,37,8687.009702,8683.26,8000.0,687.01,0.0,Nov-13,263.43,Nov-13
2,565437,727473,13750,13750,13725.0,36,10.75,448.54,B,B2,resurrection healthcare,4 years,MORTGAGE,70000.0,Verified,Aug-10,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 08/16/10 > This loan will ...,debt_consolidation,debt consalidation,600xx,IL,3.69,0,Aug-04,3,14,10220,23.9,18,16147.60121,16118.24,13750.0,2397.6,0.0,Sep-13,468.95,Aug-13
3,875270,1089782,8000,8000,8000.0,36,6.99,246.99,A,A3,University of Akron,10+ years,MORTGAGE,45000.0,Not Verified,Sep-11,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 09/06/11 > new roof<br/>null,home_improvement,roof,443xx,OH,4.32,0,Sep-92,1,8,4574,22.1,24,8891.25,8891.25,8000.0,891.25,0.0,Sep-14,251.71,Sep-14
4,682411,871617,15250,15250,15250.0,60,13.43,350.36,C,C3,,10+ years,RENT,40000.0,Not Verified,Mar-11,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 03/14/11 > Loan will be us...,credit_card,Business Loan,916xx,CA,14.97,0,Sep-89,1,5,6963,35.7,18,21020.79004,21020.79,15250.0,5770.79,0.0,Apr-16,349.55,Mar-16


In [6]:
import category_encoders as ce

class ordinal_encoding_based_on_frequency_encoding(base):
    """Ordinal-encode one categorical column by how often each value occurs.

    Categories are ranked by their count in the frame given to ``fit`` (in
    ascending order), so the rarest category receives code 0 and the most
    frequent one the largest code. The result is a one-column frame named
    ``ordinal_<col>``.
    """

    def __init__(self, col):
        # Name of the column to encode.
        self.col = col

    def fit(self, input_df):
        """Learn the frequency ranking from ``input_df`` and return its encoding."""
        categories_by_rarity = input_df[self.col].value_counts().sort_values().index
        rank_of = dict(zip(categories_by_rarity, range(len(categories_by_rarity))))
        self.encoder_ = ce.OrdinalEncoder(
            mapping=[{"col": self.col, "mapping": rank_of}]
        )
        encoded = self.encoder_.fit_transform(input_df[self.col])
        return encoded.add_prefix("ordinal_")

    def transform(self, input_df):
        """Encode ``input_df[self.col]`` with the ranking learned in ``fit``."""
        encoded = self.encoder_.transform(input_df[self.col])
        return encoded.add_prefix("ordinal_")

# Smoke-test the encoder on `grade`: learn the ranking on the training split,
# then display the encoded test split.
# NOTE: the name `test` is rebound by later demo cells in this notebook.
test = ordinal_encoding_based_on_frequency_encoding("grade")
test.fit(X_train)
test.transform(X_test)

Unnamed: 0,ordinal_grade
0,4
1,4
2,4
3,6
4,5
...,...
7711,5
7712,5
7713,4
7714,3


In [67]:
# Turn a free-text column into a meta-feature via stacked (out-of-fold) prediction.

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import Binarizer
from scipy.sparse import hstack
from sklearn.linear_model import ElasticNet,LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score

class text_out_of_fold_prediction(base):
  """Meta-feature from a text column via out-of-fold (stacked) prediction.

  The text is turned into binarized bag-of-words indicators (at most 100
  vocabulary terms). ``fit`` trains one linear model per CV fold and returns,
  for every training row, the prediction of the model that did not see that
  row. ``transform`` returns the average prediction of all fold models.

  Both methods return a one-column frame named ``predicted_<col>`` that
  carries the index of the input frame.
  """

  def __init__(self, col):
    self.nfolds = 5          # number of CV folds / fold models
    self.seed = 71           # seed for the fold split and the models
    self.col = col           # name of the text column
    self.models = []         # fitted fold models, filled by fit()
    self.regression = False  # True: ElasticNet + MSE, False: LogisticRegression + AUC
    self.scores = []         # per-fold validation scores, filled by fit()

  def fit(self, input_df, y):
    """Fit the vectorizer and fold models; return out-of-fold predictions.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing ``self.col``.
    y : array-like
        Target aligned row-by-row (by position) with ``input_df``.
    """
    # Start from a clean slate: re-fitting must not accumulate the models and
    # scores of a previous fit.
    self.models = []
    self.scores = []

    x = input_df[self.col].fillna("nan")
    # Alternative that was tried: TfidfVectorizer(max_features=100)
    self.vectorizer_ = CountVectorizer(max_features=100)
    x = self.vectorizer_.fit_transform(x)
    self.binerizer_ = Binarizer()
    x = self.binerizer_.fit_transform(x)

    # Fold indices are positional, so index the target by position as well;
    # a plain array makes this independent of y's index labels.
    y_values = np.asarray(y)

    if self.regression:
      fold = KFold(n_splits=self.nfolds, random_state=self.seed, shuffle=True)
    else:
      fold = StratifiedKFold(n_splits=self.nfolds, random_state=self.seed, shuffle=True)

    oof_train = np.zeros(len(input_df))
    # stacked prediction
    for i, (train_idx, valid_idx) in enumerate(fold.split(input_df[self.col], y_values)):
      train_x, train_y = x[train_idx], y_values[train_idx]
      valid_x, valid_y = x[valid_idx], y_values[valid_idx]

      if self.regression:
        clf = ElasticNet(random_state=self.seed)
        clf.fit(train_x, train_y)
        pred_y = clf.predict(valid_x)
        score = mean_squared_error(valid_y, pred_y)
      else:
        clf = LogisticRegression(penalty='elasticnet',solver='saga',random_state=self.seed, max_iter=10000, C=1.0, l1_ratio=0.5, class_weight='balanced')
        clf.fit(train_x, train_y)
        pred_y = clf.predict_proba(valid_x)[:,1]
        score = roc_auc_score(valid_y, pred_y)
      print(f'CV Score of Fold_{i} is {score}')
      self.models.append(clf)
      self.scores.append(score)
      oof_train[valid_idx] = pred_y

    print(f"mean score is {np.mean(self.scores)}")
    # Keep the caller's index so the result lines up when concatenated
    # column-wise with other feature frames.
    output_df = pd.DataFrame({self.col: oof_train}, index=input_df.index)
    return output_df.add_prefix("predicted_")

  def transform(self, input_df):
    """Return the mean prediction of all fold models for ``input_df``."""
    x = input_df[self.col].fillna("nan")
    x = self.vectorizer_.transform(x)
    x = self.binerizer_.transform(x)

    oof_test = np.zeros(len(input_df))
    for clf in self.models:
      if self.regression:
        oof_test += clf.predict(x)
      else:
        oof_test += clf.predict_proba(x)[:,1]
    # Average over the models actually held, not over the configured fold
    # count, so the two can never drift apart.
    oof_test /= len(self.models)

    output_df = pd.DataFrame({self.col: oof_test}, index=input_df.index)
    return output_df.add_prefix("predicted_")

# Smoke-test the stacked text feature on `desc`: fit on the training split
# (prints per-fold and mean CV scores), then display predictions for the test split.
test = text_out_of_fold_prediction("desc")
test.fit(X_train, y_train)
test.transform(X_test)

CV Score of Fold_0 is 0.5863281706425587
CV Score of Fold_1 is 0.5765773842736108
CV Score of Fold_2 is 0.5616805194975052
CV Score of Fold_3 is 0.5680024183475456
CV Score of Fold_4 is 0.5740310946672104
mean score is 0.5733239174856861


Unnamed: 0,predicted_desc
0,0.500638
1,0.706604
2,0.473696
3,0.411770
4,0.295234
...,...
7711,0.496723
7712,0.539978
7713,0.496723
7714,0.620091


In [75]:
# Summarize the importance (model coefficient) and count of each word in the text column.
class text_check_insight():
    """Inspect which words of a text column drive a linear model.

    Calling the instance fits a bag-of-words model on the whole frame and
    returns one row per vocabulary term with its count and coefficient,
    sorted by coefficient in ascending order.
    """

    def __init__(self, col):
        self.col = col            # name of the text column
        self.regression =False    # True: ElasticNet, False: LogisticRegression
        self.seed = 1             # random_state passed to the model

    def create_words_summary(self, clf, x):
        """Build the per-word summary frame.

        Parameters
        ----------
        clf : fitted estimator exposing ``coef_``
        x : matrix of shape (n_rows, n_vocabulary_terms) that ``clf`` was fit on

        Returns
        -------
        pandas.DataFrame
            Columns ``features``, ``wordcounts`` (column sums of ``x``) and
            ``coefs``, sorted by ``coefs``.
        """
        # `get_feature_names` was removed in scikit-learn 1.2; prefer the
        # replacement and fall back only on releases that predate it.
        if hasattr(self.vectorizer_, "get_feature_names_out"):
            feature_names = self.vectorizer_.get_feature_names_out()
        else:
            feature_names = self.vectorizer_.get_feature_names()

        output_df = pd.DataFrame()
        output_df["features"] = feature_names
        # When called from __call__, `x` has been binarized, so each column sum
        # is the number of rows containing the word, not total occurrences.
        output_df["wordcounts"] = (np.array(x.sum(axis=0)).flatten())
        output_df["coefs"] = clf.coef_.flatten()
        return output_df.sort_values(by="coefs").reset_index(drop=True)

    def __call__(self, input_df, y):
        """Fit the vectorizer and model on ``input_df[self.col]`` / ``y`` and return the word summary."""
        x = input_df[self.col].fillna("nan")
        self.vectorizer_ = CountVectorizer(max_features=100)
        x = self.vectorizer_.fit_transform(x)
        self.binerizer_ = Binarizer()
        x = self.binerizer_.fit_transform(x)

        if self.regression:
            clf = ElasticNet(random_state=self.seed)
        else:
            clf = LogisticRegression(penalty='elasticnet',solver='saga',random_state=self.seed, max_iter=10000, C=1.0, l1_ratio=0.5, class_weight='balanced')
        clf.fit(x, y)
        return self.create_words_summary(clf, x)
    
# Display the per-word count/coefficient summary for `desc` on the training split.
test = text_check_insight("desc")
test(X_train, y_train)

Unnamed: 0,features,wordcounts,coefs
0,than,2556,-0.307758
1,rate,3315,-0.281728
2,card,5604,-0.211295
3,on,18096,-0.200690
4,credit,11082,-0.179713
...,...,...,...
95,one,3718,0.249953
96,thank,3789,0.284044
97,borrower,15674,0.296310
98,bills,2979,0.379548


In [None]:
# Feature blocks consumed by run_blocks: one frequency-ranked ordinal encoder
# for each of the listed categorical columns.
# NOTE(review): the outer `[*[...]]` unpacking presumably leaves room for
# further block lists to be appended — confirm intent.
feature_blocks = [*[ordinal_encoding_based_on_frequency_encoding(col) for col in
                    ["grade", "sub_grade"]]]

def run_blocks(input_df,blocks, test=False):
    """Assemble the model feature frame from raw columns plus feature blocks.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing the pass-through columns and whatever each block reads.
    blocks : iterable
        Objects exposing ``fit(input_df)`` and ``transform(input_df)``, each
        returning a frame with one row per row of ``input_df``.
    test : bool, default False
        ``False`` calls ``block.fit`` (training data); ``True`` calls
        ``block.transform`` (data for already-fitted blocks).

    Returns
    -------
    pandas.DataFrame
        The pass-through columns followed by every block's output, joined
        column-wise on the index.

    Raises
    ------
    AssertionError
        If a block returns a different number of rows than ``input_df``.
    """
    use_original_columns = ["loan_amnt","funded_amnt","funded_amnt_inv","term",
                            "int_rate","installment"]

    # Gather every piece first and concatenate once; growing the frame with
    # pd.concat inside the loop re-copies all accumulated columns each time.
    pieces = [input_df[use_original_columns]]
    for block in blocks:
        if test:
            output_i = block.transform(input_df)
        else:
            output_i = block.fit(input_df)
        assert len(input_df) == len(output_i), block
        pieces.append(output_i)
    return pd.concat(pieces, axis=1)


# Fit every block on the training split (test=False) and display the combined feature frame.
run_blocks(X_train, feature_blocks)
