In [1]:
import pandas as pd
import sweetviz as sv
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split

# increase column number display in pandas
pd.set_option('display.max_columns', 100)

def read_csv():
    # read csv and parse dates column to datetime
    df = pd.read_csv("loan.csv", parse_dates=['issue_d'])

    return df

def make_train_test(df):
    # split data to train and test
    sorted_df = df[df['loan_status'] != 'Current'].reset_index(drop=True)
    sorted_df["loan_status"] = sorted_df["loan_status"].map({"Fully Paid": 0, "Charged Off": 1})
    X = sorted_df.drop('loan_status', axis=1)
    y = sorted_df['loan_status']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    X_train.reset_index(drop=True,inplace=True)
    X_test.reset_index(drop=True,inplace=True)
    y_train.reset_index(drop=True, inplace=True)
    y_test.reset_index(drop=True, inplace=True)
    return X_train, X_test, y_train, y_test
    

df = read_csv()
X_train, X_test, y_train, y_test = make_train_test(df)

# # check data by sweetviz
# my_report = sv.analyze(X_train)
# my_report.show_html() # Default arguments will generate to "SWEETVIZ_REPORT.html"

X_train.head()



Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,...,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,1004841,1231304,35000,35000,34972.8295,60 months,14.27%,819.3,C,C2,,5 years,MORTGAGE,272000.0,Verified,Nov-11,n,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 11/04/11 > I own a profita...,debt_consolidation,debt_consolidation,068xx,CT,8.54,0,Nov-92,2,,,11,0,9968,49.30%,35,f,0.0,0.0,48716.42994,48663.78,35000.0,13716.43,0.0,0.0,0.0,Jan-16,8570.73,,Feb-16,0.0,,...,,,,,,,,,,,,,,,,,,0.0,0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,
1,471360,595085,11500,11500,11500.0,36 months,7.05%,355.35,A,A1,"Tyson Foods, Inc.",1 year,RENT,74500.0,Not Verified,Jan-10,n,https://lendingclub.com/browse/loanDetail.acti...,,car,Vehicle,727xx,AR,16.38,0,Jan-98,1,25.0,,15,0,8813,67.60%,40,f,0.0,0.0,12580.19992,12580.2,11500.0,1080.2,0.0,0.0,0.0,Nov-11,5124.43,,Dec-11,0.0,,...,,,,,,,,,,,,,,,,,,0.0,0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,
2,687534,877555,12700,12700,12676.7042,60 months,10.74%,274.49,B,B4,,2 years,MORTGAGE,52000.0,Verified,Mar-11,n,https://lendingclub.com/browse/loanDetail.acti...,,small_business,Secure Property I,321xx,FL,1.87,0,Aug-93,1,,,10,0,3054,4.50%,25,f,0.0,0.0,8134.09,8095.21,5088.7,2592.26,0.0,453.13,4.5313,Jul-13,274.49,,Dec-13,0.0,,...,,,,,,,,,,,,,,,,,,0.0,0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,
3,403806,449570,3000,3000,3000.0,36 months,9.63%,96.29,A,A5,metlife,2 years,RENT,45000.0,Not Verified,May-09,n,https://lendingclub.com/browse/loanDetail.acti...,looking to buy my car off the lease for aroun...,car,car and personal loan,110xx,NY,5.36,0,Feb-03,2,,,4,0,10222,75.30%,6,f,0.0,0.0,3466.141038,3466.14,3000.0,466.14,0.0,0.0,0.0,Jun-12,103.01,,Jun-15,0.0,,...,,,,,,,,,,,,,,,,,,0.0,0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,
4,1068744,1303379,7000,7000,7000.0,36 months,12.69%,234.82,B,B5,Bellflower convalecent hospital,10+ years,RENT,35000.0,Source Verified,Dec-11,n,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/14/11 > Pay my credit<br>,credit_card,I pay my all credit,902xx,CA,11.35,0,May-00,2,53.0,,10,0,7649,82.30%,26,f,0.0,0.0,8453.260001,8453.26,7000.0,1453.26,0.0,0.0,0.0,Jan-15,248.19,,Nov-15,0.0,,...,,,,,,,,,,,,,,,,,,0.0,0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,


In [2]:
class Base():    
    def fit(self, input_df):
        return self.transform(input_df)
        
    def transform(self, input_df):
        raise NotImplementedError

In [3]:
class DeleteUninformableColumn(Base):
    def __init__(self):
        self.deleted_columns = []
        
    def fit(self, input_df):
        # delete columns which has more than 50% missing values
        origin_columns = set(input_df.columns)
        deleted_df = input_df.dropna(thresh=len(df)/2, axis=1)
        
        # check columns which has same values of 90% data and delete it.
        for col in deleted_df.columns:
            if deleted_df[col].value_counts().iloc[0] > len(deleted_df)*0.9:
                output_df = deleted_df.drop(col, axis=1)
        self.deleted_columns=output_df.columns
        return Base.transform(input_df)
    
    def transform(self, input_df):
        output_df = input_df.drop(self.deleted_columns, axis=1)
        return output_df
    



In [4]:
class PreprocessFeatures(Base):
    def transform_term(self, input_df):
        output_df = pd.DataFrame()
        output_df['converted_term'] = input_df['term'].str.replace('months', '').astype(int)
        return output_df

    def transform_int_rate(self, input_df):
        output_df = pd.DataFrame()
        output_df['converted_int_rate'] = input_df['int_rate'].str.replace('%', '').astype(float)
        return output_df
    
    def transform_revol_util(self, input_df):
        output_df = pd.DataFrame()
        output_df['converted_revol_util'] = input_df['revol_util'].str.replace('%', '').astype(float)
        return output_df
    
    def transform(self, input_df):
        input_df["term"] = self.transform_term(input_df)
        input_df["int_rate"] = self.transform_int_rate(input_df)
        input_df["revol_util"] = self.transform_revol_util(input_df)
        return input_df

preprocess = PreprocessFeatures()
X_train = preprocess.fit(X_train)
X_test = preprocess.transform(X_test)
X_train.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,...,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,1004841,1231304,35000,35000,34972.8295,60,14.27,819.3,C,C2,,5 years,MORTGAGE,272000.0,Verified,Nov-11,n,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 11/04/11 > I own a profita...,debt_consolidation,debt_consolidation,068xx,CT,8.54,0,Nov-92,2,,,11,0,9968,49.3,35,f,0.0,0.0,48716.42994,48663.78,35000.0,13716.43,0.0,0.0,0.0,Jan-16,8570.73,,Feb-16,0.0,,...,,,,,,,,,,,,,,,,,,0.0,0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,
1,471360,595085,11500,11500,11500.0,36,7.05,355.35,A,A1,"Tyson Foods, Inc.",1 year,RENT,74500.0,Not Verified,Jan-10,n,https://lendingclub.com/browse/loanDetail.acti...,,car,Vehicle,727xx,AR,16.38,0,Jan-98,1,25.0,,15,0,8813,67.6,40,f,0.0,0.0,12580.19992,12580.2,11500.0,1080.2,0.0,0.0,0.0,Nov-11,5124.43,,Dec-11,0.0,,...,,,,,,,,,,,,,,,,,,0.0,0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,
2,687534,877555,12700,12700,12676.7042,60,10.74,274.49,B,B4,,2 years,MORTGAGE,52000.0,Verified,Mar-11,n,https://lendingclub.com/browse/loanDetail.acti...,,small_business,Secure Property I,321xx,FL,1.87,0,Aug-93,1,,,10,0,3054,4.5,25,f,0.0,0.0,8134.09,8095.21,5088.7,2592.26,0.0,453.13,4.5313,Jul-13,274.49,,Dec-13,0.0,,...,,,,,,,,,,,,,,,,,,0.0,0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,
3,403806,449570,3000,3000,3000.0,36,9.63,96.29,A,A5,metlife,2 years,RENT,45000.0,Not Verified,May-09,n,https://lendingclub.com/browse/loanDetail.acti...,looking to buy my car off the lease for aroun...,car,car and personal loan,110xx,NY,5.36,0,Feb-03,2,,,4,0,10222,75.3,6,f,0.0,0.0,3466.141038,3466.14,3000.0,466.14,0.0,0.0,0.0,Jun-12,103.01,,Jun-15,0.0,,...,,,,,,,,,,,,,,,,,,0.0,0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,
4,1068744,1303379,7000,7000,7000.0,36,12.69,234.82,B,B5,Bellflower convalecent hospital,10+ years,RENT,35000.0,Source Verified,Dec-11,n,https://lendingclub.com/browse/loanDetail.acti...,Borrower added on 12/14/11 > Pay my credit<br>,credit_card,I pay my all credit,902xx,CA,11.35,0,May-00,2,53.0,,10,0,7649,82.3,26,f,0.0,0.0,8453.260001,8453.26,7000.0,1453.26,0.0,0.0,0.0,Jan-15,248.19,,Nov-15,0.0,,...,,,,,,,,,,,,,,,,,,0.0,0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,


In [5]:
import category_encoders as ce

class ordinal_encoding_based_on_frequency_encoding(Base):
    def __init__(self,col):
        self.col = col
    
    def fit(self, input_df):
        output_df = pd.DataFrame()
        value_counted = input_df[self.col].value_counts().sort_values()
        mapping_dict= {value:i for i,value in enumerate(value_counted.index)}
        mapping = [{"col":self.col,"mapping":mapping_dict}]
        self.encoder_ = ce.OrdinalEncoder(mapping=mapping)
        output_df = self.encoder_.fit_transform(input_df[self.col])
        return output_df.add_prefix(f"ordinal_")
    
    def transform(self, input_df):
        output_df = pd.DataFrame()
        output_df = self.encoder_.transform(input_df[self.col])
        return output_df.add_prefix(f"ordinal_")

# test = ordinal_encoding_based_on_frequency_encoding("grade")
# test.fit(X_train)
# test.transform(X_test)

In [6]:
# stacked predictionによりテキストをメタ特徴量化する。
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import Binarizer
from scipy.sparse import hstack
from sklearn.linear_model import ElasticNet,LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score

class text_out_of_fold_prediction(Base):
  def __init__(self, col):
    self.nfolds = 5
    self.seed = 71
    self.col = col
    self.models = []
    self.regression = False
    self.scores = []
    

    
  def fit(self, input_df, y):
    x = input_df[self.col].fillna("nan")
    # self.vectorizer_ = TfidfVectorizer(max_features=100)
    self.vectorizer_ = CountVectorizer(max_features=100)
    x = self.vectorizer_.fit_transform(x)
    self.binerizer_ = Binarizer()
    x = self.binerizer_.fit_transform(x)
    
    if self.regression:
      fold = KFold(n_splits=self.nfolds, random_state=self.seed, shuffle=True)
    else:
      fold = StratifiedKFold(n_splits=self.nfolds, random_state=self.seed, shuffle=True)
    oof_train = np.zeros(len(input_df))
    # stacked prediction
    for i, (train_idx, valid_idx) in enumerate(fold.split(input_df[self.col], y)):
      train_x, train_y = x[train_idx], y.loc[train_idx]
      valid_x, valid_y = x[valid_idx], y.loc[valid_idx]
      
      if self.regression:
        clf = ElasticNet(random_state=self.seed)
        clf.fit(train_x, train_y)
        pred_y = clf.predict(valid_x)
        score = mean_squared_error(valid_y, pred_y)
      else:
        clf = LogisticRegression(penalty='elasticnet',solver='saga',random_state=self.seed, max_iter=10000, C=1.0, l1_ratio=0.5, class_weight='balanced')
        clf.fit(train_x, train_y)
        pred_y = clf.predict_proba(valid_x)[:,1]
        score = roc_auc_score(valid_y, pred_y)  
      print(f'CV Score of Fold_{i} is {score}')
      self.models.append(clf)
      self.scores.append(score)
      oof_train[valid_idx]= pred_y
      
    print(f"mean score is {np.mean(self.scores)}")
    output_df = pd.DataFrame()
    output_df[self.col] = oof_train
    return output_df.add_prefix("predicted_")
        
  def transform(self, input_df):
    x = input_df[self.col].fillna("nan")
    x = self.vectorizer_.transform(x)
    x = self.binerizer_.transform(x)
    oof_test = np.zeros(len(input_df))
    if self.regression:
      for clf in self.models:
        oof_test += clf.predict(x)
    else:
      for clf in self.models:
        oof_test += clf.predict_proba(x)[:,1]
    oof_test /= self.nfolds
    output_df = pd.DataFrame()
    output_df[self.col] = oof_test
    return output_df.add_prefix("predicted_")

test = text_out_of_fold_prediction("desc")
test.fit(X_train, y_train)
test.transform(X_test)

CV Score of Fold_0 is 0.5817684809279406
CV Score of Fold_1 is 0.5767008600027248
CV Score of Fold_2 is 0.5745392195816912
CV Score of Fold_3 is 0.5574628592585736
CV Score of Fold_4 is 0.5889656971436025
mean score is 0.5758874233829065


Unnamed: 0,predicted_desc
0,0.642076
1,0.494648
2,0.787510
3,0.515681
4,0.494648
...,...
7711,0.494648
7712,0.540274
7713,0.344810
7714,0.430151


In [7]:
# テキストの重要堂やカウントを整理する
class text_check_insight():
    def __init__(self, col):
        self.col = col
        self.regression =False
        self.seed = 1
 
    def create_words_summary(self, clf, x):
        output_df = pd.DataFrame()
        output_df["features"] = self.vectorizer_.get_feature_names_out()
        output_df["wordcounts"] = (np.array(x.sum(axis=0)).flatten())
        output_df["coefs"] = clf.coef_.flatten()
        return output_df.sort_values(by="coefs").reset_index(drop=True)
    
    def __call__(self, input_df, y):
        x = input_df[self.col].fillna("nan")
        self.vectorizer_ = CountVectorizer(max_features=1000)
        x = self.vectorizer_.fit_transform(x)
        self.binerizer_ = Binarizer()
        x = self.binerizer_.fit_transform(x)
        
        if self.regression:
            clf = ElasticNet(random_state=self.seed)
        else:
            clf = LogisticRegression(penalty='elasticnet',solver='saga',random_state=self.seed, max_iter=10000, C=1.0, l1_ratio=0.5, class_weight='balanced')
        clf.fit(x, y)
        return self.create_words_summary(clf, x)

    
test = text_check_insight("desc")
df_word_summary = test(X_train, y_train)
df_word_summary

Unnamed: 0,features,wordcounts,coefs
0,toward,224,-1.394995
1,engagement,140,-1.380161
2,motorcycle,258,-1.099541
3,review,119,-1.054282
4,carry,122,-1.016636
...,...,...,...
995,cars,132,0.771123
996,today,154,0.860826
997,basis,113,0.895796
998,decent,124,0.971027


In [8]:
# ワードクラウドで可視化する。
# 閾値を設けてある一定の頻度のものだけ表示にしてもいいかも
# 今後Stopword除去も配慮する
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Create a dictionary where keys are words and values are corresponding wordcounts
wordcounts = dict(zip(df['features'], df['wordcounts']))

# Define a color function
def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    coef = df[df['features'] == word]['coefs'].values[0]
    if coef > 0:
        return "hsl(0, 70%%, %d%%)" % (50 + coef * 100) # adjust these numbers to change color
    else:
        return "hsl(200, 70%%, %d%%)" % (50 - coef * 100) # adjust these numbers to change color

wordcloud = WordCloud(width=800, height=400, color_func=color_func,
                      prefer_horizontal=1.0).generate_from_frequencies(wordcounts)

plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

KeyError: 'features'

In [None]:
# making all features
class RunBlocks(Base):
    def __init__(self):
        self.feature_blocks = [*[ordinal_encoding_based_on_frequency_encoding(col) for col in
                    ["grade", "sub_grade","emp_length","home_ownership","verification_status",
                     "purpose","zip_code","addr_state",]]]
        self.stacked_predict_feature = [*[text_out_of_fold_prediction(col) for col in ["desc","title"]]]
        self.use_original_values = ["loan_amnt","funded_amnt","funded_amnt_inv","term",
                                "int_rate","installment","annual_inc"]
        
    def fit(self,input_df, df_y):
        output_df = pd.DataFrame()
        output_df = input_df[self.use_original_values]
                
        for block in self.feature_blocks:
            output_i = block.fit(input_df)
            assert len(input_df) == len(output_i), block
            output_df = pd.concat([output_df,output_i],axis=1)
        
        for block in self.stacked_predict_feature:
            output_i = block.fit(input_df,df_y)
            assert len(input_df) == len(output_i), block
            output_df = pd.concat([output_df,output_i],axis=1)
        return output_df
    
    def transform(self,input_df):
        output_df = pd.DataFrame()
        output_df = input_df[self.use_original_values]
        
        for block in self.feature_blocks:
            output_i = block.transform(input_df)
            assert len(input_df) == len(output_i), block
            output_df = pd.concat([output_df,output_i],axis=1)
            
        for block in self.stacked_predict_feature:
            output_i = block.transform(input_df)
            assert len(input_df) == len(output_i), block
            output_df = pd.concat([output_df,output_i],axis=1)
        return output_df
        
run_blocks = RunBlocks()
run_blocks.fit(X_train, y_train)
run_blocks.transform(X_test)

