In [None]:
import pandas as pd
import sweetviz as sv
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split

# Raise the pandas display limit to 100 columns so wide frames are shown
# without column truncation when a DataFrame is rendered.
pd.set_option('display.max_columns', 100)

def read_csv(path="loan.csv"):
    """Load the loan data set from a CSV file.

    Parameters
    ----------
    path : str or path-like, default "loan.csv"
        Location of the CSV file. The default keeps the original behaviour of
        reading ``loan.csv`` from the current working directory.

    Returns
    -------
    pandas.DataFrame
        The file contents, with the ``issue_d`` column parsed as dates.
    """
    # parse_dates asks pandas to convert issue_d to datetime while reading.
    return pd.read_csv(path, parse_dates=['issue_d'])

def make_train_test(df, test_size=0.2, random_state=71):
    """Build a binary-labelled train/test split from the raw loan frame.

    Only loans whose ``loan_status`` is "Fully Paid" (label 0) or
    "Charged Off" (label 1) are kept. Every other status (for example
    "Current") is dropped, so the target never contains NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw loans; must contain a ``loan_status`` column. Not modified.
    test_size : float, default 0.2
        Fraction of rows placed in the test split.
    random_state : int or None, default 71
        Seed passed to ``train_test_split`` so the split is reproducible
        across kernel restarts. Pass ``None`` for an unseeded split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``, each with a fresh 0..n-1 index.
    """
    label_map = {"Fully Paid": 0, "Charged Off": 1}

    # Keep only rows that have a label; mapping an unknown status would give NaN.
    finished = df[df['loan_status'].isin(list(label_map))].reset_index(drop=True)
    y = finished['loan_status'].map(label_map)
    X = finished.drop('loan_status', axis=1)

    parts = train_test_split(X, y, test_size=test_size, random_state=random_state)
    # Reset without inplace so the objects returned by the splitter are not mutated.
    X_train, X_test, y_train, y_test = (part.reset_index(drop=True) for part in parts)
    return X_train, X_test, y_train, y_test
    

# Load the raw loans and build the train/test split used by the following cells.
df = read_csv()
X_train, X_test, y_train, y_test = make_train_test(df)

# Optional data profiling with sweetviz (left disabled):
# my_report = sv.analyze(X_train)
# my_report.show_html() # Default arguments will generate to "SWEETVIZ_REPORT.html"

# Preview the first rows of the training features.
X_train.head()

In [None]:
class Base:
    """Common interface for the feature blocks in this notebook.

    Subclasses must provide ``transform``; the default ``fit`` simply
    delegates to it, which suits blocks that learn nothing from the data.
    """

    def fit(self, input_df):
        """Return the result of ``transform`` applied to ``input_df``."""
        transformed = self.transform(input_df)
        return transformed

    def transform(self, input_df):
        """Produce features from ``input_df``; must be overridden."""
        raise NotImplementedError()

In [None]:
class DeleteUninformableColumn(Base):
    """Drop columns that are mostly missing or dominated by a single value.

    The columns to delete are decided in ``fit`` and stored in
    ``deleted_columns`` so that ``transform`` removes exactly the same
    columns from any other frame (e.g. the test split).

    Parameters
    ----------
    missing_ratio : float, default 0.5
        A column is dropped when fewer than this fraction of its rows are
        non-null (i.e. more than 50% missing with the default).
    dominant_ratio : float, default 0.9
        A column is dropped when its most frequent value covers more than
        this fraction of all rows.
    """

    def __init__(self, missing_ratio=0.5, dominant_ratio=0.9):
        self.missing_ratio = missing_ratio
        self.dominant_ratio = dominant_ratio
        self.deleted_columns = []

    def fit(self, input_df):
        """Decide which columns to delete from ``input_df`` and return it without them."""
        # Thresholds are relative to the frame being fitted, not to a notebook global.
        n_rows = len(input_df)
        to_delete = []
        for col in input_df.columns:
            values = input_df[col]

            # Too many missing values: not enough non-null entries.
            if values.count() < n_rows * self.missing_ratio:
                to_delete.append(col)
                continue

            # Near-constant column: the top value covers almost every row.
            # value_counts() is empty for an all-null column, hence the guard.
            counts = values.value_counts()
            if len(counts) > 0 and counts.iloc[0] > n_rows * self.dominant_ratio:
                to_delete.append(col)

        self.deleted_columns = to_delete
        print(f"delete columns: {self.deleted_columns}")
        return self.transform(input_df)

    def transform(self, input_df):
        """Return a copy of ``input_df`` without the columns chosen during ``fit``."""
        output_df = input_df.drop(columns=list(self.deleted_columns))
        return output_df
    
# Decide the columns to delete on the training split only, then remove the
# same columns from the test split.
delete_colums = DeleteUninformableColumn()
X_train = delete_colums.fit(X_train)
X_test = delete_colums.transform(X_test)
    



In [None]:
class PreprocessFeatures(Base):
    """Convert string-encoded numeric columns (``term``, ``int_rate``, ``revol_util``) to numbers.

    ``transform`` returns a new frame and leaves the caller's frame untouched.
    Columns that are already numeric are passed through, so re-running the
    cell on already-converted data does not raise.
    """

    @staticmethod
    def _to_number(series, token, dtype):
        """Remove ``token`` from a string column and cast it to ``dtype``."""
        if pd.api.types.is_numeric_dtype(series):
            # Already converted (e.g. the cell was re-run): only enforce the dtype.
            return series.astype(dtype)
        # regex=False: the token is a literal, not a pattern.
        return series.str.replace(token, '', regex=False).astype(dtype)

    def transform_term(self, input_df):
        """Return a one-column frame ``converted_term`` with ``term`` as int ('months' removed)."""
        return self._to_number(input_df['term'], 'months', int).to_frame('converted_term')

    def transform_int_rate(self, input_df):
        """Return a one-column frame ``converted_int_rate`` with ``int_rate`` as float ('%' removed)."""
        return self._to_number(input_df['int_rate'], '%', float).to_frame('converted_int_rate')

    def transform_revol_util(self, input_df):
        """Return a one-column frame ``converted_revol_util`` with ``revol_util`` as float ('%' removed)."""
        return self._to_number(input_df['revol_util'], '%', float).to_frame('converted_revol_util')

    def transform(self, input_df):
        """Return a copy of ``input_df`` whose three columns hold the converted numeric values."""
        # Work on a copy so the frame passed in by the caller is not mutated.
        output_df = input_df.copy()
        output_df["term"] = self.transform_term(input_df)["converted_term"]
        output_df["int_rate"] = self.transform_int_rate(input_df)["converted_int_rate"]
        output_df["revol_util"] = self.transform_revol_util(input_df)["converted_revol_util"]
        return output_df

# Apply the string-to-number conversions to both splits.
# PreprocessFeatures does not override fit, so fit() just calls transform().
preprocess = PreprocessFeatures()
X_train = preprocess.fit(X_train)
X_test = preprocess.transform(X_test)
# Preview the converted training features.
X_train.head()

In [None]:
import category_encoders as ce

class ordinal_encoding_based_on_frequency_encoding(Base):
    """Ordinal-encode one categorical column, ranking categories by frequency.

    Categories are ordered by ascending count in the fitted data, so the
    rarest category receives code 0 and the most frequent the largest code.
    """

    def __init__(self, col):
        self.col = col

    def fit(self, input_df):
        """Learn the frequency ranking from ``input_df`` and return the encoded column."""
        ranked_values = input_df[self.col].value_counts().sort_values().index
        value_to_rank = dict(zip(ranked_values, range(len(ranked_values))))
        self.encoder_ = ce.OrdinalEncoder(
            mapping=[{"col": self.col, "mapping": value_to_rank}]
        )
        encoded = self.encoder_.fit_transform(input_df[self.col])
        return encoded.add_prefix("ordinal_")

    def transform(self, input_df):
        """Encode ``input_df[col]`` with the ranking learnt in ``fit``."""
        encoded = self.encoder_.transform(input_df[self.col])
        return encoded.add_prefix("ordinal_")

# test = ordinal_encoding_based_on_frequency_encoding("grade")
# test.fit(X_train)
# test.transform(X_test)

In [None]:
# Turn a text column into a meta-feature via stacked (out-of-fold) prediction.
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import Binarizer
from scipy.sparse import hstack
from sklearn.linear_model import ElasticNet,LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score

class text_out_of_fold_prediction(Base):
    """Stacked (out-of-fold) prediction feature built from one text column.

    The text is turned into binary bag-of-words indicators (at most 100
    terms). ``fit`` trains one linear model per cross-validation fold and
    returns, for every training row, the prediction of the model that did
    not see that row. ``transform`` averages the predictions of all fold
    models.

    Attributes set in ``__init__``: ``nfolds`` (5), ``seed`` (71),
    ``regression`` (False: classification with ROC-AUC scoring; True:
    ElasticNet regression with MSE scoring), ``models`` and ``scores``
    (filled by ``fit``).
    """

    def __init__(self, col):
        self.nfolds = 5
        self.seed = 71
        self.col = col
        self.models = []
        self.regression = False
        self.scores = []

    def _make_model(self):
        """Create a fresh, unfitted model for one fold."""
        if self.regression:
            return ElasticNet(random_state=self.seed)
        return LogisticRegression(penalty='elasticnet', solver='saga', random_state=self.seed,
                                  max_iter=10000, C=1.0, l1_ratio=0.5, class_weight='balanced')

    def _predict(self, clf, x):
        """Predicted value (regression) or positive-class probability (classification)."""
        if self.regression:
            return clf.predict(x)
        return clf.predict_proba(x)[:, 1]

    def fit(self, input_df, y):
        """Train the fold models and return out-of-fold predictions for ``input_df``.

        Parameters
        ----------
        input_df : pandas.DataFrame
            Frame containing the text column ``self.col``.
        y : array-like
            Target aligned row-by-row (by position) with ``input_df``.

        Returns
        -------
        pandas.DataFrame
            One column ``predicted_<col>``, indexed like ``input_df``.
        """
        text = input_df[self.col].fillna("nan")
        # self.vectorizer_ = TfidfVectorizer(max_features=100)
        self.vectorizer_ = CountVectorizer(max_features=100)
        x = self.vectorizer_.fit_transform(text)
        self.binerizer_ = Binarizer()
        x = self.binerizer_.fit_transform(x)

        # The splitter yields positional indices, so index y by position;
        # label-based y.loc[...] is only correct for a 0..n-1 index.
        y_values = np.asarray(y)

        # Start from a clean slate so a second fit does not keep stale models.
        self.models = []
        self.scores = []

        if self.regression:
            fold = KFold(n_splits=self.nfolds, random_state=self.seed, shuffle=True)
            metric = mean_squared_error
        else:
            fold = StratifiedKFold(n_splits=self.nfolds, random_state=self.seed, shuffle=True)
            metric = roc_auc_score

        oof_train = np.zeros(len(input_df))
        # stacked prediction
        for i, (train_idx, valid_idx) in enumerate(fold.split(text, y_values)):
            clf = self._make_model()
            clf.fit(x[train_idx], y_values[train_idx])
            pred_y = self._predict(clf, x[valid_idx])
            score = metric(y_values[valid_idx], pred_y)
            print(f'CV Score of Fold_{i} is {score}')
            self.models.append(clf)
            self.scores.append(score)
            oof_train[valid_idx] = pred_y

        print(f"mean score is {np.mean(self.scores)}")
        # Keep the input index so a later column-wise concat aligns row by row.
        return pd.DataFrame({f"predicted_{self.col}": oof_train}, index=input_df.index)

    def transform(self, input_df):
        """Return the average prediction of all fold models for ``input_df``.

        Returns a one-column frame ``predicted_<col>`` indexed like ``input_df``.
        """
        x = input_df[self.col].fillna("nan")
        x = self.vectorizer_.transform(x)
        x = self.binerizer_.transform(x)

        oof_test = np.zeros(len(input_df))
        for clf in self.models:
            oof_test += self._predict(clf, x)
        # Average over the models actually summed rather than the configured fold count.
        oof_test /= len(self.models)
        return pd.DataFrame({f"predicted_{self.col}": oof_test}, index=input_df.index)

# Smoke-test the out-of-fold text block on the "desc" column:
# fit on the training split, then predict for the test split.
test = text_out_of_fold_prediction("desc")
test.fit(X_train, y_train)
test.transform(X_test)

In [None]:
# Summarize per-word importance (model coefficients) and word counts for a text column.
class text_check_insight:
    """Fit one linear model on binary bag-of-words features and report each word.

    Calling an instance returns a table with, per vocabulary word, its
    name, how many rows contain it and its model coefficient.
    """

    def __init__(self, col):
        self.col = col
        self.regression = False
        self.seed = 1

    def create_words_summary(self, clf, x):
        """Build the per-word table, sorted by ascending coefficient.

        ``clf`` must expose ``coef_`` and ``x`` is the binarized document-term
        matrix produced by ``self.vectorizer_``.
        """
        summary = pd.DataFrame(
            {
                "features": self.vectorizer_.get_feature_names_out(),
                # column sums of the binary matrix = number of rows containing the word
                "wordcounts": np.asarray(x.sum(axis=0)).ravel(),
                "coefs": clf.coef_.ravel(),
            }
        )
        return summary.sort_values(by="coefs").reset_index(drop=True)

    def __call__(self, input_df, y):
        """Vectorize ``input_df[col]`` (up to 1000 words), fit the model on ``y``, return the summary."""
        self.vectorizer_ = CountVectorizer(max_features=1000)
        self.binerizer_ = Binarizer()
        documents = input_df[self.col].fillna("nan")
        x = self.binerizer_.fit_transform(self.vectorizer_.fit_transform(documents))

        if not self.regression:
            clf = LogisticRegression(penalty='elasticnet', solver='saga', random_state=self.seed,
                                     max_iter=10000, C=1.0, l1_ratio=0.5, class_weight='balanced')
        else:
            clf = ElasticNet(random_state=self.seed)
        clf.fit(x, y)
        return self.create_words_summary(clf, x)

    
# Build the per-word summary for the "desc" column on the training split.
# Note: this rebinds the name `test` used by the previous cell.
test = text_check_insight("desc")
df_word_summary = test(X_train, y_train)
df_word_summary

In [None]:
# Display the raw loan frame returned by read_csv().
df

In [None]:
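# Visualize the words with a word cloud.
# It may be worth applying a threshold so that only words above a certain frequency are shown.
# Stop-word removal should also be considered in the future.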
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Map each word to its count; this dictionary drives the word sizes in the cloud.
wordcounts = dict(zip(df_word_summary['features'], df_word_summary['wordcounts']))

# Define a color function
def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Colour a word by the sign and size of its model coefficient.

    Looks the word up in the notebook-level ``df_word_summary`` table.
    Positive coefficients use hue 0, non-positive ones hue 200; the
    lightness grows with the coefficient magnitude, starting at 50%.

    Only ``word`` is used; the remaining parameters are accepted because
    the function is passed to ``WordCloud`` as its ``color_func``.

    Returns
    -------
    str
        An ``"hsl(h, 70%, l%)"`` colour string with ``l`` in 50..100.
    """
    coef = df_word_summary.loc[df_word_summary['features'] == word, 'coefs'].values[0]
    hue = 0 if coef > 0 else 200
    # 50 + |coef| * 100 exceeds 100% once |coef| > 0.5, which is not a valid
    # HSL lightness, so cap it at 100.
    lightness = min(100, int(50 + abs(coef) * 100))  # adjust these numbers to change color
    return "hsl(%d, 70%%, %d%%)" % (hue, lightness)

# Build the cloud: word size comes from `wordcounts`, colour from color_func.
# prefer_horizontal=1.0 is passed so words are laid out horizontally.
wordcloud = WordCloud(width=800, height=400, color_func=color_func,
                      prefer_horizontal=1.0).generate_from_frequencies(wordcounts)

# Render the cloud as an image without axes.
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# making all features
class RunBlocks(Base):
    """Assemble the full feature matrix from raw columns and feature blocks.

    The output is the column-wise concatenation of:
    the pass-through columns in ``use_original_values``, one ordinal
    frequency encoding per categorical column, and one out-of-fold text
    prediction per text column.
    """

    def __init__(self):
        categorical_columns = ["grade", "sub_grade", "emp_length", "home_ownership",
                               "verification_status", "purpose", "zip_code", "addr_state"]
        text_columns = ["desc", "title"]
        self.feature_blocks = [
            ordinal_encoding_based_on_frequency_encoding(name) for name in categorical_columns
        ]
        self.stacked_predict_feature = [
            text_out_of_fold_prediction(name) for name in text_columns
        ]
        self.use_original_values = ["loan_amnt", "funded_amnt", "funded_amnt_inv", "term",
                                    "int_rate", "installment", "annual_inc"]

    @staticmethod
    def _checked(block, produced, input_df):
        """Return ``produced`` after asserting it has one row per input row."""
        assert len(input_df) == len(produced), block
        return produced

    def fit(self, input_df, df_y):
        """Fit every block on ``input_df`` (text blocks also get ``df_y``) and return the features."""
        parts = [input_df[self.use_original_values]]

        for block in self.feature_blocks:
            parts.append(self._checked(block, block.fit(input_df), input_df))

        for block in self.stacked_predict_feature:
            parts.append(self._checked(block, block.fit(input_df, df_y), input_df))

        return pd.concat(parts, axis=1)

    def transform(self, input_df):
        """Apply the already-fitted blocks to ``input_df`` and return the features."""
        parts = [input_df[self.use_original_values]]

        for block in self.feature_blocks + self.stacked_predict_feature:
            parts.append(self._checked(block, block.transform(input_df), input_df))

        return pd.concat(parts, axis=1)
        
# Build the final feature matrices: fit all blocks on the training split,
# then apply the fitted blocks to the test split.
run_blocks = RunBlocks()
df_train = run_blocks.fit(X_train, y_train)
df_test = run_blocks.transform(X_test)
# Display the assembled training features.
df_train



モデリング

In [None]:
import lightgbm as lgb