# 重要度算出するやつ

In [4]:
import datetime
import os

import numpy as np
import pandas as pd
from minepy import MINE
from scipy.stats import chi2_contingency
from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor

In [1]:
class calculate_importance():
    """Compute importances and correlations of features against target columns.

    For every target column, four measures are computed per feature: the
    RandomForestRegressor importance, the ExtraTreesRegressor importance,
    the MIC score (minepy) and the Pearson correlation coefficient.

    Attributes:
        all_rf_importance (pandas.DataFrame): random-forest importances
            (rows: features, columns: targets).
        all_etr_importance (pandas.DataFrame): extra-trees importances
            (rows: features, columns: targets).
        all_mic (pandas.DataFrame): MIC scores (rows: features, columns: targets).
        all_corr (pandas.DataFrame): correlation coefficients with NaN
            replaced by 0 (rows: features, columns: targets).
    """

    def __init__(self, variables, purpose):
        '''
        Compute all measures for every column of ``purpose``.

        Args:
            variables (pandas.DataFrame): dataframe holding the explanatory variables
            purpose (pandas.DataFrame): dataframe holding the target variables
        '''
        # Work on copies so the caller's frames are never touched.
        X = variables.copy()
        y = purpose.copy()

        # Keep the feature names to label every result vector.
        feature_x = X.columns

        rf_columns, etr_columns, mic_columns, corr_columns = [], [], [], []
        for column in y.columns:
            print(column)
            rf_importance = self.__calculate_rf_importance(X, y[column])
            print(" rf")
            etr_importance = self.__calculate_etr_importance(X, y[column])
            print(" etr")
            mic_list = self.__mic_calculation(X, y[column])
            print(" mic")
            corr_list = self.__corr_calculation(X, y[column])
            print(" corr")

            rf_columns.append(pd.Series(rf_importance, index=feature_x, name=column))
            etr_columns.append(pd.Series(etr_importance, index=feature_x, name=column))
            mic_columns.append(pd.Series(mic_list, index=feature_x, name=column))
            corr_columns.append(pd.Series(corr_list, index=feature_x, name=column))

        # Concatenate once instead of growing a frame inside the loop.
        self.all_rf_importance = self.__combine(rf_columns)
        self.all_etr_importance = self.__combine(etr_columns)
        self.all_mic = self.__combine(mic_columns)
        # The correlation is NaN for constant columns; those are stored as 0.
        self.all_corr = self.__combine(corr_columns).fillna(0)

    @staticmethod
    def __combine(series_list):
        '''Join per-target Series column-wise; an empty list gives an empty frame.'''
        if not series_list:
            return pd.DataFrame()
        return pd.concat(series_list, axis=1, sort=False)

    def __calculate_rf_importance(self, X, y):
        '''
        Fit a RandomForestRegressor and return its feature importances.

        Args:
            X (pandas.DataFrame): dataframe holding the explanatory variables
            y (pandas.Series): Series holding the target variable
        Return:
            The fitted model's ``feature_importances_``, one value per column of X.
        '''
        reg_rf = RandomForestRegressor(random_state=0, n_estimators=100)
        reg_rf.fit(X=X, y=y)
        return reg_rf.feature_importances_

    def __calculate_etr_importance(self, X, y):
        '''
        Fit an ExtraTreesRegressor and return its feature importances.

        Args:
            X (pandas.DataFrame): dataframe holding the explanatory variables
            y (pandas.Series): Series holding the target variable
        Return:
            The fitted model's ``feature_importances_``, one value per column of X.
        '''
        reg_etr = ExtraTreesRegressor(random_state=0, n_estimators=100)
        reg_etr.fit(X=X, y=y)
        return reg_etr.feature_importances_

    def __mic_calculation(self, X, y):
        '''
        Compute the MIC score (non-linear correlation) of every feature against y.

        Args:
            X (pandas.DataFrame): dataframe holding the explanatory variables
            y (pandas.Series): Series holding the target variable
        Return:
            numpy.ndarray with one MIC value per column of X.
        '''
        mine = MINE()
        scores = []
        for column in X.columns:
            mine.compute_score(y, X[column])
            scores.append(mine.mic())
        return np.array(scores)

    def __corr_calculation(self, X, y):
        '''
        Compute the correlation coefficient of every feature against y.

        Args:
            X (pandas.DataFrame): dataframe holding the explanatory variables
            y (pandas.Series): Series holding the target variable
        Return:
            numpy.ndarray with one coefficient per column of X (NaN where
            numpy cannot compute it, e.g. for a constant column).
        '''
        # Silence the divide/invalid warnings only for this computation
        # instead of changing numpy's global error state.
        with np.errstate(divide='ignore', invalid='ignore'):
            return np.array([np.corrcoef(y, X[column])[0, 1] for column in X.columns])

    def rank_feature(self, output_name):
        '''
        Rank the features by a score built from the correlations and importances.

        For each target, the absolute values of corr / mic / rf / etr are
        min-max scaled per measure and summed; the features are then ranked
        by that sum (rank 1 = highest sum). One sheet per target is written
        to ``../output/<YYYYmmddHH>_<output_name>``.

        Args:
            output_name (String): file name of the Excel workbook to save
        '''
        measures = ["corr", "mic", "rf", "etr"]
        targets = self.all_corr.columns
        if len(targets) == 0:
            # Nothing to write; a workbook without sheets cannot be saved.
            return

        stamp = datetime.datetime.today().strftime('%Y%m%d%H')
        # The context manager saves and closes the workbook exactly once.
        with pd.ExcelWriter(f"../output/{stamp}_{output_name}") as writer:
            for column in targets:
                print(column)
                rank = pd.DataFrame(
                    {
                        "corr": self.all_corr[column],
                        "mic": self.all_mic[column],
                        "rf": self.all_rf_importance[column],
                        "etr": self.all_etr_importance[column],
                    },
                    index=self.all_corr.index,
                )[measures].abs()
                mm = preprocessing.MinMaxScaler()
                rank = pd.DataFrame(mm.fit_transform(rank), index=rank.index, columns=rank.columns)
                # skipna=False keeps NaN propagation identical to a plain "+" chain.
                rank["score"] = rank[measures].sum(axis=1, skipna=False).rank(ascending=False)
                rank.to_excel(writer, sheet_name=str(column))

# クロス集計とかカイ二乗検定するやつ

In [21]:
class cvx():
    """Cross-tabulate binned features against targets and run chi-square tests.

    Every column of ``variables`` is sorted and cut into ``split_num`` bins
    of (almost) equal size, labelled "カテゴリ0", "カテゴリ1", ... . For each
    bin the target is aggregated, and a chi-square statistic is computed on
    the resulting contingency table.

    ``variables`` and ``purpose`` are joined through their index labels, so
    both frames are expected to share the same index.
    """

    def __init__(self, variables, purpose, split_num=5):
        """Store the inputs.

        Args:
            variables (pandas.DataFrame): explanatory variables.
            purpose (pandas.DataFrame): target variables.
            split_num (int): number of bins each feature is cut into.
        """
        self.variables = variables
        self.purpose = purpose
        self.split_num = split_num

    def calculate_x2(self, purpose, variables, purpose_type):
        """Build one cross-tab / chi-square table per selected target column.

        Args:
            purpose (pandas.DataFrame): only its column names are used, to
                select the targets; the values come from ``self.purpose``.
            variables: unused, kept for interface compatibility.
            purpose_type (str): substring a target column name must contain.

        Returns:
            list of pandas.DataFrame, one per selected target column. Rows are
            features; columns are the bin aggregates, "平均", "分散",
            "データ分類" and "x2".
        """
        purpose_columns = [column for column in purpose.columns if purpose_type in column]
        df_list = []
        for p_column in purpose_columns:
            all_x2_df = self.__x2_calculate(p_column)
            all_cross_df = self.__crosstab(p_column)
            df_list.append(pd.concat([all_cross_df, all_x2_df], axis=1, sort=False))
        return df_list

    def __category_labels(self):
        """Return the bin labels "カテゴリ0" ... "カテゴリ{split_num-1}"."""
        return [f"カテゴリ{i}" for i in range(self.split_num)]

    def __unique_count(self, p_column):
        """Return the number of unique target values; fewer than two is an error.

        Raises:
            Purpose_num_error: if the target column has fewer than 2 unique values.
        """
        p_num = len(self.purpose[p_column].unique())
        if p_num < 2:
            raise Purpose_num_error(p_column, p_num)
        return p_num

    @staticmethod
    def __share_of_ones(values):
        """Return the fraction of entries equal to 1 (0.0 for an empty bin)."""
        if len(values) == 0:
            return 0.0
        return (values == 1).sum() / len(values)

    def __crosstab(self, p_column):
        """Aggregate the target per feature bin.

        A target with exactly two unique values is treated as binary and the
        share of rows equal to 1 is reported per bin; otherwise the target is
        summed per bin.

        Returns:
            pandas.DataFrame indexed by feature name with one column per bin,
            plus "平均" (row mean), "分散" (row variance) and "データ分類".
        """
        p_num = self.__unique_count(p_column)
        labels = self.__category_labels()
        target = self.purpose[p_column]

        rows = []
        for v_column in self.variables.columns:
            # Cut this feature into split_num bins of sorted values.
            bins = self.__split_dataframe(self.variables, v_column, self.split_num)
            if p_num == 2:  # binary target
                rows.append([self.__share_of_ones(target.loc[chunk.index]) for chunk in bins])
            else:  # target treated as continuous
                rows.append([target.loc[chunk.index].sum() for chunk in bins])

        all_cross_df = pd.DataFrame(rows, index=list(self.variables.columns), columns=labels)
        all_cross_df["平均"] = all_cross_df[labels].mean(axis=1)
        all_cross_df["分散"] = all_cross_df[labels].var(axis=1)
        all_cross_df = self.__date_5clastter(all_cross_df)
        return all_cross_df

    def __split_dataframe(self, data, column, n):
        """Sort ``data[column]`` in ascending order and cut it into n pieces.

        Returns:
            list of n pandas.Series (pieces of the sorted column, original
            index labels preserved).
        """
        ordered = data[column].sort_values()
        positions = np.array_split(np.arange(len(ordered)), n)
        return [ordered.iloc[chunk] for chunk in positions]

    def __date_5clastter(self, df):
        """Label the shape of each row's bin profile in a "データ分類" column.

        The mean of the inner bins is compared with the first and last bin:
        above both -> "山型", below both -> "谷型"; otherwise the sign of
        (last - first) gives "右肩上がり" / "右肩下がり", and "特徴なし" is
        used when none of these apply. ``df`` is modified in place and returned.
        """
        labels = self.__category_labels()
        first = df[labels[0]]
        last = df[labels[-1]]
        inner_mean = df[labels[1:-1]].mean(axis=1)
        diff = last - first

        conditions = [
            ((inner_mean > first) & (inner_mean > last)).values,
            ((inner_mean < first) & (inner_mean < last)).values,
            (diff > 0).values,
            (diff < 0).values,
        ]
        choices = ["山型", "谷型", "右肩上がり", "右肩下がり"]
        # Conditions are evaluated in order; the first match wins.
        df["データ分類"] = np.select(conditions, choices, default="特徴なし")
        return df

    def __x2_calculate(self, p_column):
        """Compute the chi-square statistic of every feature against a target.

        Returns:
            pandas.Series named "x2", indexed by feature name. For a binary
            target the value is NaN when the test cannot be computed.

        Raises:
            ValueError: for a non-binary target whose table is rejected by
                ``chi2_contingency`` (the table is printed first).
        """
        p_num = self.__unique_count(p_column)
        labels = self.__category_labels()
        target = self.purpose[p_column]

        x2_list = []
        for v_column in self.variables.columns:
            bins = self.__split_dataframe(self.variables, v_column, self.split_num)
            if p_num == 2:
                counts = [
                    target.loc[chunk.index].value_counts().rename(f"{v_column}_カテゴリ{i}")
                    for i, chunk in enumerate(bins)
                ]
                # Rows 0 and 1 are always present so a missing class counts as 0.
                x2_df = pd.concat([pd.DataFrame(index=[0, 1])] + counts, axis=1, sort=False)
                x2_df = x2_df.fillna(0).T
                try:
                    x2, _, _, _ = chi2_contingency(x2_df.values)
                except ValueError:
                    x2 = np.nan
                x2_list.append(x2)
            else:  # target treated as continuous
                x2_posi_list = [target.loc[chunk.index].sum() for chunk in bins]
                # NOTE(review): 128*2/5 is an unexplained constant taken over
                # as-is; the /5 may be meant to be split_num - confirm.
                x2_neg_list = [(len(self.purpose)*128*2/5) - value for value in x2_posi_list]
                x2_df = pd.DataFrame([x2_posi_list, x2_neg_list], columns=labels, index=[1, 0])
                x2_df = x2_df.fillna(0).T
                try:
                    x2, _, _, _ = chi2_contingency(x2_df.values)
                except ValueError:
                    # Show the offending table, then let the error propagate.
                    print(x2_df)
                    raise
                x2_list.append(x2)
        return pd.Series(x2_list, name="x2", index=self.variables.columns)

    def __rank_importance(self, df, column):
        """Grade rows from 5 (best) to 1 by their position when sorted by ``column``.

        Rows are sorted in descending order; the top 10% get 5, the next 10%
        get 4, then 3, then 2, and everything below the top 40% gets 1.
        ``df`` itself is not modified.

        Returns:
            pandas.Series named "rank" (float), indexed like the sorted frame.
        """
        ordered = df.sort_values(column, ascending=False)
        fraction = np.arange(1, len(ordered) + 1) / len(ordered)
        grades = np.select(
            [fraction <= 0.1, fraction <= 0.2, fraction <= 0.3, fraction <= 0.4],
            [5, 4, 3, 2],
            default=1,
        )
        return pd.Series(grades, index=ordered.index, name="rank", dtype=float)
                    

In [199]:
import re

In [3]:
def read_excel(dir_name, file_name, sheet_name = 0):
    """Load the newest timestamped version of an Excel file.

    Looks in ``./<dir_name>`` for files named ``<10 digits>_<file_name>``,
    picks the one with the largest 10-digit prefix and reads it with
    ``pandas.read_excel``.

    Args:
        dir_name (str): directory, relative to the current working directory.
        file_name (str): file name without the 10-digit version prefix.
        sheet_name: forwarded to ``pandas.read_excel``.

    Returns:
        Whatever ``pandas.read_excel`` returns for the selected file.

    Raises:
        ValueError: if no file with a 10-digit prefix matches ``file_name``.
    """
    directory = f'.{os.path.sep}{dir_name}'
    # Match on the bare file name so digits in the directory path cannot be
    # mistaken for the version stamp; escape file_name so it is taken literally.
    versioned = re.compile(r'([0-9]{10})_' + re.escape(file_name))
    entries = os.listdir(directory) if os.path.isdir(directory) else []
    matches = (versioned.fullmatch(entry) for entry in entries)
    file_list = [match.group(1) for match in matches if match]
    if not file_list:
        raise ValueError(f'no file matching "<10 digits>_{file_name}" found in {directory}')

    # Fixed-width digit strings sort lexicographically in numeric order.
    file_ver = max(file_list)
    print(f'read:{file_ver}_{file_name}')
    data = pd.read_excel(f'{directory}{os.path.sep}{file_ver}_{file_name}', sheet_name = sheet_name)
    return data

# オリジナルエラークラス

## 目的変数のユニーク数が１のときエラーを出力

In [19]:
class Purpose_num_error(IOError):
    """Error carrying a target column name and its number of unique values.

    The base-class initializer is not called; the instance only stores the
    two attributes below, which ``__str__`` formats into the message.

    Attributes:
        p_column: name of the target column the error refers to.
        errorNum: number of unique values reported for that column.
    """

    def __init__(self, column, p_num):
        self.p_column, self.errorNum = column, p_num

    def __str__(self):
        # Message reads: "the number of unique values of target <column> is <n>".
        message = f'目的変数{self.p_column}のユニーク数が{self.errorNum}です'
        return message

In [53]:
# Demo: raise the custom error and print the message it formats.
try:
    demo_error = Purpose_num_error("nanka", 1)
    raise demo_error
except Purpose_num_error as caught:
    print(caught)

目的変数nankaのユニーク数が1です


In [7]:
# NOTE(review): the recorded output for this cell is a NameError, so it was
# apparently run in a session where Purpose_num_error was not yet defined.
raise Purpose_num_error("nanka",1)

NameError: name 'Purpose_num_error' is not defined