In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Circle, RegularPolygon
from matplotlib.path import Path
from matplotlib.projections.polar import PolarAxes
from matplotlib.projections import register_projection
from matplotlib.spines import Spine
from matplotlib.transforms import Affine2D
import plotly.graph_objs as go
import plotly.offline as py
import plotly.express as px
from sklearn.decomposition import PCA
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier
# Matplotlib setup for rendering Korean (Hangul) text.
import os
import warnings
import matplotlib as mpl
import matplotlib.font_manager as fm
# Render negative tick labels with the ASCII hyphen so that the minus sign
# does not depend on the Unicode minus glyph being present in the font.
mpl.rcParams['axes.unicode_minus'] = False
path = 'C:/Windows/Fonts/malgun.ttf'
if os.path.exists(path):
    font_name = fm.FontProperties(fname=path, size=50).get_name()
    plt.rc('font', family=font_name)
else:
    # The font file only exists at this location on some machines; keep the
    # notebook importable elsewhere instead of failing in the setup cell.
    font_name = None
    warnings.warn(
        "Font file %r not found; keeping Matplotlib's default font "
        "(Korean text may not render correctly)." % path
    )

In [None]:
def hit_pit_div(df_train, df_test):
    """Split the training frame into four hitter/pitcher feature tables.

    Rows are partitioned by ``mvp`` (1 vs. 0) and by ``player_IP``
    (``== 0`` is treated as a hitter, ``!= 0`` as a pitcher). Each subset
    gets a fresh 0..n-1 index and loses the columns belonging to the other
    role, plus ``player_name`` and ``mvp``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain every column named in the drop lists below.
    df_test : object
        Accepted for signature compatibility; not used here.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_hit_mvp, df_hit_all, df_pit_mvp, df_pit_all)``.
    """
    pitching_cols = [
        'player_ERA', 'player_GP', 'player_CG', 'player_SHO',
        'player_W', 'player_L', 'player_SV', 'player_HLD', 'player_WPCT',
        'player_TBF', 'player_IP', 'player_HP', 'player_HRP', 'player_BBP',
        'player_HBPP', 'player_SOP', 'player_RP', 'player_ER',
    ]
    batting_cols = [
        'player_AVG', 'player_G', 'player_PA', 'player_AB',
        'player_RH', 'player_HH', 'player_2B', 'player_3B', 'player_HRH',
        'player_TB', 'player_RBI', 'player_SB', 'player_CS', 'player_BBH',
        'player_HBPH', 'player_SOH', 'player_GDP', 'player_SLG', 'player_OBP',
        'player_E',
    ]
    # Hitter tables additionally lose the doubles/triples columns.
    hitter_unwanted = pitching_cols + ['player_name', 'mvp', 'player_2B', 'player_3B']
    pitcher_unwanted = batting_cols + ['mvp', 'player_name']

    won_mvp = df_train.mvp == 1
    no_mvp = df_train.mvp == 0
    hitter = df_train.player_IP == 0
    pitcher = df_train.player_IP != 0

    def _subset(mask, unwanted):
        # Filter, renumber the rows, then strip the irrelevant columns.
        selected = df_train[mask].reset_index(drop=True)
        return selected.drop(unwanted, axis=1)

    df_hit_mvp = _subset(won_mvp & hitter, hitter_unwanted)
    df_hit_all = _subset(no_mvp & hitter, hitter_unwanted)
    df_pit_mvp = _subset(won_mvp & pitcher, pitcher_unwanted)
    df_pit_all = _subset(no_mvp & pitcher, pitcher_unwanted)
    return df_hit_mvp, df_hit_all, df_pit_mvp, df_pit_all

In [None]:
def radar_factory(num_vars, frame='circle'):
    """Register a 'radar' Matplotlib projection with ``num_vars`` spokes.

    Parameters
    ----------
    num_vars : int
        Number of variables (spokes) on the chart.
    frame : {'circle', 'polygon'}
        Shape of the frame drawn around the axes.

    Returns
    -------
    numpy.ndarray
        ``num_vars`` evenly spaced angles in radians, starting at 0 and
        excluding 2*pi.

    Notes
    -----
    Every call defines a new ``RadarAxes`` class and registers it under the
    same projection name, 'radar'. An unsupported ``frame`` is only reported
    (``ValueError``) when an axes using the projection is built, because the
    check lives in the patch/spine generators.
    """
    theta = np.linspace(0, 2*np.pi, num_vars, endpoint=False)

    class RadarAxes(PolarAxes):
        name = 'radar'
        # NOTE(review): taken from the Matplotlib radar-chart example, where it
        # is meant to connect points with a single straight segment -- confirm
        # it is still honoured by the installed Matplotlib version.
        RESOLUTION = 1

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # Put the first spoke at the top of the chart.
            self.set_theta_zero_location('N')

        def fill(self, *args, closed=True, **kwargs):
            """Fill a polygon; the outline is closed by default."""
            return super().fill(closed=closed, *args, **kwargs)

        def plot(self, *args, **kwargs):
            """Plot lines, closing each one back to its first point.

            Returns the list of lines, like the ``plot`` it overrides.
            """
            lines = super().plot(*args, **kwargs)
            for line in lines:
                self._close_line(line)
            return lines

        def _close_line(self, line):
            x, y = line.get_data()
            # Nothing to close on an empty line; otherwise append the first
            # point unless the line already ends where it started.
            if len(x) > 0 and x[0] != x[-1]:
                x = np.append(x, x[0])
                y = np.append(y, y[0])
                line.set_data(x, y)

        def set_varlabels(self, labels):
            """Label each spoke with the corresponding entry of ``labels``."""
            self.set_thetagrids(np.degrees(theta), labels)

        def _gen_axes_patch(self):
            # The patch is built in axes coordinates: centred at (0.5, 0.5)
            # with radius 0.5.
            if frame == 'circle':
                return Circle((0.5, 0.5), 0.5)
            elif frame == 'polygon':
                return RegularPolygon((0.5, 0.5), num_vars, radius=.5, edgecolor="k")
            else:
                raise ValueError("Unknown value for 'frame': %s" % frame)

        def _gen_axes_spines(self):
            if frame == 'circle':
                return super()._gen_axes_spines()
            elif frame == 'polygon':
                # spine_type must be 'left'/'right'/'top'/'bottom'/'circle'.
                spine = Spine(axes=self,
                              spine_type='circle',
                              path=Path.unit_regular_polygon(num_vars))
                # Scale and shift the unit polygon so it matches the axes
                # patch generated above (centre (0.5, 0.5), radius 0.5).
                spine.set_transform(Affine2D().scale(.5).translate(.5, .5)
                                    + self.transAxes)
                return {'polar': spine}
            else:
                raise ValueError("Unknown value for 'frame': %s" % frame)

    register_projection(RadarAxes)
    return theta

In [None]:
def example_data(df_train, df_test, random_state=None):
    """Assemble the inputs for the hitter and pitcher radar panels.

    Parameters
    ----------
    df_train, df_test
        Forwarded unchanged to ``hit_pit_div``.
    random_state : optional
        Passed to every ``DataFrame.sample`` call so the sampled rows can be
        reproduced. The default ``None`` keeps the previous behaviour of
        drawing a different random row on each call.

    Returns
    -------
    list
        ``[hitter_columns, pitcher_columns, hitter_entry, pitcher_entry]``
        where the first two items are lists of column names and each entry
        is ``(title, [mvp_mean, all_mean, mvp_sample_row, all_sample_row])``.
    """
    df_hit_mvp, df_hit_all, df_pit_mvp, df_pit_all = hit_pit_div(df_train, df_test)

    def _series(mvp_frame, all_frame):
        # Column means of both groups followed by one randomly drawn row
        # from each group.
        return [
            mvp_frame.mean().tolist(),
            all_frame.mean().tolist(),
            mvp_frame.sample(random_state=random_state).values.tolist()[0],
            all_frame.sample(random_state=random_state).values.tolist()[0],
        ]

    data = [
        list(df_hit_mvp),   # iterating a DataFrame yields its column names
        list(df_pit_mvp),
        ('Hitter MVP VS ALL', _series(df_hit_mvp, df_hit_all)),
        ('Pitcher MVP VS ALL', _series(df_pit_mvp, df_pit_all)),
    ]
    return data

In [None]:
def Components(df_train, df_test):
    """Display the ten largest MVP-minus-rest mean gaps for each role.

    The frames come from ``hit_pit_div``. For hitters and for pitchers the
    per-column mean of the non-MVP table is subtracted from that of the MVP
    table, the differences are sorted in descending order and the first ten
    are kept. Both rankings are shown side by side; nothing is returned.
    """
    hit_mvp, hit_rest, pit_mvp, pit_rest = hit_pit_div(df_train, df_test)

    def _largest_gaps(mvp_frame, rest_frame):
        gap = mvp_frame.mean() - rest_frame.mean()
        ranked = gap.sort_values(ascending=False)
        return pd.DataFrame(ranked.iloc[:10]).reset_index()

    side_by_side = pd.concat(
        [_largest_gaps(hit_mvp, hit_rest), _largest_gaps(pit_mvp, pit_rest)],
        axis=1,
    )
    display(side_by_side)

In [None]:
def EDA_P(df_train, df_test):
    """Draw hitter and pitcher radar charts comparing MVPs with the rest.

    Each panel shows four series produced by ``example_data``: the first two
    are drawn as filled polygons and the last two as outlines. After the
    figure is shown, ``Components`` is called to display the ranking table.

    Raises
    ------
    ValueError
        If the hitter and pitcher panels have a different number of
        variables (both panels use the same registered 'radar' projection).
    """
    data = example_data(df_train, df_test)
    # example_data returns the two label lists first, then one
    # (title, series) entry per panel.
    spoke_labels = [data.pop(0), data.pop(0)]

    # Take the number of spokes from the data rather than hard-coding it, so
    # the chart keeps working when the feature set changes.
    num_vars = len(spoke_labels[0])
    if len(spoke_labels[1]) != num_vars:
        raise ValueError(
            "Hitter and pitcher panels need the same number of variables, "
            "got %d and %d" % (num_vars, len(spoke_labels[1]))
        )
    theta = radar_factory(num_vars, frame='polygon')

    fig, axs = plt.subplots(figsize=(21, 21), nrows=1, ncols=2,
                            subplot_kw=dict(projection='radar'))
    fig.subplots_adjust(wspace=0.25, hspace=0.02, top=0.85, bottom=0.05)
    colors = ['#EEA36E', '#D96459', '#8C4646', '#F6D394']
    labels = ('MVP', 'ALL', 'MVP_SAMPLE', 'ALL_SAMPLE')

    for ax, (title, case_data), var_labels in zip(axs.flat, data, spoke_labels):
        ax.set_rgrids([0.2, 0.4, 0.6, 0.8])
        ax.set_title(title, weight='bold', size='medium', position=(0.5, 1.1),
                     horizontalalignment='center', verticalalignment='center')
        for series_idx, (d, color, label) in enumerate(zip(case_data, colors, labels)):
            # Attach the label to the artist itself so the legend cannot pair
            # a name with the wrong series, whatever order Matplotlib lists
            # lines and patches in.
            if series_idx < 2:
                ax.fill(theta, d, facecolor=color, alpha=0.9, label=label)
            else:
                ax.plot(theta, d, color=color, linewidth=3, label=label)
        ax.set_varlabels(var_labels)

    # Build the legend from the labelled artists, in the order of `labels`.
    handles, names = axs[1].get_legend_handles_labels()
    handle_by_name = dict(zip(names, handles))
    shown = [name for name in labels if name in handle_by_name]
    axs[1].legend([handle_by_name[name] for name in shown], shown,
                  loc=(1, .95), labelspacing=0.1, fontsize='small')

    fig.text(0.5, 0.695, 'KBO MVP VS ALL',
             horizontalalignment='center', color='black', weight='bold',
             size='large')
    plt.show()
    Components(df_train, df_test)

In [145]:
def PCA_P(df_train, df_test):
    """Show a 2-component PCA biplot and top loadings for hitters and pitchers.

    For each role the MVP and non-MVP tables from ``hit_pit_div`` are
    stacked, projected onto two principal components and drawn as a Plotly
    scatter coloured by the ``mvp`` flag, with one loading vector per
    feature. Afterwards the three features with the largest absolute weight
    on each component are displayed side by side. Nothing is returned.
    """
    df_hit_mvp, df_hit_all, df_pit_mvp, df_pit_all = hit_pit_div(df_train, df_test)
    # hit_pit_div removes the 'mvp' column; put it back using the same
    # convention it filters on (1 = MVP, 0 = everyone else).
    df_hit_mvp['mvp'] = 1
    df_hit_all['mvp'] = 0
    df_pit_mvp['mvp'] = 1
    df_pit_all['mvp'] = 0
    # ignore_index gives each stacked frame a unique 0..n-1 index, so the
    # 'mvp' Series lines up row by row with the PCA output.
    df_hit = pd.concat([df_hit_mvp, df_hit_all], ignore_index=True)
    df_pit = pd.concat([df_pit_mvp, df_pit_all], ignore_index=True)

    for df in [df_hit, df_pit]:
        # The label is used for colouring only; it must not be part of the
        # decomposition input.
        features = [col for col in df.columns if col != 'mvp']
        X = df[features]

        pca = PCA(n_components=2)
        components = pca.fit_transform(X)
        # Scale each component's weights by the square root of its explained
        # variance to obtain the loading vectors drawn on the plot.
        loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
        fig = px.scatter(components, x=0, y=1, color=df['mvp'])

        for i, feature in enumerate(features):
            fig.add_shape(type='line', x0=0, y0=0, x1=loadings[i, 0], y1=loadings[i, 1])
            fig.add_annotation(x=loadings[i, 0], y=loadings[i, 1], ax=0, ay=0,
                               xanchor="center", yanchor="bottom", text=feature)
        fig.show()

        # Rows = features, columns 0 and 1 = absolute weight on each component.
        weights = pd.DataFrame(np.abs(pca.components_), columns=features).T
        top_first = weights.sort_values(0, ascending=False)[:3].drop(1, axis=1).reset_index()
        top_second = weights.sort_values(1, ascending=False)[:3].drop(0, axis=1).reset_index()
        display(pd.concat([top_first, top_second], axis=1))

In [146]:
def MVP_P(df_train, df_test, random_state=None):
    """Fit a random forest on ``df_train`` and show the top-5 test candidates.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain ``player_name`` and the target column ``mvp``; every
        other column is used as a feature.
    df_test : pandas.DataFrame
        Must contain ``player_name``. An ``mvp`` column is tolerated and
        ignored. The remaining columns are passed to the model as features.
    random_state : optional
        Forwarded to ``RandomForestClassifier``. The default ``None`` keeps
        the previous behaviour of an unseeded forest.

    The five rows with the highest predicted probability (column 1 of
    ``predict_proba``) are drawn as a bar chart and displayed as a table.
    Nothing is returned.
    """
    y = df_train['mvp']
    x = df_train.drop(['player_name', 'mvp'], axis=1)
    # The test frame may or may not still carry the target; drop it when
    # present so the feature set matches what the model was trained on.
    x1 = df_test.drop(['player_name'], axis=1).drop(columns=['mvp'], errors='ignore')

    model = RandomForestClassifier(random_state=random_state).fit(x, y)
    # Column 1 is the second entry of model.classes_.
    mvp_probability = model.predict_proba(x1)[:, 1]

    df_mvp = df_test.copy()
    df_mvp['per'] = mvp_probability
    df_mvp_2 = df_mvp[['player_name', 'per']].sort_values('per', ascending=False)[:5]
    plt.bar(df_mvp_2['player_name'], df_mvp_2['per'], color='#EEA36E')
    display(df_mvp_2)