In [1]:
import pandas as pd
import numpy as np
import itertools

In [2]:
def check_base_info(input_path, filename, output_path, minimal=False):
    """Write a per-column summary CSV and an HTML profile report for one CSV file.

    Args:
        input_path: Prefix (typically a directory ending in a path separator)
            that is concatenated directly with ``filename + '.csv'``.
        filename: File name without the ``.csv`` extension.
        output_path: Prefix that is concatenated directly with the output
            file names ``<filename>base_info.csv`` and ``<filename>.html``.
        minimal: Forwarded to ``ProfileReport`` as its ``minimal`` argument.

    Returns:
        None. The results are written to disk.
    """
    df = pd.read_csv(input_path + filename + '.csv', engine='python')

    # Reuse the shared summary builder so the two functions cannot drift apart.
    base_info = check_missing(df)
    base_info.to_csv(output_path + filename + 'base_info.csv')

    # NOTE(review): ProfileReport is not imported in the cells visible here;
    # it must be brought into scope before this function is called, otherwise
    # this line raises NameError after the CSV above has already been written.
    df_report = ProfileReport(df, minimal=minimal)
    df_report.to_file(output_file=output_path + filename + '.html')

def check_missing(df):
    """Build a one-row-per-column overview of ``df``.

    The result is indexed by column name and holds the column name, dtype,
    number of null values and number of distinct values, followed by the
    transposed output of ``df.describe()`` (NaN for columns that
    ``describe()`` does not cover).

    Args:
        df: The DataFrame to summarise.

    Returns:
        A DataFrame with the columns ``col_names``, ``col_dtypes``,
        ``col_na_count``, ``col_unique`` plus the ``describe()`` statistics.
    """
    labels = ['col_names', 'col_dtypes', 'col_na_count', 'col_unique']
    per_column = [
        df.columns.to_series(),
        df.dtypes,
        df.isnull().sum(),
        df.nunique(),
    ]
    overview = pd.concat(per_column, axis=1, keys=labels, sort=False)

    # describe() is column-oriented; transpose it so it lines up row-wise.
    described = df.describe().T
    return pd.concat([overview, described], axis=1, sort=False)

# 创建比赛ID：枚举所有队伍两两对阵的可能（不考虑赛区）；男队ID范围为1000-1999，数据中实际出现的ID为1101-1467

In [3]:
# Input/output locations, relative to the current working directory.
# Plain string literals: there are no placeholders, so no f-prefix is needed.
input_path_mdata = './input/rawdata/MDataFiles_Stage1/'
input_path_mevents = './input/rawdata/'
output_path = './output/'

In [4]:
# Team IDs 1101..1467 and seasons 1985..2019 (both ranges inclusive).
# NOTE(review): the outputs recorded further down in this notebook show only
# seasons 2015-2019 (335805 rows) — confirm which season range is intended.
list_TeamID = list(range(1101, 1468))
list_Season = list(range(1985, 2020))

ID = []
Season = []
TeamID_a = []
TeamID_b = []

# combinations() yields each unordered pair exactly once with the smaller ID
# first, in the same order as filtering the full cross product on a < b.
for season in list_Season:
    for team_a, team_b in itertools.combinations(list_TeamID, 2):
        ID.append(f'{season}_{team_a}_{team_b}')
        Season.append(season)
        TeamID_a.append(team_a)
        TeamID_b.append(team_b)

# One row per (season, team_a, team_b) with team_a < team_b.
df_datamart = pd.DataFrame(
    {'ID': ID, 'Season': Season, 'TeamID_a': TeamID_a, 'TeamID_b': TeamID_b}
)

In [5]:
# Per-column overview of the datamart: dtype, null count, distinct count and describe() statistics.
check_missing(df_datamart)

Unnamed: 0,col_names,col_dtypes,col_na_count,col_unique,count,mean,std,min,25%,50%,75%,max
ID,ID,object,0,335805,,,,,,,,
Season,Season,int64,0,5,335805.0,2017.0,1.414216,2015.0,2016.0,2017.0,2018.0,2019.0
TeamID_a,TeamID_a,int64,0,366,335805.0,1222.666667,86.384284,1101.0,1150.0,1208.0,1284.0,1466.0
TeamID_b,TeamID_b,int64,0,366,335805.0,1345.333333,86.384284,1102.0,1284.0,1360.0,1418.0,1467.0


In [6]:
# Print the first 100 datamart rows, then its (rows, columns) shape.
print(df_datamart.head(100))

print(df_datamart.shape)

                ID  Season  TeamID_a  TeamID_b
0   2015_1101_1102    2015      1101      1102
1   2015_1101_1103    2015      1101      1103
2   2015_1101_1104    2015      1101      1104
3   2015_1101_1105    2015      1101      1105
4   2015_1101_1106    2015      1101      1106
..             ...     ...       ...       ...
95  2015_1101_1197    2015      1101      1197
96  2015_1101_1198    2015      1101      1198
97  2015_1101_1199    2015      1101      1199
98  2015_1101_1200    2015      1101      1200
99  2015_1101_1201    2015      1101      1201

[100 rows x 4 columns]
(335805, 4)


# 加入target_p1，target_p2 此部分为对阵比赛结果，target_p1为分差，用score_diff表示，target_p2为胜负，用win表示
### 从 [MRegularSeasonDetailedResults.csv] 和 [MNCAATourneyDetailedResults.csv] 两个文件中提取胜负结果及分差，并另加一列来区分常规赛和淘汰赛

In [7]:
# Load the detailed per-game results for the regular season and for the NCAA tournament.
df_regular_results = pd.read_csv(input_path_mdata + 'MRegularSeasonDetailedResults.csv', engine = 'python')
df_tourney_results = pd.read_csv(input_path_mdata + 'MNCAATourneyDetailedResults.csv', engine = 'python')

In [8]:
# Preview the first five regular-season rows.
df_regular_results.head(5)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


In [9]:
# Preview the first five tournament rows.
df_tourney_results.head(5)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,134,1421,92,1411,84,N,1,32,69,...,31,14,31,17,28,16,15,5,0,22
1,2003,136,1112,80,1436,51,N,0,31,66,...,16,7,7,8,26,12,17,10,3,15
2,2003,136,1113,84,1272,71,N,0,31,59,...,28,14,21,20,22,11,12,2,5,18
3,2003,136,1141,79,1166,73,N,0,29,53,...,17,12,17,14,17,20,21,6,6,21
4,2003,136,1143,76,1301,74,N,1,27,64,...,21,15,20,10,26,16,14,5,8,19


### 说明：文件 [MRegularSeasonDetailedResults.csv] [MNCAATourneyDetailedResults.csv] 是文件 [MRegularSeasonCompactResults.csv] [MNCAATourneyCompactResults.csv] 的扩展版本
### 前8列完全相同，后面追加的为整场比赛的统计数据（此处要与 event 数据区分开：event 是每发生一次事件就记录一条，每场比赛会有多条记录；而 detailed 是每场比赛结束后只统计一次总数）
### WTO - 失误次数（turnovers committed）  WBlk - 盖帽次数（blocks）
### 每列说明： WFGM - 投篮命中次数（所有投篮，包括2分和3分，不含罚球） WFGA - 投篮出手次数（比如5投3中的5） WFGM3 - 3分球命中次数 WFGA3 - 3分球出手次数 WFTM - 罚球命中次数 WFTA - 罚球出手次数
### WOR - 进攻篮板 WDR - 防守篮板 WAst - 助攻 WTO - 失误 WStl - 抢断 WBlk - 盖帽 WPF - 犯规

In [10]:
# Add a Label column to both datasets so that regular-season ('Reg') and
# tournament ('Tourney') games can still be told apart after they are combined.

df_regular_results['Label'] = 'Reg'
df_tourney_results['Label']= 'Tourney'

In [11]:
# (rows, columns) of the regular-season frame after the Label column was added.
df_regular_results.shape

(87504, 35)

## Feature Engineering - Adding features for DataMart

### 1. Adding Score Difference

In [14]:
def _side_features(df, prefix):
    """Return (feature name, Series) pairs for one side of each game ('W' or 'L')."""
    made_2pt = df[prefix + 'FGM'] - df[prefix + 'FGM3']
    tried_2pt = df[prefix + 'FGA'] - df[prefix + 'FGA3']
    features = [
        ('Loc', df[prefix + 'Loc']),
        # Column-wise division: zero attempts give NaN/inf for that game
        # rather than aborting the whole computation.
        ('FGM2_rate', made_2pt / tried_2pt),
        ('FGM3_rate', df[prefix + 'FGM3'] / df[prefix + 'FGA3']),
        # NOTE(review): the column is called FTM_count but is filled from FTA
        # (attempts), exactly as before — confirm which one is intended.
        ('FTM_count', df[prefix + 'FTA']),
    ]
    for stat in ('OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF'):
        features.append((stat, df[prefix + stat]))
    return features


def score_diff(dfin):
    """Re-express winner/loser game rows as ID-ordered (team a, team b) rows.

    "Team a" is the team with the smaller TeamID and "team b" the other one,
    which matches the ``Season_TeamIDa_TeamIDb`` key built into the ``ID``
    column.

    Rule: if the winning team has the smaller ID, ``Score_diff`` is
    ``WScore - LScore`` and ``win`` is 1; otherwise the team order is swapped,
    so ``Score_diff`` is ``LScore - WScore`` and ``win`` is 0.

    ``WLoc`` is the location of the winning team: "H" (home), "A" (away) or
    "N" (neutral court). ``LLoc`` is derived as its mirror image (H <-> A);
    any other value maps to "N".

    Args:
        dfin: Detailed results frame with the W*/L* columns used below
            (``WTeamID``, ``LTeamID``, ``WScore``, ``LScore``, ``WLoc``,
            ``Season`` and the FGM/FGA/FGM3/FGA3/FTA/OR/DR/Ast/TO/Stl/Blk/PF
            statistics for both sides). It is not modified.

    Returns:
        A copy of ``dfin`` with ``Season``, ``WTeamID`` and ``LTeamID``
        converted to strings and these columns appended, in order:
        ``id_diff``, ``win``, ``LLoc``, the eleven ``Team_a_*`` columns, the
        eleven ``Team_b_*`` columns, ``Score_diff`` and ``ID``.
    """
    df = dfin.copy()

    # Negative id_diff <=> the winner has the smaller ID <=> the winner is team a.
    df['id_diff'] = df['WTeamID'] - df['LTeamID']
    a_is_winner = df['id_diff'] < 0

    df['win'] = a_is_winner.astype('int64')
    df['LLoc'] = df['WLoc'].map({'A': 'H', 'H': 'A'}).fillna('N')

    winner_side = _side_features(df, 'W')
    loser_side = _side_features(df, 'L')

    # Team a takes the winner's value where the winner has the smaller ID,
    # the loser's value otherwise; team b is the complement.
    for (name, w_vals), (_, l_vals) in zip(winner_side, loser_side):
        df['Team_a_' + name] = w_vals.where(a_is_winner, l_vals)
    for (name, w_vals), (_, l_vals) in zip(winner_side, loser_side):
        df['Team_b_' + name] = l_vals.where(a_is_winner, w_vals)

    margin = df['WScore'] - df['LScore']
    df['Score_diff'] = margin.where(a_is_winner, -margin)

    # The key parts become strings only after the numeric comparison above.
    for key_col in ['Season', 'WTeamID', 'LTeamID']:
        df[key_col] = df[key_col].astype(str)
    low_id = df['WTeamID'].where(a_is_winner, df['LTeamID'])
    high_id = df['LTeamID'].where(a_is_winner, df['WTeamID'])
    df['ID'] = df['Season'] + '_' + low_id + '_' + high_id

    return df

In [15]:
# Apply score_diff to both the regular-season and the tournament results.
# NOTE(review): the recorded output (26986 + 1115 rows, seasons from 2015)
# suggests both frames were filtered in cells that are not present here —
# confirm before relying on a fresh Restart & Run All.
df_list = [df_regular_results, df_tourney_results]

sdiff_parts = []
for results in df_list:
    part = score_diff(results)
    print(part.shape)
    sdiff_parts.append(part)

# DataFrame.append was removed in pandas 2.0; collect the parts and concatenate
# once. The original row index of each part is kept, as it was with append.
df_sdiff = pd.concat(sdiff_parts)

print(df_sdiff.shape)
print(df_sdiff.head())

(26986, 62)
(1115, 62)
(28101, 62)
      Season  DayNum WTeamID  WScore LTeamID  LScore WLoc  NumOT  WFGM  WFGA  \
60518   2015      11    1103      74    1420      57    H      0    25    53   
60519   2015      11    1104      82    1406      54    H      0    29    63   
60520   2015      11    1112      78    1291      55    H      0    31    54   
60521   2015      11    1113      86    1152      50    H      0    30    49   
60522   2015      11    1119      84    1102      78    H      0    30    61   

       ...  Team_b_FTM_count  Team_b_OR  Team_b_DR  Team_b_Ast  Team_b_TO  \
60518  ...                28         12         23          13         16   
60519  ...                15         14         20           9         22   
60520  ...                12          4         24           8         16   
60521  ...                23         15         16           7         17   
60522  ...                17         17         20          18         15   

       Team_b_Stl  Te

In [16]:
# Keep the merge key, the game day, the per-team features, the targets
# (Score_diff, win) and the Reg/Tourney label, then left-join them onto the
# datamart so that every datamart row is retained.
key_columns = ['ID', 'DayNum']
team_a_columns = [
    'Team_a_Loc', 'Team_a_FGM2_rate', 'Team_a_FGM3_rate', 'Team_a_FTM_count',
    'Team_a_OR', 'Team_a_DR', 'Team_a_Ast', 'Team_a_TO',
    'Team_a_Stl', 'Team_a_Blk', 'Team_a_PF',
]
team_b_columns = [
    'Team_b_Loc', 'Team_b_FGM2_rate', 'Team_b_FGM3_rate', 'Team_b_FTM_count',
    'Team_b_OR', 'Team_b_DR', 'Team_b_Ast', 'Team_b_TO',
    'Team_b_Stl', 'Team_b_Blk', 'Team_b_PF',
]
target_columns = ['Score_diff', 'Label', 'win']

df_sdiff_formerge = df_sdiff[key_columns + team_a_columns + team_b_columns + target_columns]

# A pair can meet more than once per season, so the join may add rows.
df_datamart_add1 = df_datamart.merge(df_sdiff_formerge, how='left', on='ID')

print(df_datamart_add1.shape)

print(df_datamart_add1.head())

(343776, 29)
               ID  Season  TeamID_a  TeamID_b Team_a_Loc  Team_a_FGM2_rate  \
0  2015_1101_1102    2015      1101      1102        NaN               NaN   
1  2015_1101_1103    2015      1101      1103        NaN               NaN   
2  2015_1101_1104    2015      1101      1104        NaN               NaN   
3  2015_1101_1105    2015      1101      1105        NaN               NaN   
4  2015_1101_1106    2015      1101      1106        NaN               NaN   

   Team_a_FGM3_rate  Team_a_FTM_count  Team_a_OR  Team_a_DR  ...  Team_b_OR  \
0               NaN               NaN        NaN        NaN  ...        NaN   
1               NaN               NaN        NaN        NaN  ...        NaN   
2               NaN               NaN        NaN        NaN  ...        NaN   
3               NaN               NaN        NaN        NaN  ...        NaN   
4               NaN               NaN        NaN        NaN  ...        NaN   

   Team_b_DR  Team_b_Ast  Team_b_TO  Team_b

In [17]:
# Per-column overview after the merge; the null counts show how many datamart rows have no matching game.
check_missing(df_datamart_add1)

Unnamed: 0,col_names,col_dtypes,col_na_count,col_unique,count,mean,std,min,25%,50%,75%,max
ID,ID,object,0,335805,,,,,,,,
Season,Season,int64,0,5,343776.0,2017.000108,1.41425,2015.0,2016.0,2017.0,2018.0,2019.0
TeamID_a,TeamID_a,int64,0,366,343776.0,1222.709927,86.379098,1101.0,1150.0,1208.0,1284.0,1466.0
TeamID_b,TeamID_b,int64,0,366,343776.0,1345.352904,86.354654,1102.0,1284.0,1360.0,1418.0,1467.0
Team_a_Loc,Team_a_Loc,object,316455,3,,,,,,,,
Team_a_FGM2_rate,Team_a_FGM2_rate,float64,316455,607,27321.0,0.491186,0.096218,0.060606,0.428571,0.487805,0.555556,0.88
Team_a_FGM3_rate,Team_a_FGM3_rate,float64,316455,327,27321.0,0.344757,0.111458,0.0,0.272727,0.344828,0.416667,1.0
Team_a_FTM_count,Team_a_FTM_count,float64,316455,57,27321.0,19.876066,7.795065,0.0,14.0,19.0,25.0,56.0
Team_a_OR,Team_a_OR,float64,316455,33,27321.0,10.220234,3.979421,0.0,7.0,10.0,13.0,38.0
Team_a_DR,Team_a_DR,float64,316455,44,27321.0,24.521174,5.168359,6.0,21.0,24.0,28.0,51.0
