In [None]:
#importing libraries
import numpy as np
import pandas as pd

In [None]:
# Raise pandas' display limits so wide / long frames are shown without truncation.
for option_name, option_value in {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
    'display.max_colwidth': 100,
}.items():
    pd.set_option(option_name, option_value)

In [None]:
# Load the CSV files produced by the "elf plays scrap" scraping step:
# one with the plays and one with the schedule / game information.
plays_csv, schedule_csv = 'elf_plays_2022_scrap.csv', 'elf_schedule_scrap.csv'
df_play = pd.read_csv(plays_csv)
df_game_info = pd.read_csv(schedule_csv)

In [None]:
# Drop the 'Unnamed: 0' column (the unnamed index column carried by the CSVs).
# errors='ignore' keeps this cell re-runnable: once the column is gone (or if
# a file never had it) the drop is a no-op instead of raising KeyError.
df_play = df_play.drop(columns=['Unnamed: 0'], errors='ignore')
df_game_info = df_game_info.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Build the game_id key on the play-by-play frame; it is the column used to
# merge with the game-info frame.
#   game_id = first 4 chars of game_date + first 3 of away_team + first 3 of home_team
# The widths are 4/3/3 on purpose: splitting on the empty pattern "" goes
# through the regex engine and yields a leading '' element, so list slices of
# [0:5] / [:4] only ever kept 4 / 3 real characters. Plain string slices give
# the same key without depending on that empty-match behaviour.
df_play['game_id'] = (
    df_play['game_date'].astype(str).str[:4]
    + df_play['away_team'].str[:3]
    + df_play['home_team'].str[:3]
)

In [None]:
# Left-join the game information onto every play via game_id, then tidy the
# overlapping team columns: the play-by-play copies ("_x") are discarded and
# the game-info copies lose their "_y" suffix. yds_to_go is renamed to
# yds_to_1st_down in the same step.
df = (
    df_play
    .merge(df_game_info, how='left', on='game_id')
    .drop(columns=['away_team_x', 'home_team_x'])
    .rename(columns={
        'away_team_y': 'away_team',
        'home_team_y': 'home_team',
        'yds_to_go': 'yds_to_1st_down',
    })
)

In [None]:
#creating additional columns with data

# Text clean-up.
# NOTE: the drive_num line is not idempotent - running it a second time on the
# already-cleaned column leaves empty strings, so run this cell once per merge.
df['drive_num'] = df['drive_num'].str.split("_").str[1:].str.join(" ")
df['situation'] = df['situation'].str.replace(";", ',', regex=False)

# Masks are built with na=False: str.contains returns NaN for missing text and
# np.where treats NaN as True, which would flag rows without a play type as
# pass attempts.
situation_words = df['situation'].str.split(" ")
is_pass = df['play'].str.contains('Pass', na=False)
is_complete = is_pass & df['situation'].str.contains('pass complete', na=False)
is_incomplete = is_pass & df['situation'].str.contains('pass incomplete', na=False)

# Passer = first two words of the situation text on pass plays.
df['passer'] = np.where(is_pass, situation_words.str[:2].str.join(" "), np.nan)
# Receiver = the two words following "pass complete to" (expand=False -> Series).
df['receiver'] = df['situation'].str.extract(r'(?<=pass complete to\s)(\S*\s\S*)', expand=False)
# Intended receiver = words 6-7 of the text for complete and incomplete passes alike.
df['intended_receiver'] = np.where(is_complete | is_incomplete,
                                   situation_words.str[5:7].str.join(" "), np.nan)
df['pass_attempt'] = np.where(is_pass, 1, 0)
df['pass_comp'] = np.where(is_complete, 1, 0)
# passing_yds on pass plays; the first matching rule wins:
#   text contains "incomplete"    -> 0
#   text contains "no gain"       -> 0
#   text contains "loss of"       -> "-" + token after "for loss of "
#   text contains "pass complete" -> token after the first "for "
#   otherwise                     -> NaN
# The column stays object-typed (ints and numeric strings), as before.
# na=False keeps rows with missing play/situation text out of every rule
# (np.where would otherwise read the NaN returned by str.contains as True).
situation_text = df['situation']
is_pass = df['play'].str.contains('Pass', na=False)
yds_lost = "-" + situation_text.str.extract(r'(?<=for loss of\s)(\S*)', expand=False)
yds_gained = situation_text.str.extract(r'(?<=for\s)(\S*)', expand=False)
df['passing_yds'] = np.where(is_pass & situation_text.str.contains('incomplete', na=False), 0,
                    np.where(is_pass & situation_text.str.contains('no gain', na=False), 0,
                    np.where(is_pass & situation_text.str.contains('loss of', na=False), yds_lost,
                    np.where(is_pass & situation_text.str.contains('pass complete', na=False), yds_gained,
                             np.nan))))
# Rushing columns. Masks use na=False so rows with missing play/situation text
# never match (np.where treats the NaN returned by str.contains as True).
situation_text = df['situation']
is_rush = df['play'].str.contains('Rush', na=False)

# Rusher = first two words of the situation text on rush plays.
df['rusher'] = np.where(is_rush & situation_text.str.contains('rush', na=False),
                        situation_text.str.split(" ").str[0:2].str.join(" "), np.nan)

# rush_yds, first matching rule wins:
#   "rush for loss of N" -> "-N"
#   "rush for no gain"   -> 0
#   any other rush play  -> token after "rush for "
#   otherwise            -> NaN
rush_loss = "-" + situation_text.str.extract(r'(?<=rush for loss of\s)(\S*)', expand=False)
rush_gain = situation_text.str.extract(r'(?<=rush for\s)(\S*)', expand=False)
df['rush_yds'] = np.where(situation_text.str.contains('rush for loss', na=False), rush_loss,
                 np.where(situation_text.str.contains('rush for no gain', na=False), 0,
                 np.where(is_rush, rush_gain, np.nan)))

# Total yards on the play: rushing + passing, with missing values counted as 0.
df['yds_gained_per_play'] = (pd.to_numeric(df['rush_yds'].fillna(0))
                             + pd.to_numeric(df['passing_yds'].fillna(0)))
# Touchdown flags. na=False matters here: without it a row whose situation text
# is missing yields NaN from str.contains, which np.where reads as True (td=1).
is_touchdown = df['situation'].str.contains('TOUCHDOWN', na=False)
df['td'] = np.where(is_touchdown, 1, 0)
df['pass_td'] = np.where(df['play'].str.contains('Pass', na=False) & is_touchdown, 1, 0)
df['rush_td'] = np.where(df['play'].str.contains('Rush', na=False) & is_touchdown, 1, 0)
# Extra-point (point after try) columns.
situation_text = df['situation']

# An attempt is either a 'Point after try' play whose text mentions a kick, or
# any play whose text contains "kick attempt". na=False keeps rows with missing
# text from being counted (np.where reads NaN as True).
is_pat_kick = (df['play'].str.contains('Point after try', na=False)
               & situation_text.str.contains('kick', na=False))
df['extra_point_attempt'] = np.where(
    is_pat_kick | situation_text.str.contains('kick attempt', na=False), 1, 0)

# Result of the kick. Assignments go from least to most specific so the later,
# more specific label wins:
#   - "(blocked)" / "(fumbled)" must be applied after the generic "failed",
#     otherwise they can never be reached;
#   - they are matched literally (regex=False) because "(" and ")" would
#     otherwise be interpreted as a regex group and never match the text;
#   - rows without a result stay real NaN (np.where with string labels and
#     np.nan produces the string 'nan' instead).
extra_point_result = pd.Series(np.nan, index=df.index, dtype=object)
extra_point_result.loc[situation_text.str.contains('kick attempt good', na=False)] = 'good'
extra_point_result.loc[situation_text.str.contains('kick attempt failed', na=False)] = 'failed'
extra_point_result.loc[situation_text.str.contains('kick attempt failed (blocked)',
                                                   regex=False, na=False)] = 'blocked'
extra_point_result.loc[situation_text.str.contains('kick attempt failed (fumbled)',
                                                   regex=False, na=False)] = 'fumbled'
df['extra_point_result'] = extra_point_result

# TODO: 2pt_attempt / 2pt_attempt_result columns are not derived yet.
# Field-goal columns. Masks use na=False so rows with missing play/situation
# text never match (np.where reads the NaN from str.contains as True).
situation_text = df['situation']
is_fg = df['play'].str.contains('Field goal', na=False)
is_fg_attempt = df['play'].str.contains('Field goal attempt', na=False)

df['fg_attempt'] = np.where(is_fg_attempt, 1, 0)

# Result label with priority GOOD > MISSED > BLOCKED (later assignment wins).
# Built on an object Series so rows without a result stay real NaN; np.where
# with string labels and np.nan would store the string 'nan' instead.
fg_result = pd.Series(np.nan, index=df.index, dtype=object)
fg_result.loc[is_fg & situation_text.str.contains('BLOCKED', na=False)] = 'blocked'
fg_result.loc[is_fg & situation_text.str.contains('MISSED', na=False)] = 'missed'
fg_result.loc[is_fg & situation_text.str.contains('GOOD', na=False)] = 'good'
df['fg_attempt_result'] = fg_result

# Distance = token after "attempt from "; 0 on plays that are not FG attempts.
df['fg_attempt_distance'] = np.where(
    is_fg_attempt, situation_text.str.extract(r'(?<=attempt from\s)(\S*)', expand=False), 0)
# Kicker = the two words immediately before " field".
df['fg_kicker'] = np.where(
    is_fg, situation_text.str.extract(r'(\S*\s\S*)(?=\sfield)', expand=False), np.nan)
# Kickoff columns, filled only on plays whose type contains 'Kickoff'.
# na=False: a missing play type must not count as a kickoff (np.where would
# read the NaN returned by str.contains as True).
situation_text = df['situation']
situation_words = situation_text.str.split(" ")
is_kickoff = df['play'].str.contains('Kickoff', na=False)

# Kicker = first two words; kick distance = fourth word of the situation text.
df['kickoff_player'] = np.where(is_kickoff, situation_words.str[:2].str.join(" "), np.nan)
df['kickoff_yds'] = np.where(is_kickoff, situation_words.str[3:4].str.join(" "), np.nan)
# Returner = two words before " return"; return yards = token after "return ".
df['kickoff_returner'] = np.where(
    is_kickoff, situation_text.str.extract(r'(\S*\s\S*)(?=\sreturn)', expand=False), np.nan)
df['kickoff_ret_yds'] = np.where(
    is_kickoff, situation_text.str.extract(r'(?<=return\s)(\S*)', expand=False), np.nan)
# Punt, timeout and defensive-action columns.
# na=False on the flag masks: str.contains returns NaN for a missing play type
# and np.where reads NaN as True, which would mark such rows as punts/timeouts.
situation_text = df['situation']
play_type = df['play']
is_punt = play_type.str.contains('Punt', na=False)

df['punt'] = np.where(is_punt, 1, 0)
# Punt distance = token after "punt "; 0 on non-punt plays.
df['punt_yds'] = np.where(is_punt, situation_text.str.extract(r"(?<=punt\s)(\S*)", expand=False), 0)
# Punter = the two words immediately before " punt".
df['punter'] = np.where(is_punt, situation_text.str.extract(r"(\S*\s\S*)(?=\spunt)", expand=False), np.nan)
df['timeout'] = np.where(play_type.str.contains('Timeout', na=False), 1, 0)

# Defensive action = text inside the first pair of parentheses, left empty on
# penalty plays. na=True keeps rows with a missing play type empty as well.
is_penalty_or_unknown = play_type.str.contains('Penalty', na=True)
df['def_player_action'] = np.where(
    is_penalty_or_unknown, np.nan,
    situation_text.str.extract(r"\((.*?)\)", expand=False).str.replace(";", ',', regex=False))
# Interception and sack columns. Masks use na=False so a missing situation
# text is never flagged (np.where reads the NaN from str.contains as True,
# which would set sack=1 on such rows).
situation_text = df['situation']
is_intercepted = situation_text.str.contains('intercepted', na=False)

df['int'] = np.where(df['play'].str.contains('Pass', na=False) & is_intercepted, 1, 0)
# Interceptor = two words after "intercepted by "; return yards = token after "return ".
df['int_player'] = np.where(
    is_intercepted, situation_text.str.extract(r'(?<=intercepted by\s)(\S*\s\S*)', expand=False), np.nan)
df['int_ret_yds'] = np.where(
    is_intercepted, situation_text.str.extract(r'(?<=return\s)(\S*)', expand=False), np.nan)

df['sack'] = np.where(situation_text.str.contains('sack', na=False), 1, 0)
# Sack yardage = seventh word of the text, stored with a leading "-".
df['sack_yds'] = np.where(situation_text.str.contains('sacked for loss of', na=False),
                          "-" + situation_text.str.split(" ").str[6:7].str.join(" "), np.nan)
# Fumble columns, filled only when the situation text mentions "fumble".
# na=False: a missing situation text must not be flagged as a fumble
# (np.where reads the NaN returned by str.contains as True).
situation_text = df['situation']
is_fumble = situation_text.str.contains('fumble', na=False)

df['fumble'] = np.where(is_fumble, 1, 0)
# Player names are the two words following the respective phrase.
df['fum_player'] = np.where(
    is_fumble, situation_text.str.extract(r'(?<=fumble by\s)(\S*\s\S*)', expand=False), np.nan)
df['fum_force_player'] = np.where(
    is_fumble, situation_text.str.extract(r'(?<=forced by\s)(\S*\s\S*)', expand=False), np.nan)
# "recovered by XX <player>": the lookbehind skips a two-character token
# before the player name; that first token is what fum_recov_team captures.
df['fum_recov_player'] = np.where(
    is_fumble, situation_text.str.extract(r'(?<=recovered by\s\S\S\s)(\S*\s\S*)', expand=False), np.nan)
df['fum_recov_team'] = np.where(
    is_fumble, situation_text.str.extract(r'(?<=recovered by\s)(\S*)', expand=False), np.nan)
# Penalty columns, filled only on plays whose type contains 'Penalty'.
# na=False: a missing play type must not count as a penalty (np.where reads
# the NaN returned by str.contains as True).
situation_text = df['situation']
is_penalty = df['play'].str.contains('Penalty', na=False)

df['penalty'] = np.where(is_penalty, 1, 0)

# Penalty type: take the text up to the first digit / "(" / end of string,
# then drop its first two words.
penalty_head = situation_text.str.extract(r"^(.+?) ?(?:\d|\(|$)", expand=False)
df['penalty_type'] = np.where(is_penalty, penalty_head.str.split(" ").str[2:].str.join(" "), np.nan)

# Yardage = token immediately before " yards"; 0 on non-penalty plays.
df['penalty_yds'] = np.where(
    is_penalty, situation_text.str.extract(r'(\S*)(?=\syards)', expand=False), 0)
# Team = second word of the situation text.
df['penalty_team'] = np.where(
    is_penalty, situation_text.str.split(" ").str[1:2].str.join(" "), np.nan)
# Player = text inside the first pair of parentheses.
df['penalty_player'] = np.where(
    is_penalty,
    situation_text.str.extract(r"\((.*?)\)", expand=False).str.replace(";", ',', regex=False),
    np.nan)

In [None]:
# Lookup tables used to replace abbreviations / short labels in selected columns.
teams_dict = {
    'CC': 'Cologne Centurions',
    'IR': 'Istanbul Rams',
    'LK': 'Leipzig Kings',
    'PW': 'Panthers Wroclaw',
    'FG': 'Frankfurt Galaxy',
    'RF': 'Rhein Fire',
    'VV': 'Vienna Vikings',
    'RT': 'Raiders Tirol',
    'BD': 'Barcelona Dragons',
    'SS': 'Stuttgart Surge',
    'BT': 'Berlin Thunder',
    'HD': 'Hamburg Sea Devils',
}
penalty_dict = {
    'IS': 'IS',
    'OD': 'OD',
    'HO': 'HO',
    'ILF': 'Illegal Forward Handling',
    'DOF': 'Defensive Offside',
    'DOG': 'Delay of Game',
    'FMM': 'Facemask',
    'ICT': 'Illegal Contact',
    'ENC': 'Encroachment',
    'ILH': 'Illegal Use of Hands',
    'PR': 'PR',
    'RPS': 'Roughing the Passer',
    'FST': 'False Start',
    'DPI': 'Defensive Pass Interference',
    'OH': 'Offensive Holding',
    'DH': 'Defensive Holding',
    'UNR': 'Unnecessary Roughness',
    'BLI': 'Illegal Blindside Block',
    'false start': 'False Start',
}
quarter_dict = {'1st Quarter': '1', '2nd Quarter': '2', '3rd Quarter': '3', '4th Quarter': '4'}
id_dict = {'Wroclaw': "Panthers"}

# Apply every per-column mapping in one pass; each column only sees its own table.
df = df.replace({
    "penalty_team": teams_dict,
    "fum_recov_team": teams_dict,
    "penalty_type": penalty_dict,
    "qtr": quarter_dict,
    "home_team": id_dict,
    "away_team": id_dict,
})

In [None]:
# Write the enriched play-by-play frame to disk (the index is written too).
output_csv = 'elf_plays_2022.csv'
df.to_csv(output_csv)