In [12]:
#importing libraries
import numpy as np
import pandas as pd

In [13]:
# Widen the pandas display limits so large frames can be inspected in full:
# up to 500 rows and 500 columns, 1000 characters per output line,
# and up to 100 characters per cell before truncation.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000
pd.set_option('display.max_colwidth', 100)

In [14]:
# Load the CSV file produced by the scraping step ("elf plays scrap").
df = pd.read_csv(filepath_or_buffer='elf_plays_2022_scrap.csv')

In [15]:
# Drop the 'Unnamed: 0' column (the leftover index column from the CSV).
# errors='ignore' makes this cell safe to re-run: once the column is gone,
# a second execution is a no-op instead of raising KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [16]:
# Lookup tables used later to expand abbreviations into full names.

# Two-letter team code -> full team name.
teams_dict = {
    'CC': 'Cologne Centurions',
    'IR': 'Istanbul Rams',
    'LK': 'Leipzig Kings',
    'PW': 'Panthers Wroclaw',
    'FG': 'Frankfurt Galaxy',
    'RF': 'Rhein Fire',
    'VV': 'Vienna Vikings',
    'RT': 'Raiders Tirol',
    'BD': 'Barcelona Dragons',
    'SS': 'Stuttgart Surge',
    'BT': 'Berlin Thunder',
    'HD': 'Hamburg Sea Devils',
}

# Penalty code -> penalty description.
# A few codes ('IS', 'OD', 'HO', 'PR') map to themselves, i.e. they are left unexpanded.
penalty_dict = {
    'IS': 'IS',
    'OD': 'OD',
    'HO': 'HO',
    'ILF': 'Illegal Forward Handling',
    'DOF': 'Defensive Offside',
    'DOG': 'Delay of Game',
    'FMM': 'Facemask',
    'ICT': 'Illegal Contact',
    'ENC': 'Encroachment',
    'ILH': 'Illegal Use of Hands',
    'PR': 'PR',
    'RPS': 'Roughing the Passer',
    'FST': 'False Start',
    'DPI': 'Defensive Pass Interference',
    'OH': 'Offensive Holding',
    'DH': 'Defensive Holding',
    'UNR': 'Unnecessary Roughness',
    'BLI': 'Illegal Blindside Block',
    'false start': 'False Start',
}

In [17]:
#creating additional columns based on the source data
#
# Every column below is derived from the free-text 'play' and 'situation'
# columns by substring tests, word positions, or regex extraction.
#
# All substring tests go through _has(), which passes na=False to
# str.contains. Without it, a missing 'play'/'situation' cell yields NaN in
# the mask, and np.where treats NaN as truthy -- so a row with no text would
# be flagged as sack/td/fumble/... = 1 or take the wrong branch.


def _has(column, text):
    """Return a boolean mask of the rows where df[column] contains `text`.

    Missing values count as "no match", so the mask is safe to pass to np.where.
    """
    return df[column].str.contains(text, na=False)


def _words(start, stop):
    """Return words start:stop of df['situation'] (split on single spaces), re-joined with spaces."""
    return df['situation'].str.split(" ").str[start:stop].str.join(" ")


def _extract(pattern):
    """Return the first capture group of `pattern` matched against df['situation'] (NaN when no match)."""
    return df['situation'].str.extract(pattern, expand=False)


# Play-type masks. df['play'] is not modified in this cell, so compute them once.
is_pass = _has('play', 'Pass')
is_rush = _has('play', 'Rush')
is_kickoff = _has('play', 'Kickoff')
is_penalty = _has('play', 'Penalty')

# --- game identifier -------------------------------------------------------
# Game date followed by the first three characters of the away and home team
# names. (The previous `.str.split("").str[:4].str.join("")` spelling yielded
# the same three characters on Python 3.7+, via an empty-pattern regex split
# whose first element is an empty string.)
df['game_id'] = df['game_date'].astype(str) + df['away_team'].str[:3] + df['home_team'].str[:3]

# --- passing ---------------------------------------------------------------
# The passer is taken to be the first two words of the situation text.
df['passer'] = np.where(is_pass, _words(0, 2), np.nan)
df['receiver'] = _extract(r'(?<=pass complete to\s)(\S*\s\S*)')
# Complete and incomplete passes both read the target from words 5-6,
# so a single combined condition is enough.
df['intended_receiver'] = np.where(
    is_pass & (_has('situation', 'pass incomplete') | _has('situation', 'pass complete')),
    _words(5, 7),
    np.nan)
df['pass_comp'] = np.where(
    is_pass & _has('situation', 'pass incomplete'), 0,
    np.where(is_pass & _has('situation', 'pass complete'), 1, np.nan))

# Receiving yards; branch order matters: sacks first (no value), then no gain,
# then losses (sign prepended as text), then ordinary completions.
df['rec_yds'] = np.where(
    is_pass & _has('situation', 'sacked'), np.nan,
    np.where(
        is_pass & _has('situation', 'no gain'), 0,
        np.where(
            is_pass & _has('situation', 'loss of'),
            "-" + _extract(r'(?<=for loss of\s)(\S*)'),
            np.where(
                is_pass & _has('situation', 'pass complete'),
                _extract(r'(?<=for\s)(\S*)'),
                np.nan))))

# --- rushing ---------------------------------------------------------------
df['rusher'] = np.where(is_rush & _has('situation', 'rush'), _words(0, 2), np.nan)
# Losses first (sign prepended as text), then no gain, then ordinary rushes.
df['rush_yds'] = np.where(
    _has('situation', 'rush for loss'),
    "-" + _extract(r'(?<=rush for loss of\s)(\S*)'),
    np.where(
        _has('situation', 'rush for no gain'), 0,
        np.where(is_rush, _extract(r'(?<=rush for\s)(\S*)'), np.nan)))

# --- punts and kickoffs ----------------------------------------------------
# Yardage is read from the 4th word of the situation text.
df['punt_yds'] = np.where(_has('play', 'Punt'), _words(3, 4), np.nan)
df['kickoff_player'] = np.where(is_kickoff, _words(0, 2), np.nan)
df['kickoff_yds'] = np.where(is_kickoff, _words(3, 4), np.nan)
df['kickoff_returner'] = np.where(is_kickoff, _extract(r'(\S*\s\S*)(?=\sreturn)'), np.nan)
df['kickoff_ret_yds'] = np.where(is_kickoff, _extract(r'(?<=return\s)(\S*)'), np.nan)

# --- defensive actions -----------------------------------------------------
# Text inside the first pair of parentheses, for every non-penalty play.
df['def_player_action'] = np.where(
    is_penalty, np.nan, _extract(r"\((.*?)\)").str.replace(";", ','))
df['int'] = np.where(is_pass & _has('situation', 'intercepted'), 1, 0)
df['int_player'] = np.where(
    _has('situation', 'intercepted'), _extract(r'(?<=intercepted by\s)(\S*\s\S*)'), np.nan)
df['int_ret_yds'] = np.where(
    _has('situation', 'intercepted'), _extract(r'(?<=return\s)(\S*)'), np.nan)
df['sack'] = np.where(_has('situation', 'sack'), 1, 0)
# Sack yardage is read from the 7th word, with the sign prepended as text.
df['sack_yds'] = np.where(_has('situation', 'sacked for loss of'), "-" + _words(6, 7), np.nan)

# --- scoring ---------------------------------------------------------------
df['td'] = np.where(_has('situation', 'TOUCHDOWN'), 1, 0)
df['pat'] = np.where(_has('play', 'Point after try'), 1, 0)
df['fg'] = np.where(_has('play', 'Field goal attempt') & _has('situation', 'GOOD'), 1, 0)

# --- in-place clean-up of source columns -------------------------------------
# Keep everything after the first "_" in drive_num (remaining parts joined by spaces).
# NOTE(review): this overwrites drive_num, so re-running this cell on its own
# strips the already-shortened value again; re-run from the read_csv cell instead.
df['drive_num'] = df['drive_num'].str.split("_").str[1:].str.join(" ")
# From here on 'situation' uses "," wherever the source text had ";".
df['situation'] = df['situation'].str.replace(";", ',')

# --- timeouts and penalties --------------------------------------------------
df['timeout'] = np.where(_has('play', 'Timeout'), 1, 0)
df['penalty'] = np.where(is_penalty, 1, 0)
# Leading text up to the first digit or "(" (or end of string), then drop its first two words.
df['penalty_type'] = np.where(is_penalty, _extract(r"^(.+?) ?(?:\d|\(|$)"), np.nan)
df['penalty_type'] = df['penalty_type'].str.split(" ").str[2:].str.join(" ")
# The second word of the situation text is used as the penalised team.
df['penalty_team'] = np.where(is_penalty, _words(1, 2), np.nan)
df['penalty_player'] = np.where(
    is_penalty, _extract(r"\((.*?)\)").str.replace(";", ','), np.nan)

# --- fumbles -----------------------------------------------------------------
df['fumble'] = np.where(_has('situation', 'fumble'), 1, 0)
df['fum_player'] = np.where(
    _has('situation', 'fumble'), _extract(r'(?<=fumble by\s)(\S*\s\S*)'), np.nan)
df['fum_force_player'] = np.where(
    _has('situation', 'fumble'), _extract(r'(?<=forced by\s)(\S*\s\S*)'), np.nan)
# The look-behind skips exactly two non-space characters after "recovered by "
# before capturing the player's name.
df['fum_recov_player'] = np.where(
    _has('situation', 'fumble'), _extract(r'(?<=recovered by\s\S\S\s)(\S*\s\S*)'), np.nan)
df['fum_recov_team'] = np.where(
    _has('situation', 'fumble'), _extract(r'(?<=recovered by\s)(\S*)'), np.nan)

In [18]:
# Expand abbreviations using the lookup dictionaries created earlier.
# Each column is mapped on its own; values without a dictionary entry stay as they are.
df['penalty_team'] = df['penalty_team'].replace(teams_dict)
df['fum_recov_team'] = df['fum_recov_team'].replace(teams_dict)
df['penalty_type'] = df['penalty_type'].replace(penalty_dict)

In [20]:
# Save the enriched play-by-play table to CSV (the index is written as well).
df.to_csv(path_or_buf='elf_plays_2022.csv')