In [1]:
import time as t
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV

In [2]:
def all_pros_per_year_per_team(df_all_pro):
    """Count the rows of ``df_all_pro`` for every ('Tm', 'year') pair.

    Parameters
    ----------
    df_all_pro : pandas.DataFrame
        Frame with at least the columns 'Tm' and 'year'.

    Returns
    -------
    pandas.DataFrame
        One row per ('Tm', 'year') group with the columns 'Tm', 'year'
        and 'counts' (the number of rows in that group).
    """
    group_sizes = df_all_pro.groupby(['Tm', 'year']).size()
    # Turn the grouped Series back into a flat frame and name the size column.
    return group_sizes.reset_index(name='counts')

def get_mvp_team_year(df_mvp_data):
    """Return the 'Year' and 'Tm' columns for rows with 2000 <= Year < 2022.

    Parameters
    ----------
    df_mvp_data : pandas.DataFrame
        Frame with at least the columns 'Year' and 'Tm'.

    Returns
    -------
    pandas.DataFrame
        The matching rows (original index preserved), restricted to the
        columns 'Year' and 'Tm' in that order.
    """
    years = df_mvp_data['Year']
    # Half-open window: 2000 is included, 2022 is not.
    in_window = (years >= 2000) & (years < 2022)
    selected = df_mvp_data[in_window]
    return selected[['Year', 'Tm']]

def proc_season(df_season_data):
    """Clean a season-standings frame in place and return it.

    Steps, in order:

    1. Prefix every 'W-L%' value with '0' (``convert_percentage``).
    2. Convert every column except the first with ``pd.to_numeric``.
    3. Strip the '*' and '+' markers from 'Tm' (``replace_team``).

    Parameters
    ----------
    df_season_data : pandas.DataFrame
        Frame with at least the columns 'Tm' and 'W-L%'. It is modified
        in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after the modifications.

    Raises
    ------
    ValueError
        Propagated from ``pd.to_numeric`` when a column other than the
        first holds values that cannot be parsed as numbers.
    """
    # The column list must come from the frame being processed. The previous
    # code read `df_season_copy`, which is not defined in this function.
    # NOTE(review): assumes the first column is the non-numeric 'Tm' — confirm.
    numeric_cols = list(df_season_data.columns)[1:]
    df_season_data["W-L%"] = df_season_data["W-L%"].apply(convert_percentage)
    df_season_data[numeric_cols] = df_season_data[numeric_cols].apply(pd.to_numeric)
    df_season_data['Tm'] = df_season_data['Tm'].apply(replace_team)
    return df_season_data

def proc_season_val(df_season_copy, df_season_data):
    """Clean a season-standings frame in place and add playoff flag columns.

    Parameters
    ----------
    df_season_copy : pandas.DataFrame
        Accepted for interface compatibility; not read by this function.
    df_season_data : pandas.DataFrame
        Frame with at least the columns 'Tm' and 'W-L%'. It is modified
        in place.

    Returns
    -------
    pandas.DataFrame
        ``df_season_data`` itself, with every column after the first
        converted by ``pd.to_numeric``, the boolean columns
        'division_winner' ('Tm' contains '*') and 'wildcard' ('Tm'
        contains '+') appended, and the markers stripped from 'Tm'.
    """
    frame = df_season_data
    # Snapshot the column list before the flag columns are appended.
    to_convert = list(frame.columns)[1:]

    frame["W-L%"] = frame["W-L%"].apply(convert_percentage)
    frame[to_convert] = frame[to_convert].apply(pd.to_numeric)

    # The flags must be read from 'Tm' before the markers are stripped below.
    for flag_col, marker in (('division_winner', "*"), ('wildcard', "+")):
        frame[flag_col] = frame['Tm'].str.contains(marker, regex=False)

    frame['Tm'] = frame['Tm'].apply(replace_team)
    return frame

def replace_team(val):
    """Return ``val`` with every '*' and '+' character removed."""
    cleaned = val
    for marker in ('*', '+'):
        cleaned = cleaned.replace(marker, '')
    return cleaned

def convert_percentage(val):
    """Return the string form of ``val`` with a '0' prepended.

    For example '.750' becomes '0.750'. The '0' is prepended
    unconditionally, so a value that already has a leading digit gets one
    more (0.5 becomes '00.5').
    """
    return f"0{val!s}"

def proc_team_name(t):
    """Normalise a team name to a compact lowercase key.

    Every character matched by the regex ``[\\W_]`` (non-word characters
    and the underscore) is removed, and the remainder is lowercased.
    For example 'St. Louis Rams' becomes 'stlouisrams'.
    """
    return re.sub(r'[\W_]', '', t).lower()

def fix_team_names(t):
    """Map a normalised team key containing an older name to its newer key.

    The rules are tried in order and the first one whose old name occurs
    as a substring of ``t`` wins. If none matches, ``t`` is returned
    unchanged.
    """
    renames = (
        (('oaklandraiders',), 'lasvegasraiders'),
        (('sandiegochargers',), 'losangeleschargers'),
        (('stlouisrams',), 'losangelesrams'),
        (('washingtonfootballteam', 'washingtonredskins'), 'washingtoncommanders'),
    )
    for old_names, current_name in renames:
        if any(old in t for old in old_names):
            return current_name
    return t
    
def list_diff(df_big, df_small):
    """Return the set of ``df_big['Tm']`` values absent from ``df_small['Name']``.

    Parameters
    ----------
    df_big : frame-like with a 'Tm' column
    df_small : frame-like with a 'Name' column

    Returns
    -------
    set
        Values that occur in 'Tm' but not in 'Name'.
    """
    known_names = set(df_small['Name'])
    return {team for team in df_big['Tm'] if team not in known_names}

def proc_all(season, df_season, df_teams, df_records, df_all_pro=None):
    """Build the merged per-team table for one season.

    Steps:

    1. Keep the rows of ``df_season`` whose 'year' equals ``season``, add a
       positional 'rank' column, drop rows whose 'Tm' contains "AFC" or
       "NFC", and drop the 'T' column.
    2. Clean the result with ``proc_season_val`` and normalise 'Tm' with
       ``proc_team_name`` / ``fix_team_names``.
    3. Inner-merge with ``df_teams`` on the normalised team name, then drop
       the 'Tm' and 'Name' key columns.
    4. Left-merge the per-team/per-year counts from
       ``all_pros_per_year_per_team`` on ('Alias', 'year'); missing counts
       become 0.
    5. Left-merge ``df_records`` (with 'team' upper-cased) on
       ('Alias', 'year') and drop rows that have no 'record'.

    Parameters
    ----------
    season : value compared against the 'year' column
    df_season : pandas.DataFrame
        Needs the columns 'year', 'Tm', 'T' and 'W-L%'.
    df_teams : pandas.DataFrame
        Needs the column 'Name'. Not modified.
        NOTE(review): 'Alias' is presumably supplied by this frame — confirm.
    df_records : pandas.DataFrame
        Needs the columns 'team', 'year' and 'record'. Not modified.
    df_all_pro : pandas.DataFrame, optional
        Frame with the columns 'Tm' and 'year'. When omitted, the
        module-level ``df_allpro`` is used, as before.

    Returns
    -------
    pandas.DataFrame
        The merged table, limited to rows with a non-null 'record'.
    """
    if df_all_pro is None:
        # Backward-compatible fallback to the notebook-level frame.
        df_all_pro = df_allpro

    season_rows = df_season[df_season['year'] == season].reset_index(drop=True)
    # NOTE(review): 'rank' is derived from the row position BEFORE the
    # "AFC"/"NFC" rows are removed, so those rows still occupy positions in
    # the modulo-4 cycle. Confirm this ordering is intended.
    season_rows['rank'] = ((season_rows.index.values) % 4) + 1
    # `== False` (rather than `~`) also drops rows where 'Tm' is missing.
    df_season_filtered = season_rows[season_rows['Tm'].str.contains("AFC|NFC") == False].drop(columns=['T'])
    df_season_proc = proc_season_val(df_season_filtered.copy(), df_season_filtered)
    df_season_proc['Tm'] = df_season_proc['Tm'].apply(proc_team_name).apply(fix_team_names)

    # Normalise the join keys on derived frames so the caller's df_teams and
    # df_records are left untouched (they were previously overwritten in place).
    teams_norm = df_teams.assign(Name=df_teams['Name'].apply(proc_team_name))
    records_norm = df_records.assign(team=df_records['team'].str.upper())

    df_season_merged = pd.merge(df_season_proc, teams_norm, left_on='Tm', right_on='Name')
    df_season_merged = df_season_merged.drop(columns=['Tm', 'Name'])

    df_season_all_pro = pd.merge(
        df_season_merged,
        all_pros_per_year_per_team(df_all_pro),
        left_on=['Alias', 'year'],
        right_on=['Tm', 'year'],
        how='left',
    )
    # Teams with no matching all-pro rows get a count of 0 instead of NaN.
    df_season_all_pro['counts'] = df_season_all_pro['counts'].fillna(0)

    df_season_all_pro_records = pd.merge(
        df_season_all_pro,
        records_norm,
        left_on=['Alias', 'year'],
        right_on=['team', 'year'],
        how='left',
    )
    return df_season_all_pro_records.dropna(subset=['record'])

def get_per_season(season, df_proc):
    """Return the rows of one season with a positional 'rank' column.

    Parameters
    ----------
    season : value compared against the 'year' column
    df_proc : pandas.DataFrame
        Frame with at least the column 'year'.

    Returns
    -------
    pandas.DataFrame
        The matching rows with a fresh 0..n-1 index; the previous index is
        kept as a column (``reset_index`` without ``drop``). 'rank' cycles
        through 1..16 by row position.
    """
    season_rows = df_proc.loc[df_proc['year'] == season].reset_index()
    positions = season_rows.index.values
    season_rows['rank'] = (positions % 16) + 1
    return season_rows

In [3]:
def create_file(name, season):
    """Write the processed table for ``season`` to ``name + '.csv'``.

    The table comes from ``proc_all``, which is given the module-level
    frames ``df_season``, ``df_teams`` and ``df_records``. The CSV is
    written without the index column.
    """
    output_path = name + '.csv'
    season_table = proc_all(season, df_season, df_teams, df_records)
    season_table.to_csv(output_path, index=False)

In [4]:
# Load the input tables from CSV files (paths are relative to the working directory).
# df_season, df_teams and df_records are handed to proc_all by create_file.
df_season = pd.read_csv('seasons_2000_2022.csv')
# df_mvp is not referenced by any of the cells shown here.
df_mvp = pd.read_csv('nfl_mvp.csv')
# proc_all reads df_allpro as a module-level name.
# NOTE(review): the file is named "probowl" while the variable says "allpro" — confirm which one it holds.
df_allpro = pd.read_csv('probowl_2000_2022.csv')
df_teams = pd.read_csv('teams.csv')
df_records = pd.read_csv('team_records_playoffs_inc_2020_2021.csv')

In [5]:
# Build the processed table for season 2021 and write it to '2021_data.csv'.
create_file('2021_data', 2021)