In [50]:
import os
import re
from collections import defaultdict
from glob import glob

import numpy as np
import pandas as pd
import patsy

# Floating-point NaN, kept as a named constant.
NAN = float('nan')
# Template for output CSV paths; '{}' is filled in with the file name (no extension).
OUTPUT_PATH = "data/2/{}.csv"

# Import data

In [51]:
# Template for input CSV paths; '{}' is filled in with a file name or glob pattern.
INPUT_PATH = "data/1/{}.csv"
# Every file matching data/1/games_*.csv (glob does not guarantee any ordering).
GAMES_FILES = glob(INPUT_PATH.format("games_*"))

def games_df_from_files(files):
    """Read per-week game CSVs and stack them into one DataFrame.

    Season and week are parsed from each file name, which must look like
    ``<prefix>_<season>_<week>.csv`` (e.g. ``games_2016_1.csv``). Only the
    bare file name is inspected, so directory names may contain underscores.

    Parameters
    ----------
    files : iterable of str
        Paths of the CSV files to load. Each file must contain exactly seven
        data columns, optionally preceded by an 'Unnamed: 0' index column.

    Returns
    -------
    pandas.DataFrame
        Rows of all files with columns team1, team2, result, forecast,
        details, wind, link, week, season. Each file keeps its own row
        index, so index labels repeat across files.

    Raises
    ------
    ValueError
        If ``files`` is empty (from ``pd.concat``), if a file does not have
        seven data columns, or if season/week in the name are not integers.
    IndexError
        If a file name has fewer than three '_'-separated parts.
    """
    dfs = []
    for file in files:
        df = pd.read_csv(file)

        # get rid of 'Unnamed: 0' column
        if 'Unnamed: 0' in df.columns:
            df = df.drop(columns='Unnamed: 0')

        # set column names
        df.columns = ['team1', 'team2', 'result', 'forecast', 'details', 'wind', 'link']

        # add week and season
        # str.strip('.csv') removes a *set of characters* from both ends, not
        # the suffix, and splitting the full path on '_' picks up underscores
        # in directory names; parse the extension-less base name instead.
        stem = os.path.splitext(os.path.basename(file))[0]
        file_arr = stem.split('_')
        df['week'] = int(file_arr[2])
        df['season'] = int(file_arr[1])

        dfs.append(df)
    return pd.concat(dfs)

# team1, team2 -> teams

In [52]:
def separate_teams(df):
    """Turn one-row-per-game data into one-row-per-team data.

    Every input row is emitted twice: once with ``team1`` and once with
    ``team2`` in a single ``team`` column; all other columns are copied
    unchanged. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns team1, team2, result, forecast, details,
        wind, link, week and season.

    Returns
    -------
    pandas.DataFrame
        Twice as many rows as ``df`` with columns team, result, forecast,
        details, wind, link, week, season, sorted by season and then week.
        Index labels are converted to strings and each label appears at
        least twice (once per team).
    """
    shared = ['result', 'forecast', 'details', 'wind', 'link', 'week', 'season']
    first = df.loc[:, ['team1'] + shared].rename(columns={'team1': 'team'})
    second = df.loc[:, ['team2'] + shared].rename(columns={'team2': 'team'})

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    teams = pd.concat([first, second])
    # Keep the historical string index labels of the result.
    teams = teams.rename(index=str)
    return teams.sort_values(by=['season', 'week'])

# Weather

In [53]:
# Lower-case keywords whose presence in a forecast string counts as precipitation.
PRECIPITATION = ['rain', 'snow', 'drizzle', 'flurries']

# Integer at the very start of a forecast string, optionally negative.
_TEMPERATURE_RE = re.compile(r"-?\d+")


def extract_temp(forecast):
    """Pull the leading temperature out of a forecast string.

    Parameters
    ----------
    forecast : str
        Forecast text. The literal 'DOME' marks an indoor game; otherwise
        the temperature is expected as an integer at the start of the
        string (presumably something like '82f Clear' - format not shown
        in this notebook).

    Returns
    -------
    str
        'DOME' for dome games, the leading (possibly negative) integer as a
        string when present, and '?' when no temperature can be read -
        including non-string input such as NaN from an empty CSV cell.
    """
    if forecast == 'DOME':
        return forecast
    # Empty CSV cells arrive as float NaN; treat them as "unknown".
    if not isinstance(forecast, str):
        return '?'
    found = _TEMPERATURE_RE.match(forecast)
    return found.group() if found else '?'
    
def extract_precipitation(forecast):
    """Flag whether a forecast mentions precipitation.

    Returns 0 for the literal 'DOME'. Otherwise the forecast is lower-cased
    and the result is 1 if any keyword from PRECIPITATION occurs in it as a
    substring, else 0.
    """
    if forecast == 'DOME':
        return 0
    lowered = forecast.lower()
    return 1 if any(keyword in lowered for keyword in PRECIPITATION) else 0

# Run of digits at the very start of a wind description.
_WIND_SPEED_RE = re.compile(r"\d+")


def extract_wind(wind):
    """Pull the leading wind speed out of a wind description.

    Parameters
    ----------
    wind : str
        Wind text with the speed as leading digits (presumably something
        like '12m NW' - format not shown in this notebook).

    Returns
    -------
    str or float
        The complete leading number as a string (all digits, not just the
        first one), or float NaN when the text does not start with a digit
        or is not a string at all (e.g. NaN from an empty CSV cell).
    """
    if not isinstance(wind, str):
        return float('nan')
    found = _WIND_SPEED_RE.match(wind)
    return found.group() if found else float('nan')
    
def prepare_weather(df):
    """Add the weather feature columns to ``df`` in place and return it.

    Reads the 'forecast' and 'wind' columns and writes:
      * 'temperature'   - extract_temp applied to each forecast
      * 'precipitation' - extract_precipitation applied to each forecast
      * 'wind'          - extract_wind applied to each wind value, replaced
                          by 0 on rows whose forecast is 'DOME'
    """
    forecasts = df['forecast']

    df['temperature'] = forecasts.apply(extract_temp)
    df['precipitation'] = forecasts.apply(extract_precipitation)

    # Wind magnitude: parsed for every row, then zeroed out for dome games.
    not_dome = forecasts != 'DOME'
    wind_speed = df['wind'].apply(extract_wind)
    df['wind'] = np.where(not_dome, wind_speed, 0)

    return df


# ... now run everything

In [64]:
# Load every weekly games file into one frame; fail fast if nothing was read.
games_df = games_df_from_files(GAMES_FILES)
assert len(games_df) > 0

# One row per game -> one row per team: the row count must exactly double and
# team1/team2 must be replaced by a single 'team' column.
before_len = len(games_df)
games_df = separate_teams(games_df)
assert 'team' in games_df.columns
assert 'team1' not in games_df.columns
assert 'team2' not in games_df.columns
assert len(games_df) == 2*before_len

# Derive the weather features and check that each expected column exists.
cols = ['precipitation', 'wind', 'temperature']
games_df = prepare_weather(games_df)
for c in cols:
    assert c in games_df.columns, c

# Drop the raw columns that are not kept in the output.
# Note: `cols` is reused here for the columns to drop.
cols = ['forecast', 'result', 'details', 'link']
games_df.drop(columns=cols, inplace=True)
for c in cols:
    assert c not in games_df.columns, c

# Preview the prepared frame.
games_df.head()

Unnamed: 0,team,wind,week,season,temperature,precipitation
0,Panthers,2,1,2016,82,0
1,Buccaneers,0,1,2016,DOME,0
2,Vikings,3,1,2016,77,0
3,Browns,9,1,2016,83,0
4,Bengals,9,1,2016,80,0


In [1]:
# Build the output file name from the seasons present in games_df:
# "games_<first>" for one season, "games_<first>-<last>" for several.
# NOTE: needs games_df from the "run everything" cell - run the cells in order.
min_s, max_s = str(games_df.season.min()), str(games_df.season.max())

name = "games_" + min_s + ("-" + max_s if max_s != min_s else "")

#print("Writing file to ", OUTPUT_PATH.format(name))
#games_df.to_csv(OUTPUT_PATH.format(name))
#print("Done!")

NameError: name 'games_df' is not defined