# Quiz On Regression

The aim of this exercise is to predict the number of games that a Major League Baseball team won in a given season, based on the team's statistics and other variables from that season.

## Part A: Data Exploration

In [None]:
# import libraries
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn import linear_model, metrics
from sklearn.grid_search import GridSearchCV
from matplotlib import pyplot as plt
import matplotlib

In [None]:
# import "Teams.csv" as teams_df and show the first 5 rows
# The path is relative, so it is resolved against the kernel's current
# working directory.
teams_df = pd.read_csv('../datasets/Teams.csv')
print(teams_df.head())

In [None]:
# Describe data: summary statistics of the raw table. Being the last
# expression in the cell, the result is rendered as the cell output.
teams_df.describe()

### What does this data mean?

Each column contains data related to a specific team and year. Some of the more important variables are listed below.
* yearID - Year
* teamID - Team
* franchID - Franchise (links to TeamsFranchise table)
* G - Games played
* W - Wins
* LgWin - League Champion(Y or N)
* WSWin - World Series Winner (Y or N)
* R - Runs scored
* AB - At bats
* H - Hits by batters
* HR - Homeruns by batters
* BB - Walks by batters
* SO - Strikeouts by batters
* SB - Stolen bases
* CS - Caught stealing
* HBP - Batters hit by pitch
* SF - Sacrifice flies
* RA - Opponents runs scored
* ER - Earned runs allowed
* ERA - Earned run average
* CG - Complete games
* SHO - Shutouts
* SV - Saves
* IPouts - Outs Pitched (innings pitched x 3)
* HA - Hits allowed
* HRA - Homeruns allowed
* BBA - Walks allowed
* SOA - Strikeouts by pitchers
* E - Errors
* DP - Double Plays
* FP - Fielding percentage
* name - Team’s full name


In [None]:
# Give the raw table explicit column names.
# NOTE(review): this assignment requires the CSV to have exactly as many
# columns as there are names below (52), in this order -- confirm against the
# file. 'franchID' appears twice in the list, so the frame ends up with two
# columns of that name.
cols = [
    'yearID', 'lgID', 'teamID', 'franchID', 'divID', 'Rank', 'G', 'Ghome',
    'W', 'L', 'DivWin', 'WCWin', 'LgWin', 'WSWin',
    'R', 'AB', 'H', '2B', '3B', 'HR', 'BB', 'SO', 'SB', 'CS', 'HBP', 'SF',
    'RA', 'ER', 'ERA', 'CG', 'SHO', 'SV', 'IPouts', 'HA', 'HRA', 'BBA', 'SOA',
    'E', 'DP', 'FP',
    'name', 'park', 'attendance', 'BPF', 'PPF',
    'teamIDBR', 'teamIDlahman45', 'teamIDretro',
    'franchID', 'franchName', 'active', 'NAassoc',
]
teams_df.columns = cols

# Preview the renamed table
print(teams_df.head())

# Number of rows in `teams_df`
print(len(teams_df.index))

In [None]:
# Drop the columns that are not needed; the result is bound to a new frame
# `df`, and `teams_df` itself is left untouched.
drop_cols = [
    'lgID', 'franchID', 'divID', 'Rank', 'Ghome', 'L',
    'DivWin', 'WCWin', 'LgWin', 'WSWin', 'SF',
    'name', 'park', 'attendance', 'BPF', 'PPF',
    'teamIDBR', 'teamIDlahman45', 'teamIDretro',
    'franchID', 'franchName', 'active', 'NAassoc',
]
df = teams_df.drop(labels=drop_cols, axis=1)

# Preview the reduced frame
print(df.head())

In [None]:
# Print out the number of null values of all columns of `df`
# (one count per column, in column order, as a plain Python list)
print(df.isnull().sum(axis=0).tolist())

In [None]:
# Remove the two columns with more than 100 null values ("CS" and "HBP"),
# then replace the remaining nulls in "SO" and "DP" with each column's median.
df = (
    df.drop(['CS', 'HBP'], axis=1)
      .assign(
          SO=lambda frame: frame['SO'].fillna(frame['SO'].median()),
          DP=lambda frame: frame['DP'].fillna(frame['DP'].median()),
      )
)

# Null counts per column after cleaning
print(df.isnull().sum().tolist())

In [None]:
# matplotlib plots inline  
%matplotlib inline

# Plotting Histogram of of "W"
plt.hist(df['W'])
plt.xlabel('Wins')
plt.title('Distribution of Wins')

plt.show()

In [None]:
# Print mean of "W" (average wins per row of `df`)
print(df['W'].mean())

In [None]:
# Creating bins for the win column
def assign_win_bins(W):
    """Map a win total to an ordinal bin label.

    Bins (lower bound inclusive, upper bound exclusive):
        W < 50          -> 1
        50 <= W < 70    -> 2
        70 <= W < 90    -> 3
        90 <= W < 110   -> 4
        W >= 110        -> 5

    For integer win totals this matches the ranges 0-49, 50-69, 70-89,
    90-109 and 110+. The half-open bounds mean a non-integer value such as
    69.5 lands in a bin instead of falling between two ranges.

    Parameters
    ----------
    W : int or float
        Number of wins.

    Returns
    -------
    int or None
        Bin label 1-5, or None when W is NaN.
    """
    # NaN is the only value that is not equal to itself; it belongs to no bin.
    if W != W:
        return None

    # Exclusive upper bound of bins 1..4; anything at or above the last is bin 5.
    upper_bounds = (50, 70, 90, 110)
    for bin_label, upper in enumerate(upper_bounds, start=1):
        if W < upper:
            return bin_label
    return len(upper_bounds) + 1
    
# Label every row with its win bin by applying `assign_win_bins` to "W"
# element-wise; the result is stored in a new 'win_bins' column.
df['win_bins'] = df['W'].apply(assign_win_bins)

In [None]:
# Filter for rows where 'yearID' is greater than 1900.
# `.copy()` makes the filtered frame independent of the original, so the
# column assignments in later cells do not raise SettingWithCopyWarning.
df = df[df['yearID'] > 1900].copy()

In [None]:
# Creating "year_label" column, which will give your algorithm information about how certain years are related
# (Dead ball eras, Live ball/Steroid Eras)

def assign_label(year):
    """Map a season year to an era label (1-8).

    Eras (first year inclusive, next era's first year exclusive):
        before 1920 -> 1
        1920-1941   -> 2
        1942-1945   -> 3
        1946-1962   -> 4
        1963-1976   -> 5
        1977-1992   -> 6
        1993-2009   -> 7
        2010 on     -> 8

    The boundaries are half-open, so a non-integer year (for example a float
    column value such as 1941.5) still gets a label instead of falling
    between two ranges.

    Parameters
    ----------
    year : int or float
        Season year.

    Returns
    -------
    int or None
        Era label 1-8, or None when `year` is NaN.
    """
    # NaN is the only value that is not equal to itself; it belongs to no era.
    if year != year:
        return None

    # First year of eras 2..8; era k ends right before era_starts[k - 1].
    era_starts = (1920, 1942, 1946, 1963, 1977, 1993, 2010)
    for era, next_start in enumerate(era_starts, start=1):
        if year < next_start:
            return era
    return len(era_starts) + 1
        
# Add `year_label` column to `df` by applying `assign_label` to each 'yearID'
df['year_label'] = df['yearID'].apply(assign_label)

# One indicator column per era label, named 'era_<label>'
dummy_df = pd.get_dummies(df['year_label'], prefix='era')

# Concatenate `df` and `dummy_df` column-wise (rows are aligned on the index)
df = pd.concat([df, dummy_df], axis=1)

print(df.head())

In [None]:
# Convert years into decade bins and creating dummy variables
def assign_decade(year):
    """Map a season year to the first year of its decade, clamped to 1910-2010.

    Every year before 1920 is labelled 1910 and every year from 2010 on is
    labelled 2010; anything in between is rounded down to its decade
    (1920, 1930, ..., 2000). Non-integer years are floored to their decade as
    well, so they no longer fall between two ranges.

    Parameters
    ----------
    year : int or float
        Season year.

    Returns
    -------
    int or None
        Decade label, or None when `year` is NaN.
    """
    # NaN is the only value that is not equal to itself; it has no decade.
    if year != year:
        return None
    if year < 1920:
        return 1910
    if year >= 2010:
        return 2010
    # Floor to the decade; int() keeps the label a plain Python int.
    return int(year // 10) * 10
    
# Label each row with its decade, then expand the label into one indicator
# column per decade (named 'decade_<label>') and append them to `df`.
df['decade_label'] = df['yearID'].apply(assign_decade)
decade_df = pd.get_dummies(df['decade_label'], prefix='decade')
df = pd.concat([df, decade_df], axis=1)

# Drop unnecessary columns: the raw year and the two intermediate label
# columns, whose information is now carried by the indicator columns.
df = df.drop(['yearID','year_label','decade_label'], axis=1)

In [None]:
# Per-game rates: runs scored and runs allowed, each divided by games played
df['R_per_game'] = df['R'].div(df['G'])
df['RA_per_game'] = df['RA'].div(df['G'])

In [None]:
# Create scatter plots for runs per game vs. wins and runs allowed per game vs. wins
# (two side-by-side panels on one figure)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

ax1.scatter(df['R_per_game'], df['W'], c='blue')
ax1.set_title('Runs per Game vs. Wins')
ax1.set_xlabel('Runs per Game')
ax1.set_ylabel('Wins')

ax2.scatter(df['RA_per_game'], df['W'], c='red')
ax2.set_title('Runs Allowed per Game vs. Wins')
ax2.set_xlabel('Runs Allowed per Game')
# The panels do not share a y-axis, so the right-hand one needs its own label.
ax2.set_ylabel('Wins')

plt.show()

In [None]:
# Correlation of every feature with wins ("W").
# `df` still contains the string column 'teamID'; recent pandas versions raise
# when corr() meets a non-numeric column, so restrict the frame to numeric and
# boolean columns first (older versions dropped the other columns implicitly).
df.select_dtypes(include=['number', 'bool']).corr()['W']

In [None]:
# (rows, columns) of the engineered frame
df.shape

## Part B: Modeling

### Create and compare different models to predict "W" as a function of the other inputs.

In [None]:
# Create new DataFrame using only variables to be included in models:
# the raw team statistics, the era and decade indicator columns, and the two
# per-game rates. The target "W" is kept out of the feature list.
numeric_cols = ['G','R','AB','H','2B','3B','HR','BB','SO','SB','RA','ER','ERA','CG','SHO','SV','IPouts','HA','HRA','BBA','SOA','E','DP','FP','era_1','era_2','era_3','era_4','era_5','era_6','era_7','era_8','decade_1910','decade_1920','decade_1930','decade_1940','decade_1950','decade_1960','decade_1970','decade_1980','decade_1990','decade_2000','decade_2010','R_per_game','RA_per_game']
X = df[numeric_cols]
y = df['W']

# Create X_train, X_test, y_train, y_test (25% testing and 75% training/validation)

# Print shapes of X_train, X_test, y_train, y_test


In [None]:
# Create 3 models: OLS, Ridge (default hyperparameters), and Lasso (default hyperparameters) 
# where you train the model on Training Data and test it on Testing Data.
# Print out the MSE and R2 values of each model.



#### Which model was the best?
Answer:

In [None]:
# What are the properties (hyperparameters) of trained Lasso Regression


In [None]:
# What are the properties (hyperparameters) of trained Ridge Regression


In [None]:
# Create Grid Search to find the best "alpha" and "normalize" hyperparameters
# for Lasso Regression. Print out the best estimators and best scores
# Use Grid Search with CV = 5


In [None]:
# Create Grid Search to find the best "alpha" and "normalize" hyperparameters
# for Ridge Regression. Print out the best estimators and best scores. 
# Use Grid Search with CV = 5


In [None]:
# Which model was the best out of OLS, Lasso, and Ridge? What was its R2 and MSE values?


In [None]:
# Plot a scatterplot of "Real Values" vs "Predicted Values" of the best performing model
