In [2]:
import pandas as pd

# Title 

## Introduction

## Goal

## Steps
1. Data Cleaning / EDA
2. Modeling / Model Selection
3. Interpretation of Results 

## Results / Conclusions

# Code

## Data Cleaning / EDA

In [55]:
# Importing packages 
import pandas as pd 
import seaborn as sns
import numpy as np

In [20]:
# Load the three source CSV files into DataFrames (same order as the names on the left).
player_data, players, season_stats = (
    pd.read_csv(csv_name)
    for csv_name in ("player_data.csv", "Players.csv", "Seasons_Stats.csv")
)

Here, we have three different datasets we can analyze and work with. Let's take a look at the player_data dataset first and explore it. 

In [69]:
# First look at player_data: show its first five rows.
player_data.head(n=5)

Unnamed: 0,name,year_start,year_end,position,height,weight,birth_date,college,career_length
0,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke University,4
1,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,"April 7, 1946",Iowa State University,9
2,Kareem Abdul-Jabbar,1970,1989,C,7-2,225.0,"April 16, 1947","University of California, Los Angeles",19
3,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",Louisiana State University,10
4,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974",San Jose State University,5


In [68]:
# Count the missing values in every column.
player_data.isnull().sum()

name               0
year_start         0
year_end           0
position           1
height             1
weight             6
birth_date        31
college          302
career_length      0
dtype: int64

We will handle the missing data in a couple of ways. For position, height, and weight, imputation should be straightforward because we can use the existing data to predict the missing values. For example, if height is missing but position exists, we can impute the average height for that position. For birth_date, we can compute the average age at which players start in the NBA and use it to estimate the birth year. Lastly, college will be more difficult; however, I think we can use probabilistic imputation and draw values from the observed distribution of colleges.

In [74]:
# Show the rows where no position is recorded.
player_data.loc[player_data.position.isnull()]

Unnamed: 0,name,year_start,year_end,position,height,weight,birth_date,college,career_length
2142,George Karl,1974,1978,,,,"May 12, 1952",University of North Carolina,4


Unfortunately, it turns out that this player is missing three values (position, height, and weight). In this case, I will just remove the row. Thankfully, it is just one player.

In [75]:
# Drop the row(s) with no position recorded. Selecting by condition rather than
# by a hard-coded positional index keeps this cell idempotent: re-running it
# removes nothing further, whereas a fixed position would point at a different
# player once the first row had been removed.
player_data = player_data.dropna(subset=['position'])

In [76]:
# Show the rows where no weight is recorded.
player_data.loc[player_data.weight.isnull()]

Unnamed: 0,name,year_start,year_end,position,height,weight,birth_date,college,career_length
2360,Dick Lee,1968,1968,F,6-6,,,University of Washington,0
2783,Murray Mitchell,1950,1950,C,6-6,,"March 19, 1923",Sam Houston State University,0
2973,Paul Nolen,1954,1954,C,6-10,,"September 3, 1929",Texas Tech University,0
4279,Ray Wertis,1947,1948,G,5-11,,"January 1, 1922",St. John's University,1
4472,Bob Wood,1950,1950,G,5-10,,"October 7, 1921",Northern Illinois University,0


In [119]:
# Group-wise mean imputation: a missing weight is replaced by the mean weight
# of the players who share that row's position.
player_data['weight'] = player_data['weight'].fillna(
    player_data.groupby('position')['weight'].transform('mean')
)

In [142]:
# Preview the rows where no college is recorded.
player_data.loc[player_data.college.isnull()].head()

Unnamed: 0,name,year_start,year_end,position,height,weight,birth_date,college,career_length
9,Alex Abrines,2017,2018,G-F,6-6,190.0,"August 1, 1993",,1
32,Alexis Ajinca,2009,2017,C,7-2,248.0,"May 6, 1988",,8
38,Furkan Aldemir,2015,2015,F-C,6-10,240.0,"August 9, 1991",,0
74,David Andersen,2010,2011,C,6-11,245.0,"June 23, 1980",,1
100,Martynas Andriuskevicius,2006,2006,C,7-2,240.0,"March 12, 1986",,0


In [147]:
# Probabilistic imputation: fill each missing college with a value drawn at
# random (with replacement) from the observed colleges, so the imputed values
# follow the empirical college distribution.
num_null = player_data.college.isnull().sum()  # number of values to impute
# Fixed random_state so that re-running the notebook reproduces the same draws.
fill_values = player_data.college.dropna().sample(num_null, replace=True, random_state=42)
# The sampled values still carry the index labels of the rows they were drawn
# from; relabel them with the labels of the rows that are missing a college so
# that each draw lines up with exactly one row to fill.
fill_values.index = player_data.index[player_data.college.isnull()]
player_data = player_data.fillna({'college': fill_values.to_dict()})  # fill only the college column

A couple of questions come to mind after seeing our dataset. 
1. What is the average career length a player has? Does it change over year_start?
2. Which colleges do most players come from? 
3. Any change in height and weight as time progresses? 
4. Is there a common birth month for each position? 

In [24]:
# Career length = year_end minus year_start; then list the five most frequent lengths.
player_data['career_length'] = player_data['year_end'].sub(player_data['year_start'])
player_data['career_length'].value_counts().head()

0    1317
1     566
2     394
3     314
4     245
Name: career_length, dtype: int64

What stands out to me is the high frequency of 0-year careers. You would think that most careers would last at least a couple of years. I want to see what type of players have such short basketball careers.

In [26]:
# Preview the players whose career_length is 0.
player_data.loc[player_data['career_length'].eq(0)].head()

Unnamed: 0,name,year_start,year_end,position,height,weight,birth_date,college,career_length
7,Forest Able,1957,1957,G,6-3,180.0,"July 27, 1932",Western Kentucky University,0
11,Don Ackerman,1954,1954,G,6-0,183.0,"September 4, 1930",Long Island University,0
13,Bud Acton,1968,1968,F,6-6,210.0,"January 11, 1942",Hillsdale College,0
23,Bam Adebayo,2018,2018,C-F,6-10,243.0,"July 18, 1997",University of Kentucky,0
31,Matthew Aitch,1968,1968,F,6-7,230.0,"September 21, 1944",Michigan State University,0


I want to see whether their age when they started in the NBA was a factor, and whether the number of players with 0-year careers decreases over the years.

In [27]:
# Work on a deep copy so that player_data itself is not modified.
copy = player_data.copy(deep=True)
# Keep only the players with a 0-year career. The explicit .copy() makes
# zero_years an independent frame, so adding or changing columns on it later
# does not raise SettingWithCopyWarning.
zero_years = copy[copy.career_length == 0].copy()

In [54]:
# Before extracting a birth year to compare with year_start, count how many
# zero-year players have no birth_date recorded.
zero_years['birth_date'].isnull().sum()

30

In [66]:
# Split each birth_date string on the comma. Rows with a null birth_date are
# not removed here: .str.split leaves them as NaN, so they are simply carried
# through for now.
birth_year = zero_years['birth_date'].str.split(pat=",")

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [134]:
players

Unnamed: 0.1,Unnamed: 0,Player,height,weight,collage,born,birth_city,birth_state
0,0,Curly Armstrong,180.0,77.0,Indiana University,1918.0,,
1,1,Cliff Barker,188.0,83.0,University of Kentucky,1921.0,Yorktown,Indiana
2,2,Leo Barnhorst,193.0,86.0,University of Notre Dame,1924.0,,
3,3,Ed Bartels,196.0,88.0,North Carolina State University,1925.0,,
4,4,Ralph Beard,178.0,79.0,University of Kentucky,1927.0,Hardinsburg,Kentucky
...,...,...,...,...,...,...,...,...
3917,3917,Troy Williams,198.0,97.0,South Carolina State University,1969.0,Columbia,South Carolina
3918,3918,Kyle Wiltjer,208.0,108.0,Gonzaga University,1992.0,Portland,Oregon
3919,3919,Stephen Zimmerman,213.0,108.0,"University of Nevada, Las Vegas",1996.0,Hendersonville,Tennessee
3920,3920,Paul Zipser,203.0,97.0,,1994.0,Heidelberg,Germany
