This file includes steps I performed to investigate if there is something outstanding in the sample we have.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_1samp
pd.set_option('max_columns', 100)

In [2]:
wnba = pd.read_csv('..\\data\\wnba_clean.csv')
wnba.drop('Unnamed: 0', axis=1, inplace=True)
wnba.head()

Unnamed: 0,Name,Team,Pos,Height,Weight,BMI,Birth_Place,Birthdate,Age,College,Experience,Games Played,MIN,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TO,PTS,DD2,TD3
0,Aerial Powers,DAL,F,183,71,21.200991,US,"January 17, 1994",23,Michigan State,2,8,173,30,85,35.3,12,32,37.5,21,26,80.8,6,22,28,12,3,6,12,93,0,0
1,Alana Beard,LA,G/F,185,73,21.329438,US,"May 14, 1982",35,Duke,12,30,947,90,177,50.8,5,18,27.8,32,41,78.0,19,82,101,72,63,13,40,217,0,0
2,Alex Bentley,CON,G,170,69,23.875433,US,"October 27, 1990",26,Penn State,4,26,617,82,218,37.6,19,64,29.7,35,42,83.3,4,36,40,78,22,3,24,218,0,0
3,Alex Montgomery,SAN,G/F,185,84,24.543462,US,"December 11, 1988",28,Georgia Tech,6,31,721,75,195,38.5,21,68,30.9,17,21,81.0,35,134,169,65,20,10,38,188,2,0
4,Alexis Jones,MIN,G,175,78,25.469388,US,"August 5, 1994",23,Baylor,R,24,137,16,50,32.0,7,20,35.0,11,12,91.7,3,9,12,12,7,0,14,50,0,0


In [3]:
# Let's replare R with 0 to be able to range experience easily
wnba['Experience'] = np.where(wnba['Experience'] == 'R', 0, wnba['Experience'])
wnba['Experience'].unique()

array(['2', '12', '4', '6', 0, '8', '5', '3', '1', '9', '10', '11', '7',
       '13', '14', '15'], dtype=object)

In [4]:
# Player with maximum points
wnba.loc[wnba['PTS'] == wnba['PTS'].max()]

Unnamed: 0,Name,Team,Pos,Height,Weight,BMI,Birth_Place,Birthdate,Age,College,Experience,Games Played,MIN,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TO,PTS,DD2,TD3
19,Breanna Stewart,SEA,F/C,193,77,20.671696,US,"August 27, 1994",22,Connecticut,2,29,952,201,417,48.2,46,123,37.4,136,171,79.5,43,206,249,78,29,47,68,584,8,0


In [5]:
# Top 5 players by points overall the sample
wnba.sort_values(['PTS'], ascending=False).head()

Unnamed: 0,Name,Team,Pos,Height,Weight,BMI,Birth_Place,Birthdate,Age,College,Experience,Games Played,MIN,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TO,PTS,DD2,TD3
19,Breanna Stewart,SEA,F/C,193,77,20.671696,US,"August 27, 1994",22,Connecticut,2,29,952,201,417,48.2,46,123,37.4,136,171,79.5,43,206,249,78,29,47,68,584,8,0
140,Tina Charles,NY,F/C,193,84,22.550941,US,"May 12, 1988",29,Connecticut,8,29,952,227,509,44.6,18,56,32.1,110,135,81.5,56,212,268,75,21,22,71,582,11,0
102,Nneka Ogwumike,LA,F,188,79,22.351743,US,"February 7, 1990",27,Stanford,6,30,948,215,386,55.7,18,49,36.7,129,148,87.2,57,179,236,63,53,14,47,577,9,0
130,Sylvia Fowles,MIN,C,198,96,24.487297,US,"June 10, 1985",32,LSU,10,29,895,222,336,66.1,0,0,0.0,128,162,79.0,113,184,297,39,39,61,71,572,16,0
123,Skylar Diggins-Smith,DAL,G,175,66,21.55102,US,"February 8, 1990",27,Notre Dame,4,30,1018,167,394,42.4,43,119,36.1,168,186,90.3,21,86,107,173,38,24,83,545,1,0


In [6]:
# Top college by number of players in WNBA
wnba['College'].groupby(wnba['College']).count().sort_values(ascending=False)

College
Connecticut      14
Maryland          8
Duke              7
Rutgers           7
Notre Dame        6
                 ..
Kansas State      1
Japan             1
James Madison     1
Penn State        1
Xavier            1
Name: College, Length: 65, dtype: int64

In [7]:
# Top 5 players by points from Connecticut college
wnba.loc[wnba['College'] == 'Connecticut'].sort_values(['PTS'], ascending=False).head()

Unnamed: 0,Name,Team,Pos,Height,Weight,BMI,Birth_Place,Birthdate,Age,College,Experience,Games Played,MIN,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TO,PTS,DD2,TD3
19,Breanna Stewart,SEA,F/C,193,77,20.671696,US,"August 27, 1994",22,Connecticut,2,29,952,201,417,48.2,46,123,37.4,136,171,79.5,43,206,249,78,29,47,68,584,8,0
140,Tina Charles,NY,F/C,193,84,22.550941,US,"May 12, 1988",29,Connecticut,8,29,952,227,509,44.6,18,56,32.1,110,135,81.5,56,212,268,75,21,22,71,582,11,0
93,Maya Moore,MIN,F,183,80,23.888441,US,"November 6, 1989",27,Connecticut,7,29,904,170,398,42.7,52,132,39.4,98,114,86.0,50,106,156,99,53,13,56,490,3,0
137,Tiffany Hayes,ATL,G,178,70,22.09317,US,"September 20, 1989",27,Connecticut,6,29,861,144,331,43.5,43,112,38.4,136,161,84.5,28,89,117,69,37,8,50,467,0,0
124,Stefanie Dolson,CHI,C,196,97,25.249896,US,"August 1, 1992",25,Connecticut,3,28,823,162,293,55.3,24,60,40.0,50,58,86.2,35,121,156,65,14,37,65,398,3,0


It looks like Connecticut college contribute the the most into WNBA in terms of number of players.
Let's formulate the theory we want to test:
    * Basketball player will have higher chances to get into WNBA if she go to Connecticute college.
In this case our null hypothesis will be:
    * Connecticut college has the same amount of players in WNBA as other colleges.

In [8]:
# Array we're going to test is number of players in WNBA per college.
arr = wnba['College'].groupby(wnba['College']).count().values

# Mean of our hypothesis is just a number of players from Connecticut.
m = len(wnba.loc[wnba['College'] == 'Connecticut'])
ttest_1samp(arr, m)

# P-value is almost a zero which means we should reject our null hypothesis, and statistics values says that the real mean
# of our array is much less than the mean of Connecticut college.

Ttest_1sampResult(statistic=-43.29767794438227, pvalue=3.996850217172258e-49)