## [NYCDSA Capstone Project] 
# Women's Softball League Power Ranking Estimate

<br>
Koeun Lim (ke.lim.kang@gmail.com)<br>
Kevin Haghi (kevin.haghi@gmail.com)<br>


# Step 2. Clean Data

---
## Project Description



### Project Outline
- Step 1. Web scraping
- Step 2. Clean data
- Step 3. EDA
- Step 4. Modeling

In [31]:
import os, sys
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from matplotlib import pyplot as plt

%matplotlib inline
plt.rcParams['figure.figsize'] = (10, 5)
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 250)

In [32]:
college_inc = pd.read_csv('Data/InconsistentNames.csv')
college_inc

Unnamed: 0,Team Name,NCAA Name
0,Albany,Albany (NY)
1,App Statete,Appalachian State
2,Army,Army West Point
3,Bakersfield,CSU Bakersfield
4,Boston,Boston U
5,Boston U.,Boston U
6,Cal State Bakersfield,CSU Bakersfield
7,Cal State Fullerton,CSU Fullerton
8,Cal State Northridge,CSUN
9,Central Arkansas,Central Ark.


In [33]:
# Read all RPI rankings
root = "Data/RPI"
df_RPI = pd.DataFrame();
for path, subdirs, files in os.walk(root):
    for name in files:
        if name.endswith(".csv"):
            print(os.path.join(path, name))
            rpi = pd.read_csv(os.path.join(path, name))
            rpi['Year'] = int(os.path.join(path, name)[-8:-4])
            df_RPI = df_RPI.append(rpi,ignore_index = True)
            
print(df_RPI.shape)
df_RPI['College'] = df_RPI['College'].str.replace('St.','State').astype(str).str.rstrip()
df_RPI = df_RPI.rename(columns = {'Ranking':'RPI_Ranking'})

for idx_row, college in enumerate(df_RPI.College):
    if len(college_inc[college_inc['Team Name'] == college]) > 0:
        df_RPI.loc[idx_row,'College'] = college_inc[college_inc['Team Name'] == college]['NCAA Name'].iloc[0]
        
df_RPI.head(5)

Data/RPI/RPI_Rankings_2019.csv
Data/RPI/RPI_Rankings_2018.csv
Data/RPI/RPI_Rankings_2016.csv
Data/RPI/RPI_Rankings_2017.csv
Data/RPI/RPI_Rankings_2015.csv
Data/RPI/RPI_Rankings_2014.csv
Data/RPI/RPI_Rankings_2013.csv
(2062, 9)


  del sys.path[0]


Unnamed: 0,RPI_Ranking,College,Conference,Record,Road,Neutral,Home,Non-Div-I,Year
0,1,Oklahoma,Big 12,57-6-0,20-0-0,19-5-0,18-1-0,0-0-0,2019
1,2,UCLA,Pac-12,56-6-0,14-1-0,19-1-0,23-4-0,0-0-0,2019
2,3,Washington,Pac-12,52-9-0,16-1-0,18-5-0,18-3-0,0-0-0,2019
3,4,Arizona,Pac-12,48-14-0,16-2-0,6-5-0,26-7-0,0-0-0,2019
4,5,Florida State,ACC,55-10-0,14-6-0,9-0-0,32-4-0,0-0-0,2019


In [34]:
# Read all WCWS (Women's College World Series) rankings
root = "Data/WCWSresults"
df_WCWS = pd.DataFrame();
for path, subdirs, files in os.walk(root):
    for name in files:
        if name.endswith(".csv"):
            print(os.path.join(path, name))
            wcws = pd.read_csv(os.path.join(path, name))
            wcws['Year'] = int(os.path.join(path, name)[-16:-12])
            wcws['College'] = wcws['College'].astype(str).str.rstrip()
            df_WCWS = df_WCWS.append(wcws,ignore_index = True)

for idx_row, college in enumerate(df_WCWS.College):
    if len(college_inc[college_inc['Team Name'] == college]) > 0:
        df_WCWS.loc[idx_row,'College'] = college_inc[college_inc['Team Name'] == college]['NCAA Name'].iloc[0]
 
print(df_WCWS.shape)
df_WCWS

Data/WCWSresults/WCWS2019rankings.csv
Data/WCWSresults/WCWS2015rankings.csv
Data/WCWSresults/WCWS2016rankings.csv
Data/WCWSresults/WCWS2013rankings.csv
Data/WCWSresults/WCWS2014rankings.csv
Data/WCWSresults/WCWS2018rankings.csv
Data/WCWSresults/WCWS2017rankings.csv
(448, 3)


Unnamed: 0,Rank,College,Year
0,1,UCLA,2019
1,2,Oklahoma,2019
2,3,Washington,2019
3,3,Alabama,2019
4,5,Oklahoma State,2019
...,...,...,...
443,49,Lehigh,2017
444,49,Ohio State,2017
445,49,Texas Southern,2017
446,49,Albany (NY),2017


In [35]:
# Read all NCAA stats
root = "Data/NCAAstats"
df_NCAA = pd.DataFrame();
for path, subdirs, files in os.walk(root):
    for name in files:
        if name.endswith(".csv"):
            print(os.path.join(path, name))
            ncaa = pd.read_csv(os.path.join(path, name))
            new_col_name = os.path.join(path, name)[15:-9]
            ncaa[new_col_name] = ncaa.iloc[:,-1]
            ncaa['Year'] = int(os.path.join(path, name)[-8:-4])
            df_NCAA = df_NCAA.append(ncaa[['College','Year',new_col_name]],ignore_index = True)

df_NCAA = df_NCAA.groupby(['College','Year']).max()
df_NCAA.reset_index(inplace=True)
df_NCAA['College'] = df_NCAA['College'].str.replace('St.','State').astype(str).str.rstrip()

for idx_row, college in enumerate(df_NCAA.College):
    if len(college_inc[college_inc['Team Name'] == college]) > 0:
        df_NCAA.loc[idx_row,'College'] = college_inc[college_inc['Team Name'] == college]['NCAA Name'].iloc[0]
 
print(df_NCAA.shape)
df_NCAA.head()

Data/NCAAstats/Fielding_Percentage_2019.csv
Data/NCAAstats/Scoring_2014.csv
Data/NCAAstats/Scoring_2015.csv
Data/NCAAstats/Fielding_Percentage_2018.csv
Data/NCAAstats/WL_Percentage_2018.csv
Data/NCAAstats/Scoring_2017.csv
Data/NCAAstats/Scoring_2016.csv
Data/NCAAstats/WL_Percentage_2019.csv
Data/NCAAstats/Scoring_2013.csv
Data/NCAAstats/Hit_Batters_2015.csv
Data/NCAAstats/Hit_Batters_2017.csv
Data/NCAAstats/Earned_Run_Average_2019.csv
Data/NCAAstats/Earned_Run_Average_2018.csv
Data/NCAAstats/Hit_Batters_2016.csv
Data/NCAAstats/Triples_per_Game_2019.csv
Data/NCAAstats/Home_Runs_per_game_2018.csv
Data/NCAAstats/Slugging_Percentage_2013.csv
Data/NCAAstats/Home_Runs_per_game_2019.csv
Data/NCAAstats/Triples_per_Game_2018.csv
Data/NCAAstats/Double_Plays_per_Game_2019.csv
Data/NCAAstats/On_Base_Percentage_2019.csv
Data/NCAAstats/Stolen_Bases_per_Game_2019.csv
Data/NCAAstats/Stolen_Bases_per_Game_2018.csv
Data/NCAAstats/On_Base_Percentage_2018.csv
Data/NCAAstats/Double_Plays_per_Game_2018.csv


  app.launch_new_instance()


(2045, 18)


Unnamed: 0,College,Year,Fielding_Percentage,Scoring,WL_Percentage,Hit_Batters,Earned_Run_Average,Triples_per_Game,Home_Runs_per_game,Slugging_Percentage,Double_Plays_per_Game,On_Base_Percentage,Stolen_Bases_per_Game,Strikeout-to-Walk_Ratio,Batting_Average,Doubles_per_Game,Shutouts,Team_Strikeouts_Per_Seven_Innings
0,A&M-Corpus Christi,2013,0.949,2.68,0.22,,5.0,0.1,0.32,0.306,0.28,,0.42,,0.231,0.8,,
1,A&M-Corpus Christi,2014,0.949,3.27,0.15,,5.68,0.15,0.48,,0.29,,0.73,,0.265,1.1,,
2,A&M-Corpus Christi,2015,0.958,3.47,0.418,,3.44,0.12,0.47,0.364,0.29,,0.8,,0.267,1.0,,
3,A&M-Corpus Christi,2016,0.961,2.23,0.375,,3.45,0.02,0.17,0.276,0.19,,0.48,,0.228,0.65,,
4,A&M-Corpus Christi,2017,0.951,2.02,0.25,,3.86,0.04,0.19,0.256,0.27,,0.65,,0.215,0.4,,


In [36]:
%store df_RPI
%store df_WCWS
%store df_NCAA

Stored 'df_RPI' (DataFrame)
Stored 'df_WCWS' (DataFrame)
Stored 'df_NCAA' (DataFrame)


In [37]:
df_RPI.to_csv('Data/RPI_final.csv',index=False)
df_WCWS.to_csv('Data/WCWS_final.csv',index=False)
df_NCAA.to_csv('Data/NCAA_final.csv',index=False)

In [38]:
df_RPI_NCAA = pd.merge(df_RPI, df_NCAA,  how='outer', on=['College','Year'])
df_RPI_NCAA

Unnamed: 0,RPI_Ranking,College,Conference,Record,Road,Neutral,Home,Non-Div-I,Year,Fielding_Percentage,Scoring,WL_Percentage,Hit_Batters,Earned_Run_Average,Triples_per_Game,Home_Runs_per_game,Slugging_Percentage,Double_Plays_per_Game,On_Base_Percentage,Stolen_Bases_per_Game,Strikeout-to-Walk_Ratio,Batting_Average,Doubles_per_Game,Shutouts,Team_Strikeouts_Per_Seven_Innings
0,1.0,Oklahoma,Big 12,57-6-0,20-0-0,19-5-0,18-1-0,0-0-0,2019,0.984,7.30,0.905,14.0,1.40,0.30,1.83,0.639,0.25,0.435,0.97,6.50,0.344,1.60,28.0,
1,2.0,UCLA,Pac-12,56-6-0,14-1-0,19-1-0,23-4-0,0-0-0,2019,0.974,7.03,0.903,42.0,1.43,0.21,1.26,0.547,0.23,0.416,0.98,5.52,0.339,1.45,20.0,
2,3.0,Washington,Pac-12,52-9-0,16-1-0,18-5-0,18-3-0,0-0-0,2019,0.980,5.25,0.852,24.0,1.47,0.15,0.79,0.440,0.21,0.383,0.80,5.79,0.297,1.20,24.0,
3,4.0,Arizona,Pac-12,48-14-0,16-2-0,6-5-0,26-7-0,0-0-0,2019,0.976,6.48,0.774,22.0,1.61,0.11,1.77,0.570,0.29,0.398,0.23,3.71,0.318,1.18,15.0,
4,5.0,Florida State,ACC,55-10-0,14-6-0,9-0-0,32-4-0,0-0-0,2019,0.973,6.98,0.846,32.0,1.87,0.28,1.62,0.601,0.37,0.421,1.77,3.74,0.330,1.58,22.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2088,,UNCW,,,,,,,2013,0.940,3.62,0.302,,4.21,0.08,0.28,0.320,0.19,,0.85,,0.236,1.17,,
2089,,UNCW,,,,,,,2014,0.955,3.37,0.190,,3.71,0.19,0.38,,0.19,,1.21,,0.240,0.98,,
2090,,UT Arlington,,,,,,,2013,0.953,2.49,0.472,,2.00,0.11,0.23,0.294,0.17,,0.40,,0.226,0.89,,
2091,,UT Arlington,,,,,,,2014,0.950,2.96,0.110,,3.89,0.11,0.56,,0.17,,0.57,,0.233,1.09,,


In [39]:
df = pd.merge(df_RPI_NCAA, df_WCWS,  how='outer', on=['College','Year'])
df = df.rename(columns = {'Rank':'WCWS_Rank'})
df

Unnamed: 0,RPI_Ranking,College,Conference,Record,Road,Neutral,Home,Non-Div-I,Year,Fielding_Percentage,Scoring,WL_Percentage,Hit_Batters,Earned_Run_Average,Triples_per_Game,Home_Runs_per_game,Slugging_Percentage,Double_Plays_per_Game,On_Base_Percentage,Stolen_Bases_per_Game,Strikeout-to-Walk_Ratio,Batting_Average,Doubles_per_Game,Shutouts,Team_Strikeouts_Per_Seven_Innings,WCWS_Rank
0,1.0,Oklahoma,Big 12,57-6-0,20-0-0,19-5-0,18-1-0,0-0-0,2019,0.984,7.30,0.905,14.0,1.40,0.30,1.83,0.639,0.25,0.435,0.97,6.50,0.344,1.60,28.0,,2.0
1,2.0,UCLA,Pac-12,56-6-0,14-1-0,19-1-0,23-4-0,0-0-0,2019,0.974,7.03,0.903,42.0,1.43,0.21,1.26,0.547,0.23,0.416,0.98,5.52,0.339,1.45,20.0,,1.0
2,3.0,Washington,Pac-12,52-9-0,16-1-0,18-5-0,18-3-0,0-0-0,2019,0.980,5.25,0.852,24.0,1.47,0.15,0.79,0.440,0.21,0.383,0.80,5.79,0.297,1.20,24.0,,3.0
3,4.0,Arizona,Pac-12,48-14-0,16-2-0,6-5-0,26-7-0,0-0-0,2019,0.976,6.48,0.774,22.0,1.61,0.11,1.77,0.570,0.29,0.398,0.23,3.71,0.318,1.18,15.0,,5.0
4,5.0,Florida State,ACC,55-10-0,14-6-0,9-0-0,32-4-0,0-0-0,2019,0.973,6.98,0.846,32.0,1.87,0.28,1.62,0.601,0.37,0.421,1.77,3.74,0.330,1.58,22.0,,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2092,,UT Arlington,,,,,,,2014,0.950,2.96,0.110,,3.89,0.11,0.56,,0.17,,0.57,,0.233,1.09,,,
2093,,UT Arlington,,,,,,,2015,0.955,6.15,0.491,,5.10,0.11,0.75,0.464,0.49,0.414,0.38,,0.315,1.47,,,
2094,,Stanford,,,,,,,2019,,,,,,,,,,,,,,,,,33.0
2095,,Stanford,,,,,,,2013,,,,,,,,,,,,,,,,,17.0


In [40]:
%store df
df.to_csv('Data/ALL_final.csv',index=False)

Stored 'df' (DataFrame)


### Check missing values

In [41]:
# How many missing values per col?
missingCols = df.columns[df.isnull().any()]
print('The columns with missingness are: %s' % (list(missingCols)))

missingNs = [0] * len(list(missingCols))
for idx, col in enumerate(list(missingCols)):
    missingNs[idx] = df[col].isnull().sum()

data_missing = pd.DataFrame(list(missingCols))
data_missing.columns = ['ColumnName']
data_missing['MissingValues'] = missingNs
data_missing['PercMissing'] = np.round(np.array(missingNs)/df.shape[0]*100,2)
data_missing.sort_values(['PercMissing'],ascending=False)

The columns with missingness are: ['RPI_Ranking', 'Conference', 'Record', 'Road', 'Neutral', 'Home', 'Non-Div-I', 'Fielding_Percentage', 'Scoring', 'WL_Percentage', 'Hit_Batters', 'Earned_Run_Average', 'Triples_per_Game', 'Home_Runs_per_game', 'Slugging_Percentage', 'Double_Plays_per_Game', 'On_Base_Percentage', 'Stolen_Bases_per_Game', 'Strikeout-to-Walk_Ratio', 'Batting_Average', 'Doubles_per_Game', 'Shutouts', 'Team_Strikeouts_Per_Seven_Innings', 'WCWS_Rank']


Unnamed: 0,ColumnName,MissingValues,PercMissing
22,Team_Strikeouts_Per_Seven_Innings,2047,97.62
21,Shutouts,1814,86.5
18,Strikeout-to-Walk_Ratio,1802,85.93
10,Hit_Batters,1679,80.07
16,On_Base_Percentage,1652,78.78
23,WCWS_Rank,1649,78.64
14,Slugging_Percentage,296,14.12
12,Triples_per_Game,72,3.43
9,WL_Percentage,61,2.91
8,Scoring,57,2.72


In [42]:
print(len(df[df.RPI_Ranking.isnull()]))
df[df.RPI_Ranking.isnull()]

28


Unnamed: 0,RPI_Ranking,College,Conference,Record,Road,Neutral,Home,Non-Div-I,Year,Fielding_Percentage,Scoring,WL_Percentage,Hit_Batters,Earned_Run_Average,Triples_per_Game,Home_Runs_per_game,Slugging_Percentage,Double_Plays_per_Game,On_Base_Percentage,Stolen_Bases_per_Game,Strikeout-to-Walk_Ratio,Batting_Average,Doubles_per_Game,Shutouts,Team_Strikeouts_Per_Seven_Innings,WCWS_Rank
2069,,Louisiana,,,,,,,2013,0.972,6.61,0.758,,2.39,0.11,1.42,0.528,0.16,,0.6,,0.318,1.1,,,
2070,,Louisiana,,,,,,,2014,0.959,6.37,0.13,,2.75,0.13,1.5,0.514,0.33,0.405,0.75,,0.302,1.17,,6.44,
2071,,Louisiana,,,,,,,2015,0.972,8.15,0.778,,2.38,0.19,2.15,0.652,0.3,0.452,1.19,,0.346,1.26,,,
2072,,Louisiana,,,,,,,2016,0.969,7.89,0.836,,2.06,0.13,1.96,0.622,0.16,0.446,0.95,,0.338,1.25,,,
2073,,NC State,,,,,,,2013,0.971,5.14,0.634,,2.35,0.04,1.57,0.492,0.3,,0.32,,0.266,1.25,,,17.0
2074,,NC State,,,,,,,2014,0.973,4.59,0.04,,3.03,0.04,1.24,0.468,0.35,,0.72,,0.275,1.06,,6.62,17.0
2075,,NC State,,,,,,,2015,0.963,5.33,0.633,,2.61,0.1,1.25,0.457,0.38,,0.62,,0.266,1.08,,,9.0
2076,,NC State,,,,,,,2016,0.957,5.02,0.411,,5.06,0.07,1.29,0.457,0.41,,0.77,,0.27,0.89,,,
2077,,NC State,,,,,,,2017,0.953,4.09,0.321,,5.93,0.14,0.7,0.395,0.36,,0.68,,0.27,0.91,,,
2078,,Purdue Fort Wayne,,,,,,,2013,0.966,4.54,0.68,,2.39,0.14,0.56,0.409,0.16,,1.54,,0.285,1.4,,,


In [43]:
df_ext = df.copy()
df_ext['WCWS_in'] = 1
df_ext['WCWS_in'][df_ext['WCWS_Rank'].isnull()] = 0
df_ext[df_ext.WCWS_in == 0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,RPI_Ranking,College,Conference,Record,Road,Neutral,Home,Non-Div-I,Year,Fielding_Percentage,Scoring,WL_Percentage,Hit_Batters,Earned_Run_Average,Triples_per_Game,Home_Runs_per_game,Slugging_Percentage,Double_Plays_per_Game,On_Base_Percentage,Stolen_Bases_per_Game,Strikeout-to-Walk_Ratio,Batting_Average,Doubles_per_Game,Shutouts,Team_Strikeouts_Per_Seven_Innings,WCWS_Rank,WCWS_in
30,31.0,Statenford,Pac-12,33-20-0,6-6-0,9-3-0,18-11-0,0-0-0,2019,0.963,4.66,0.623,33.0,3.29,0.47,0.55,0.425,0.36,0.367,0.77,1.82,0.276,1.19,6.0,,,0
41,42.0,Oregon State,Pac-12,26-19-0,12-8-0,12-3-0,2-8-0,0-0-0,2019,0.943,4.07,0.578,20.0,3.53,0.11,0.60,0.381,0.33,0.333,1.16,2.31,0.258,1.18,9.0,,,0
42,43.0,Southern Ill.,MVC,34-15-0,9-3-0,9-5-0,16-7-0,0-0-0,2019,0.979,5.41,0.694,22.0,2.80,0.08,0.90,0.458,0.24,0.402,0.90,2.33,0.302,1.16,10.0,,,0
44,45.0,UNLV,MWC,36-14-0,10-5-0,6-3-0,20-6-0,0-0-0,2019,0.979,5.14,0.720,33.0,2.39,0.24,0.70,0.455,0.52,0.380,1.10,2.53,0.314,1.26,14.0,,,0
47,48.0,Iowa State,Big 12,37-25-0,8-8-0,17-9-0,12-8-0,0-0-0,2019,0.968,4.00,0.597,34.0,3.25,0.16,0.77,0.428,0.52,0.336,0.60,1.39,0.268,1.63,9.0,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2089,,UNCW,,,,,,,2013,0.940,3.62,0.302,,4.21,0.08,0.28,0.320,0.19,,0.85,,0.236,1.17,,,,0
2090,,UNCW,,,,,,,2014,0.955,3.37,0.190,,3.71,0.19,0.38,,0.19,,1.21,,0.240,0.98,,,,0
2091,,UT Arlington,,,,,,,2013,0.953,2.49,0.472,,2.00,0.11,0.23,0.294,0.17,,0.40,,0.226,0.89,,,,0
2092,,UT Arlington,,,,,,,2014,0.950,2.96,0.110,,3.89,0.11,0.56,,0.17,,0.57,,0.233,1.09,,,,0


In [48]:
College_names = set(df_ext.sort_values(by = ['College'], ascending = True).College)
College_names

{'A&M-Corpus Christi',
 'Abilene Christian',
 'Akron',
 'Alabama',
 'Alabama A&M',
 'Alabama State',
 'Albany (NY)',
 'Alcorn',
 'Appalachian State',
 'Arizona',
 'Arizona State',
 'Ark.-Pine Bluff',
 'Arkansas',
 'Army West Point',
 'Auburn',
 'Austin Peay',
 'BYU',
 'Ball State',
 'Baylor',
 'Belmont',
 'Bethune-Cookman',
 'Binghamton',
 'Boise State',
 'Boston College',
 'Boston U',
 'Bowling Green',
 'Bradley',
 'Brown',
 'Bryant',
 'Bucknell',
 'Buffalo',
 'Butler',
 'CSU Bakersfield',
 'CSU Fullerton',
 'CSUN',
 'Cal Poly',
 'California',
 'California Baptist',
 'Campbell',
 'Canisius',
 'Central Ark.',
 'Central Conn. State',
 'Central Mich.',
 'Charleston So.',
 'Charlotte',
 'Chattanooga',
 'Cleveland State',
 'Coastal Carolina',
 'Col. of Charleston',
 'Colgate',
 'Colorado State',
 'Columbia',
 'Coppin State',
 'Cornell',
 'Creighton',
 'Dartmouth',
 'Dayton',
 'DePaul',
 'Delaware',
 'Delaware State',
 'Detroit Mercy',
 'Drake',
 'Drexel',
 'Duke',
 'ETSU',
 'East Carolina'

In [54]:
Conference_names = set(df_ext.sort_values(by = ['Conference'], ascending = True).Conference)
Conference_names

{'AAC',
 'ACC',
 'ASUN',
 'America East',
 'Atlantic 10',
 'Atlantic Coast',
 'Atlantic Sun',
 'Big 12',
 'Big East',
 'Big Sky',
 'Big South',
 'Big Ten',
 'Big West',
 'C-USA',
 'CAA',
 'Colonial',
 'Conference USA',
 'Horizon',
 'Ivy',
 'Ivy League',
 'MAAC',
 'MAC',
 'MEAC',
 'MVC',
 'MWC',
 'Metro Atlantic',
 'Mid-American',
 'Mid-Eastern',
 'Missouri Valley',
 'Mountain West',
 'NEC',
 'Northeast',
 'OVC',
 'Pac-12',
 'Pacific Coast Softball',
 'Patriot',
 'SEC',
 'SWAC',
 'SoCon',
 'Southeastern',
 'Southern',
 'Southland',
 'Southwestern',
 'Summit',
 'Summit League',
 'Sun Belt',
 'WAC',
 'WCC',
 'West Coast',
 'Western Athletic',
 nan}

In [56]:
conference_names = pd.read_csv('Data/ConferenceNames.csv')
conference_names
for idx_row, conference in enumerate(df_ext.Conference):
    if len(conference_names[conference_names['Full'] == conference]) > 0:
        df_ext.loc[idx_row,'Conference'] = conference_names[conference_names['Full'] == conference]['Abbr'].iloc[0]

df_ext

KeyError: 'Team Name'

In [50]:
%store df_ext
df.to_csv('Data/Clean_final.csv',index=False)

Stored 'df_ext' (DataFrame)


### Test what are still missing

In [45]:
set(df[df.RPI_Ranking.isnull()].College)

{'Louisiana',
 'NC State',
 'Purdue Fort Wayne',
 'SFA',
 'Stanford',
 'ULL',
 'UNC Greensboro',
 'UNCW',
 'UT Arlington'}

In [30]:
college_nan = df_ext[df_ext.College.str.contains('WKU')]
print(len(college_nan))
college_nan

2


Unnamed: 0,RPI_Ranking,College,Conference,Record,Road,Neutral,Home,Non-Div-I,Year,Fielding_Percentage,Scoring,WL_Percentage,Hit_Batters,Earned_Run_Average,Triples_per_Game,Home_Runs_per_game,Slugging_Percentage,Double_Plays_per_Game,On_Base_Percentage,Stolen_Bases_per_Game,Strikeout-to-Walk_Ratio,Batting_Average,Doubles_per_Game,Shutouts,Team_Strikeouts_Per_Seven_Innings,WCWS_Rank,WCWS_in
2096,,WKU,,,,,,,2015,,,,,,,,,,,,,,,,,17.0,1
2098,,WKU,,,,,,,2013,,,,,,,,,,,,,,,,,17.0,1


In [51]:
df_ext[df_ext.Conference.isnull() & df_ext.WCWS_in == 1]

Unnamed: 0,RPI_Ranking,College,Conference,Record,Road,Neutral,Home,Non-Div-I,Year,Fielding_Percentage,Scoring,WL_Percentage,Hit_Batters,Earned_Run_Average,Triples_per_Game,Home_Runs_per_game,Slugging_Percentage,Double_Plays_per_Game,On_Base_Percentage,Stolen_Bases_per_Game,Strikeout-to-Walk_Ratio,Batting_Average,Doubles_per_Game,Shutouts,Team_Strikeouts_Per_Seven_Innings,WCWS_Rank,WCWS_in
2073,,NC State,,,,,,,2013,0.971,5.14,0.634,,2.35,0.04,1.57,0.492,0.3,,0.32,,0.266,1.25,,,17.0,1
2074,,NC State,,,,,,,2014,0.973,4.59,0.04,,3.03,0.04,1.24,0.468,0.35,,0.72,,0.275,1.06,,6.62,17.0,1
2075,,NC State,,,,,,,2015,0.963,5.33,0.633,,2.61,0.1,1.25,0.457,0.38,,0.62,,0.266,1.08,,,9.0,1
2088,,UNC Greensboro,,,,,,,2018,0.97,5.42,0.78,,2.07,0.07,0.98,0.447,0.36,,0.27,,0.273,1.42,,,49.0,1
2094,,Stanford,,,,,,,2019,,,,,,,,,,,,,,,,,33.0,1
2095,,Stanford,,,,,,,2013,,,,,,,,,,,,,,,,,17.0,1
2096,,ULL,,,,,,,2017,,,,,,,,,,,,,,,,,17.0,1


In [53]:
df_ext[df_ext.College == 'UNC Greensboro'] 

Unnamed: 0,RPI_Ranking,College,Conference,Record,Road,Neutral,Home,Non-Div-I,Year,Fielding_Percentage,Scoring,WL_Percentage,Hit_Batters,Earned_Run_Average,Triples_per_Game,Home_Runs_per_game,Slugging_Percentage,Double_Plays_per_Game,On_Base_Percentage,Stolen_Bases_per_Game,Strikeout-to-Walk_Ratio,Batting_Average,Doubles_per_Game,Shutouts,Team_Strikeouts_Per_Seven_Innings,WCWS_Rank,WCWS_in
72,73.0,UNC Greensboro,SoCon,33-23-0,12-6-0,8-6-0,13-11-0,0-0-0,2019,0.955,4.55,0.589,25.0,3.2,0.05,1.04,0.438,0.21,0.361,0.3,3.41,0.265,1.29,8.0,,,0
1611,131.0,UNC Greensboro,Southern,31-26-0,9-8-0,5-2-0,17-16-0,0-0-0,2014,0.955,5.81,0.12,,3.75,0.12,1.0,0.487,0.26,0.4,0.46,,0.312,1.51,,,,0
1880,104.0,UNC Greensboro,Southern,38-19,12-8,8- 1- 0,18-10,0-0,2013,0.958,5.09,0.667,,2.81,0.18,0.63,0.422,0.19,,1.05,,0.292,1.26,,,,0
2085,,UNC Greensboro,,,,,,,2015,0.956,5.66,0.532,,4.67,0.09,0.85,0.47,0.19,,0.7,,0.324,1.26,,,,0
2086,,UNC Greensboro,,,,,,,2016,0.96,5.25,0.559,,3.64,0.1,1.03,0.479,0.32,,0.37,,0.3,1.41,,,,0
2087,,UNC Greensboro,,,,,,,2017,0.958,4.67,0.564,,4.07,0.07,1.11,0.447,0.4,,0.51,,0.254,1.04,,,,0
2088,,UNC Greensboro,,,,,,,2018,0.97,5.42,0.78,,2.07,0.07,0.98,0.447,0.36,,0.27,,0.273,1.42,,,49.0,1


In [55]:
conference

nan