# 1. RUN THESE FIRST

In [1]:
from __future__ import print_function, division
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup

In [2]:
# Seasons to process, newest first: 2016 down to 1986, leaving out 1999.
year_list = list(range(2016, 1999, -1)) + list(range(1998, 1985, -1))
print(year_list)

[2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007, 2006, 2005, 2004, 2003, 2002, 2001, 2000, 1998, 1997, 1996, 1995, 1994, 1993, 1992, 1991, 1990, 1989, 1988, 1987, 1986]


## (SKIP) Vote List

In [3]:
# URL template for one season's All-Star voting page; the year is filled in per request.
VOTING_URL = 'https://www.basketball-reference.com/allstar/NBA_{}_voting.html'


def _parse_vote_text(raw_text):
    """Split the text of the voting-results tables into [player, votes] pairs.

    Parameters
    ----------
    raw_text : str
        Concatenated ``.text`` of the voting-results divs of one season.

    Returns
    -------
    list of [str, int]
        One ``[player_name, votes]`` entry per name/vote pair found.

    Raises
    ------
    ValueError
        If the cleaned text does not split into name/vote pairs, or a vote
        token is not an integer.
    """
    # Flatten the text and drop the section labels and indentation runs.
    cleaned = (raw_text.replace("\n", " ")
               .replace("Frontcourt ", "")
               .replace("Backcourt ", "")
               .replace("    ", "")
               .strip())
    # Remove digit runs followed by a period (presumably the rank prefixes).
    cleaned = re.sub(r'\d+\.', '', cleaned).strip()
    cleaned = re.sub(r'\s+', ' ', cleaned)
    # Drop existing commas so that commas can serve as the only delimiter below.
    cleaned = re.sub(r',', '', cleaned)
    # Put commas around vote counts that sit between spaces ...
    cleaned = re.sub(r'\s(\d*)\s', r',\1,', cleaned)
    # ... and before a count that directly follows a name.
    cleaned = re.sub(r'([a-z])\s(\d)', r'\1,\2', cleaned)
    # Position labels become delimiters (original note: needed for 2012 and before).
    cleaned = re.sub(r'Guard |Center |Forward ', ',', cleaned)
    # A label at the very start leaves a leading comma; drop it.
    if cleaned.startswith(","):
        cleaned = cleaned[1:]

    tokens = cleaned.split(",")
    if len(tokens) % 2:
        raise ValueError(
            "expected alternating name/vote tokens, got {} tokens".format(len(tokens)))
    return [[name, int(votes)] for name, votes in zip(tokens[0::2], tokens[1::2])]


# Build one DataFrame of (Year, Player, Votes) per season, then stack them.
concat_list = []
for year in year_list:
    response = requests.get(VOTING_URL.format(year))
    # Fail with the HTTP error itself rather than an IndexError on the empty div list below.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    # scrapes html from website with player name and votes
    # (named result_divs so the builtin `all` is not shadowed in the notebook namespace)
    result_divs = soup.find_all('div', id=re.compile('div_voting-results-'))

    # Only the first two matching divs are used; their text is parsed once, together.
    raw_text = result_divs[0].text + result_divs[1].text

    # makes a list of [year, name, votes] rows
    rows = [[year, player, votes] for player, votes in _parse_vote_text(raw_text)]

    concat_list.append(pd.DataFrame(rows, columns=['Year', 'Player', 'Votes']))

# Each per-season frame keeps its own 0..n-1 row labels (no ignore_index).
df_concat = pd.concat(concat_list, axis=0)

In [5]:
# Disabled on purpose. Uncommenting these lines writes df_concat (the scraped
# vote table) to the file '2016_votes_pickle', overwriting any existing copy.
# import pickle
# # SAVING AS A PICKLE
# filename = '2016_votes_pickle'
# outfile = open(filename,'wb')
# pickle.dump(df_concat,outfile)
# outfile.close()

# 2. PICKLING

In [6]:
# OPENING THE PICKLE FILE
# Loads the saved vote table from '2016_votes_pickle' in the working directory.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
import pickle
filename = '2016_votes_pickle'
# The with-block closes the file even if unpickling raises.
with open(filename, 'rb') as infile:
    df_votes = pickle.load(infile)

In [7]:
# Preview the first rows of the loaded vote table (Year, Player, Votes).
df_votes.head()

Unnamed: 0,Year,Player,Votes
0,2016,LeBron James,1089206
1,2016,Paul George,711595
2,2016,Carmelo Anthony,567348
3,2016,Pau Gasol,566988
4,2016,Andre Drummond,515296


## STATS TIME (WARNING: LONG RUNTIME)

In [8]:
##################################
###     DO NOT RUN THIS CODE   ###
### IT WILL TAKE A LOT OF TIME ###
##################################

# For every season in year_list, follow the first 25 player links in each of the
# first two voting-results tables and collect that player's per-game row for the
# season. Each collected row is:
#   [year, <per-game cell texts...>, <last per-game cell of the previous season>, <star flag>]
# The result (lists_of_stats) is a plain list of such rows, in page order.


def _is_stat_cell(tag):
    """Return a truthy value for <td> tags that carry a non-empty data-stat attribute."""
    # .get() instead of [] so a <td> without the attribute is skipped, not a KeyError.
    return tag.name == 'td' and tag.get('data-stat')


lists_of_stats = []
for year in year_list:
    url_player = 'https://www.basketball-reference.com/allstar/NBA_{}_voting.html'
    response_player = requests.get(url_player.format(year))
    # Fail with the HTTP error itself rather than an IndexError on the div list below.
    response_player.raise_for_status()
    soup_player = BeautifulSoup(response_player.text, "lxml")

    # Locate the voting tables once per season instead of once per link.
    voting_divs = soup_player.find_all('div', id=re.compile('div_voting-results-'))
    list_links = []
    for j in range(2):
        anchors = voting_divs[j].find_all('a')
        # Explicit indexing (not slicing): fewer than 25 links must raise rather than
        # silently shorten the season, because later cells assume 50 rows per season.
        list_links.extend(anchors[i]['href'] for i in range(25))

    for stats_link in list_links:
        url_stats_new = 'https://www.basketball-reference.com{}'
        response_stats_new = requests.get(url_stats_new.format(stats_link))
        soup_stats_new = BeautifulSoup(response_stats_new.text, "lxml")
        pergame = soup_stats_new.find('tr', id=('per_game.' + str(year)))
        # Previous season's row, used only for its last cell.
        pergame_diff = soup_stats_new.find('tr', id=('per_game.' + str(year - 1)))

        if pergame:
            stat_list = [cell.text for cell in pergame.find_all(_is_stat_cell)]

            if pergame_diff:
                stat_list.append(pergame_diff.find_all(_is_stat_cell)[-1].text)
            else:
                # Kept as the string "None" (not the None object), as in the original data.
                stat_list.append("None")

            # 1 when the season row contains an element with class "sr_star", else 0.
            stat_list.append(1 if pergame.find_all(class_="sr_star") else 0)
        else:
            # No per-game row for this season: fill every stat field with 0.
            # 31 matches the fields named after 'Year' when df_stats is built
            # (Age ... PTS, Prev_PTS, Star); the previous value of 30 left the
            # last field of these rows unfilled.
            stat_list = [0] * 31

        stat_list.insert(0, year)
        lists_of_stats.append(stat_list)

In [9]:
# Disabled on purpose. Uncommenting these lines writes lists_of_stats (the scraped
# per-player rows) to the file '2016_stats_pickle', overwriting any existing copy.
# import pickle
# filename2 = '2016_stats_pickle'
# outfile2 = open(filename2,'wb')
# pickle.dump(lists_of_stats,outfile2)
# outfile2.close()

# 3. IMPORT PICKLE FOR STATS TIME

In [10]:
# OPENING THE PICKLE FILE
# Loads the saved list of per-player stat rows from '2016_stats_pickle'.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
import pickle
filename2 = '2016_stats_pickle'
# The with-block closes the file even if unpickling raises.
with open(filename2, 'rb') as infile:
    stats_list = pickle.load(infile)

In [11]:
# Turn the loaded stat rows into a table: Year first, then the per-game fields,
# the previous season's points (Prev_PTS) and the star flag.
df_stats = pd.DataFrame(
    stats_list,
    columns=[
        'Year',
        'Age', 'Team', 'League', 'Pos', 'GP', 'GS', 'MP',
        'FG', 'FGA', 'FG%',
        '3P', '3PA', '3P%',
        '2P', '2PA', '2P%', 'eFG%',
        'FT', 'FTA', 'FT%',
        'ORB', 'DRB', 'TRB',
        'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
        'Prev_PTS', 'Star',
    ],
)

In [12]:
# Move the existing row labels of the vote table into an 'index' column,
# which the merge below uses as a join key together with 'Year'.
df_votes_r = df_votes.reset_index(drop=False)

In [13]:
# Labels 0..49 repeated 30 times (1500 values in total).
# Presumably 50 players for each of the 30 seasons - confirm against stats_list.
new_index = list(range(50)) * 30
# reset_index() adds an 'index' column holding the running row number; replace it
# with the repeating 0..49 labels so it can serve as a merge key next to 'Year'.
df_stats_r = df_stats.reset_index(drop=False)
df_stats_r['index'] = new_index

In [14]:
# Attach each player's stat row to the vote row with the same ('index', 'Year') key.
# Left join: every vote row is kept even when no stat row matches.
# The helper 'index' column is dropped once the join is done.
df_merge_2016 = (
    df_votes_r
    .merge(df_stats_r, how='left', on=['index', 'Year'])
    .drop(columns='index')
)
df_merge_2016.head()

Unnamed: 0,Year,Player,Votes,Age,Team,League,Pos,GP,GS,MP,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Prev_PTS,Star
0,2016,LeBron James,1089206,31,CLE,NBA,SF,76,76,35.6,...,6.0,7.4,6.8,1.4,0.6,3.3,1.9,25.3,25.3,1.0
1,2016,Paul George,711595,25,IND,NBA,SF,81,81,34.8,...,6.0,7.0,4.1,1.9,0.4,3.3,2.8,23.1,8.8,1.0
2,2016,Carmelo Anthony,567348,31,NYK,NBA,SF,72,72,35.1,...,6.4,7.7,4.2,0.9,0.5,2.4,2.5,21.8,24.2,1.0
3,2016,Pau Gasol,566988,35,CHI,NBA,C,72,72,31.8,...,8.9,11.0,4.1,0.6,2.0,2.3,2.1,16.5,18.5,1.0
4,2016,Andre Drummond,515296,22,DET,NBA,C,81,81,32.9,...,9.9,14.8,0.8,1.5,1.4,1.9,3.0,16.2,13.8,1.0


In [16]:
# Disabled on purpose. Uncommenting these lines writes df_merge_2016 (votes joined
# with stats) to the file 'df_2016', overwriting any existing copy.
# import pickle
# filename3 = 'df_2016'
# outfile3 = open(filename3,'wb')
# pickle.dump(df_merge_2016,outfile3)
# outfile3.close()