## Begin with preparing data

In [1]:
#Importing packages
import datetime
import pandas as pd
import MySQLdb as mdb
import numpy as np

In [2]:
#Lets fetch the data from MySQL database
con = mdb.connect(host = 'localhost', 
                  user = 'root',
                  database = 'Project',
                  passwd = '<password>', 
                  charset='utf8', use_unicode=True)
    
query_template = '''select *
                    from data_mining.nets_tickets
                    '''
cur = con.cursor(mdb.cursors.DictCursor)
cur.execute(query_template, )
data = cur.fetchall()
cur.close()
con.close()
nets_data = pd.DataFrame(list(data))

In [3]:
nets_data['invoice_date'] = [d.date() for d in nets_data['invoice_date']]

In [4]:
nets_data['event_date_calc'] = [d.date() for d in nets_data['event_date']]

In [5]:
col_of_interest = ['begseat', 'channel','endseat', 'event_date', 'instance', 'invoice_date', 'opponent',
                  'pc', 'quantity', 'revenue', 'row', 'section', 'sold', 'ticket_type', 'event_date_calc']

In [6]:
#Hot fix the LA Clipper name situation 
name_change = {'LA Clippers': 'Los Angeles Clippers'}
nets_data = nets_data.replace({'opponent': name_change})

In [7]:
nets_data = nets_data[col_of_interest]
nets_data['opponent'] = nets_data['opponent'].str.lower()

#Date related changes
nets_data['end_year'] = nets_data['event_date'] + datetime.timedelta(days=183) #Add to the nearest year
nets_data['end_year'] = nets_data['end_year'].dt.year #this is purely for merging, will get rid of later
nets_data['year'] = nets_data['event_date'].dt.year
nets_data['month'] = nets_data['event_date'].dt.month
nets_data['dayofweek'] = nets_data['event_date'].dt.dayofweek
nets_data['time_left'] = nets_data['event_date_calc'] - nets_data['invoice_date']
nets_data['time_left'] = [a.days for a in nets_data['time_left']]

#Name changes
#nets_data['opponent'] = nets_data['opponent'].str.replace('la clippers','los angeles clippers.')

#Basic data engineering
nets_data['price'] = nets_data['revenue']/nets_data['quantity']
nets_data = nets_data.drop(columns = ['revenue', 'invoice_date']) #Lets keep quantity

nets_data.head(3)

Unnamed: 0,begseat,channel,endseat,event_date,instance,opponent,pc,quantity,row,section,sold,ticket_type,event_date_calc,end_year,year,month,dayofweek,time_left,price
0,1,StubHub,2,2017-02-01,01/02/2017_1/15/1.0/2.0,new york knicks,R,2,15,1,0,Seasons,2017-02-01,2017,2017,2,2,1,83.64
1,12,StubHub,13,2017-02-01,01/02/2017_1/15/12.0/13.0,new york knicks,R,2,15,1,1,Seasons,2017-02-01,2017,2017,2,2,0,75.48
2,14,Ticketmaster,15,2017-02-01,01/02/2017_1/15/14.0/15.0,new york knicks,R,2,15,1,0,Seasons,2017-02-01,2017,2017,2,2,0,77.22


In [8]:
nets_data.shape

(62990, 19)

## Lets start by working on each game data

We best way to approach this is to combine the starting five columns into 1 column with a list. Then we use this list to check off our conditional data like `"How many allstars are in this game?"` or `"How far did this team go in the playoffs last year?"`. Then we can one-hot encode this list into dummy variables. 

In [9]:
#Lets fetch the data from MySQL database
con = mdb.connect(host = 'localhost', 
                  user = 'root',
                  database = 'Project',
                  passwd = '<password>', 
                  charset='utf8', use_unicode=True)
    
query_template = '''select *
                    from data_mining.each_game_data
                    '''
cur = con.cursor(mdb.cursors.DictCursor)
cur.execute(query_template, )
data = cur.fetchall()
cur.close()
con.close()
each_game = pd.DataFrame(list(data))

In [10]:
each_game.head(3)

Unnamed: 0,event_date,nets_1,nets_2,nets_3,nets_4,nets_5,nets_losses,nets_wins,opp_1,opp_2,opp_3,opp_4,opp_5,opp_losses,opp_wins,opponent
0,2016-10-28,Jeremy Lin,Bojan Bogdanovic,Trevor Booker,Brook Lopez,Rondae Hollis-Jefferson,1,1,Paul George,Thaddeus Young,Monta Ellis,Myles Turner,Jeff Teague,1,1,Indiana Pacers
1,2016-10-31,Rondae Hollis-Jefferson,Jeremy Lin,Brook Lopez,Bojan Bogdanovic,Trevor Booker,3,1,Dwyane Wade,Jimmy Butler,Rajon Rondo,Taj Gibson,Robin Lopez,0,3,Chicago Bulls
2,2016-11-02,Bojan Bogdanovic,Trevor Booker,Brook Lopez,Rondae Hollis-Jefferson,Jeremy Lin,3,2,Kentavious Caldwell-Pope,Tobias Harris,Marcus Morris,Ish Smith,Andre Drummond,2,3,Detroit Pistons


In [11]:
each_game['starting_five'] = each_game[['nets_1', 'nets_2', 'nets_3', 'nets_4', 'nets_5', 'opp_1', 'opp_2',
                           'opp_3', 'opp_4', 'opp_5']].values.tolist()

In [12]:
each_game = each_game.drop(each_game.columns[[1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 15]], axis=1) 
#lets drop opponents as well

In [13]:
each_game.head(3)

Unnamed: 0,event_date,nets_losses,nets_wins,opp_losses,opp_wins,starting_five
0,2016-10-28,1,1,1,1,"[Jeremy Lin, Bojan Bogdanovic, Trevor Booker, ..."
1,2016-10-31,3,1,0,3,"[Rondae Hollis-Jefferson, Jeremy Lin, Brook Lo..."
2,2016-11-02,3,2,2,3,"[Bojan Bogdanovic, Trevor Booker, Brook Lopez,..."


## Let's merge this with the core nets_data dataframe

In [14]:
nets_data = pd.merge(nets_data, each_game, on='event_date')

In [15]:
nets_data.head(1)

Unnamed: 0,begseat,channel,endseat,event_date,instance,opponent,pc,quantity,row,section,...,year,month,dayofweek,time_left,price,nets_losses,nets_wins,opp_losses,opp_wins,starting_five
0,1,StubHub,2,2017-02-01,01/02/2017_1/15/1.0/2.0,new york knicks,R,2,15,1,...,2017,2,2,1,83.64,40,9,29,22,"[Brook Lopez, Rondae Hollis-Jefferson, Bojan B..."


In [16]:
nets_data.shape #leave it, we are ok with losing some tickets if it has no matches to game history

(58418, 24)

## Let's do allstars now

In [17]:
#Lets fetch the data from MySQL database
con = mdb.connect(host = 'localhost', 
                  user = 'root',
                  database = 'Project',
                  passwd = '<password>', 
                  charset='utf8', use_unicode=True)
    
query_template = '''select *
                    from data_mining.allstars_data
                    '''
cur = con.cursor(mdb.cursors.DictCursor)
cur.execute(query_template, )
data = cur.fetchall()
cur.close()
con.close()
allstars = pd.DataFrame(list(data))

In [18]:
#Not ideal solution, but for the interest of time, lets create a dataframe for merging.
dict_allstars = [{'Players': list(set(allstars['player'].loc[allstars['year'] < 2017].tolist())), 'year':2017},
                {'Players': list(set(allstars['player'].loc[allstars['year'] < 2016].tolist())), 'year':2016},
                {'Players': list(set(allstars['player'].loc[allstars['year'] < 2015].tolist())), 'year':2015}]

In [19]:
new_allstars = pd.DataFrame(dict_allstars)
new_allstars = new_allstars.rename(columns={'year': 'end_year'})
new_allstars.head(3)

Unnamed: 0,Players,end_year
0,"[LeBron James, Roy Hibbert, Kyrie Irving, Joak...",2017
1,"[LeBron James, Roy Hibbert, Kyrie Irving, Joak...",2016
2,"[LeBron James, Roy Hibbert, Kyrie Irving, Joak...",2015


In [20]:
nets_data = pd.merge(nets_data, new_allstars, on='end_year')

In [21]:
nets_data.tail(3)

Unnamed: 0,begseat,channel,endseat,event_date,instance,opponent,pc,quantity,row,section,...,month,dayofweek,time_left,price,nets_losses,nets_wins,opp_losses,opp_wins,starting_five,Players
58415,18,Ticketmaster,18,2016-10-31,31/10/2016_9/9/18.0/18.0,chicago bulls,F,1,9,9,...,10,0,35,257.42,3,1,0,3,"[Rondae Hollis-Jefferson, Jeremy Lin, Brook Lo...","[LeBron James, Roy Hibbert, Kyrie Irving, Joak..."
58416,5,StubHub,6,2016-10-31,31/10/2016_9/9/5.0/6.0,chicago bulls,F,2,9,9,...,10,0,0,63.7,3,1,0,3,"[Rondae Hollis-Jefferson, Jeremy Lin, Brook Lo...","[LeBron James, Roy Hibbert, Kyrie Irving, Joak..."
58417,9,StubHub,11,2016-10-31,31/10/2016_9/9/9.0/11.0,chicago bulls,F,3,9,9,...,10,0,0,154.84,3,1,0,3,"[Rondae Hollis-Jefferson, Jeremy Lin, Brook Lo...","[LeBron James, Roy Hibbert, Kyrie Irving, Joak..."


In [22]:
nets_data.shape

(58418, 25)

In [23]:
#Very ugly... but do not want to dwell on this due to deadline. 
allstar_list = []
for i in range(len(nets_data)):
    count = len(set(set(nets_data['Players'][i]))
        -(set(nets_data['Players'][i]) 
          - set(nets_data['starting_five'][i])))
    allstar_list.append(count)

In [24]:
allstar_series = pd.Series(allstar_list)

In [25]:
nets_data['allstars_count'] = allstar_series.values

In [26]:
#Now we can drop the all stars column, we don't actually need the player names
nets_data = nets_data.drop(columns = ['Players'])

In [27]:
nets_data.shape

(58418, 25)

## Add playoff history for the past few years

In [28]:
#Lets fetch the data from MySQL database
con = mdb.connect(host = 'localhost', 
                  user = 'root',
                  database = 'Project',
                  passwd = '<password>', 
                  charset='utf8', use_unicode=True)
    
query_template = '''select *
                    from data_mining.playoff_data
                    '''
cur = con.cursor(mdb.cursors.DictCursor)
cur.execute(query_template, )
data = cur.fetchall()
cur.close()
con.close()
playoff_data = pd.DataFrame(list(data))

In [29]:
playoff_data.head(10)

Unnamed: 0,rounds,team,year
0,Eastern Conference First Round,Atlanta Hawks,2014
1,Eastern Conference First Round,Brooklyn Nets,2014
2,Eastern Conference Semifinals,Brooklyn Nets,2014
3,Eastern Conference First Round,Charlotte Bobcats,2014
4,Eastern Conference First Round,Chicago Bulls,2014
5,Western Conference First Round,Dallas Mavericks,2014
6,Western Conference First Round,Golden State Warriors,2014
7,Western Conference First Round,Houston Rockets,2014
8,Eastern Conference Finals,Indiana Pacers,2014
9,Eastern Conference First Round,Indiana Pacers,2014


In [30]:
mapping = {'Eastern Conference First Round': 1, 'Western Conference First Round': 1, 
           'Eastern Conference Semifinals': 2,'Western Conference Semifinals': 2,
          'Eastern Conference Finals': 3, 'Western Conference Finals':3,
          'Finals': 4, 'Champion':5 }

playoff_data = playoff_data.replace({'rounds': mapping})

In [31]:
playoff_data['year'] = playoff_data['year'] + 1 #Cutting corners here, matching previous year with current year later
playoff_data.rename(columns={'team': 'opponent', 'year': 'end_year'}, inplace=True)
playoff_data['opponent'] = playoff_data['opponent'].str.lower()

In [32]:
playoff_data = playoff_data.groupby(['opponent', 'end_year'], sort=False)['rounds'].max()
playoff_data = playoff_data.reset_index()

In [33]:
playoff_data.shape

(64, 3)

In [34]:
nets_data = pd.merge(nets_data, playoff_data, on=['opponent', 'end_year'], how = 'left')

In [35]:
nets_data['rounds'].fillna(0, inplace=True)

In [36]:
nets_data.shape

(58418, 26)

## Last, lets get the conference data. 

In [37]:
#Lets fetch the data from MySQL database
con = mdb.connect(host = 'localhost', 
                  user = 'root',
                  database = 'Project',
                  passwd = '<password>', 
                  charset='utf8', use_unicode=True)
    
query_template = '''select *
                    from data_mining.conference_data
                    '''
cur = con.cursor(mdb.cursors.DictCursor)
cur.execute(query_template, )
data = cur.fetchall()
cur.close()
con.close()
conference_data = pd.DataFrame(list(data))

In [38]:
conference_data.head(3)

Unnamed: 0,conference,losses,oppg,ppg,seed,team,winloss,wins,year
0,East,44,101.5,101.0,8,Atlanta Hawks,0.463,38,2014
1,East,57,100.7,96.2,12,Boston Celtics,0.305,25,2014
2,East,38,99.5,98.5,5,Brooklyn Nets,0.537,44,2014


In [39]:
conference_data.rename(columns={'team':'opponent','year': 'end_year'}, inplace=True)
conference_data['opponent'] = conference_data['opponent'].str.lower()

In [40]:
conference_data['end_year'] = conference_data['end_year'] + 1 #same trick, end year to make it easier to match

In [41]:
conference_data = conference_data[conference_data['opponent']!= 'brooklyn nets']

In [42]:
conference_data.head()

Unnamed: 0,conference,losses,oppg,ppg,seed,opponent,winloss,wins,end_year
0,East,44,101.5,101.0,8,atlanta hawks,0.463,38,2015
1,East,57,100.7,96.2,12,boston celtics,0.305,25,2015
3,East,39,97.1,96.9,7,charlotte bobcats,0.524,43,2015
4,East,34,91.8,93.7,3,chicago bulls,0.585,48,2015
5,East,49,101.5,98.2,10,cleveland cavaliers,0.402,33,2015


In [43]:
nets_data_2 = pd.merge(nets_data, conference_data, on=['opponent', 'end_year'], how = 'left')

In [44]:
nets_data_2.shape

(58418, 33)

## Let's convert the starting fives into dummy variables

In [45]:
#https://stackoverflow.com/questions/45312377/how-to-one-hot-encode-from-a-pandas-column-containing-a-list/45312840
def pir_fast(df):
    v = df.starting_five.values
    l = [len(x) for x in v.tolist()]
    f, u = pd.factorize(np.concatenate(v))
    n, m = len(v), u.size
    i = np.arange(n).repeat(l)

    dummies = pd.DataFrame(
        np.bincount(i * m + f, minlength=n * m).reshape(n, m),
        df.index, u
    )

    return df.drop('starting_five', 1).join(dummies)

In [46]:
nets_data_test = pir_fast(nets_data_2)

In [47]:
nets_data_test.shape

(58418, 207)

In [50]:
nets_data_test.head()

Unnamed: 0,begseat,channel,endseat,event_date,instance,opponent,pc,quantity,row,section,...,Dario Saric,Timothe Luwawu-Cabarrot,Richaun Holmes,Monta Ellis,DeAndre Jordan,Chris Paul,J.J. Redick,Paul Pierce,Luc Mbah a Moute,Rajon Rondo
0,1,StubHub,2,2017-02-01,01/02/2017_1/15/1.0/2.0,new york knicks,R,2,15,1,...,0,0,0,0,0,0,0,0,0,0
1,12,StubHub,13,2017-02-01,01/02/2017_1/15/12.0/13.0,new york knicks,R,2,15,1,...,0,0,0,0,0,0,0,0,0,0
2,14,Ticketmaster,15,2017-02-01,01/02/2017_1/15/14.0/15.0,new york knicks,R,2,15,1,...,0,0,0,0,0,0,0,0,0,0
3,3,Ticket Network,6,2017-02-01,01/02/2017_1/15/3.0/6.0,new york knicks,R,4,15,1,...,0,0,0,0,0,0,0,0,0,0
4,1,StubHub,2,2017-02-01,01/02/2017_1/16/1.0/2.0,new york knicks,S,2,16,1,...,0,0,0,0,0,0,0,0,0,0


In [51]:
opponent = nets_data_test

In [52]:
opponent.to_csv('opponent.csv', sep=',', encoding='utf-8', index=False)

## Last touches
We should get rid of end_year and event_date (the information is captured through months, days of the week and year which also have a more ordinal relationship). We can also get rid of opponent, since their information is captured throught the other variables (the team name itself has no information). Lastly, also get rid of channel and ticket_type, our business goal is only on regular season games. 

In [52]:
nets_data_test = nets_data_test.drop(columns = ['event_date','end_year', 'event_date_calc', 
                                                'opponent', 'channel', 'ticket_type'])

In [53]:
nets_data_test = nets_data_test.rename(columns={'pc': 'location'})

We need to convert Conference into binary (East/West), location into dummies, and leave year as integer (the magnitude can be taken care of in regularization).

In [54]:
mapping2 = {'West': 1, 'East': 2}

In [55]:
nets_data_test = nets_data_test.replace({'conference': mapping2})

In [56]:
# Create a set of dummy variables from the location variable
df_location = pd.get_dummies(nets_data_test['location'])
# Join the dummy variables to the main dataframe
nets_data_test = pd.concat([nets_data_test, df_location], axis=1)
nets_data_test.head()

Unnamed: 0,begseat,endseat,instance,location,quantity,row,section,sold,year,month,...,Q,R,S,T,U,V,W,X,Y,Z
0,1,2,01/02/2017_1/15/1.0/2.0,R,2,15,1,0,2017,2,...,0,1,0,0,0,0,0,0,0,0
1,12,13,01/02/2017_1/15/12.0/13.0,R,2,15,1,1,2017,2,...,0,1,0,0,0,0,0,0,0,0
2,14,15,01/02/2017_1/15/14.0/15.0,R,2,15,1,0,2017,2,...,0,1,0,0,0,0,0,0,0,0
3,3,6,01/02/2017_1/15/3.0/6.0,R,4,15,1,1,2017,2,...,0,1,0,0,0,0,0,0,0,0
4,1,2,01/02/2017_1/16/1.0/2.0,S,2,16,1,0,2017,2,...,0,0,1,0,0,0,0,0,0,0


In [57]:
# Create a set of dummy variables from the row variable
df_row = pd.get_dummies(nets_data_test['row'])
# Join the dummy variables to the main dataframe
nets_data_test = pd.concat([nets_data_test, df_row], axis=1)
nets_data_test.head()

Unnamed: 0,begseat,endseat,instance,location,quantity,row,section,sold,year,month,...,5,6,7,7W,8,9,B,C,D,E
0,1,2,01/02/2017_1/15/1.0/2.0,R,2,15,1,0,2017,2,...,0,0,0,0,0,0,0,0,0,0
1,12,13,01/02/2017_1/15/12.0/13.0,R,2,15,1,1,2017,2,...,0,0,0,0,0,0,0,0,0,0
2,14,15,01/02/2017_1/15/14.0/15.0,R,2,15,1,0,2017,2,...,0,0,0,0,0,0,0,0,0,0
3,3,6,01/02/2017_1/15/3.0/6.0,R,4,15,1,1,2017,2,...,0,0,0,0,0,0,0,0,0,0
4,1,2,01/02/2017_1/16/1.0/2.0,S,2,16,1,0,2017,2,...,0,0,0,0,0,0,0,0,0,0


In [58]:
# Create a set of dummy variables from the section variable
df_section = pd.get_dummies(nets_data_test['section'])
# Join the dummy variables to the main dataframe
nets_data_test = pd.concat([nets_data_test, df_section], axis=1)
nets_data_test.head()

Unnamed: 0,begseat,endseat,instance,location,quantity,row,section,sold,year,month,...,26,28,29,3,31,4,6,7,8,9
0,1,2,01/02/2017_1/15/1.0/2.0,R,2,15,1,0,2017,2,...,0,0,0,0,0,0,0,0,0,0
1,12,13,01/02/2017_1/15/12.0/13.0,R,2,15,1,1,2017,2,...,0,0,0,0,0,0,0,0,0,0
2,14,15,01/02/2017_1/15/14.0/15.0,R,2,15,1,0,2017,2,...,0,0,0,0,0,0,0,0,0,0
3,3,6,01/02/2017_1/15/3.0/6.0,R,4,15,1,1,2017,2,...,0,0,0,0,0,0,0,0,0,0
4,1,2,01/02/2017_1/16/1.0/2.0,S,2,16,1,0,2017,2,...,0,0,0,0,0,0,0,0,0,0


In [59]:
nets_data_test = nets_data_test.drop(columns = ['location', 'row', 'section'])

In [61]:
nets_data_test.to_csv('nets_cleaned_data.csv', sep=',', encoding='utf-8', index=False)