In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
def print_results_regression(y_true, predictions):
    mse = mean_squared_error(y_true, predictions)
    mae = mean_absolute_error(y_true, predictions)
    r2 = r2_score(y_true, predictions) * 100
    print("MSE: %.4f" % mse)
    print("MAE: %.4f" % mae)
    print("R2: %.2f %%" % r2)

    
def print_results_cv(model_obj):
    print("Best parameters set found on development set:")
    print('')
    print(model_obj.best_params_)
    print('')
    
    
    print("Grid scores on development set:")
    print('')
    means = model_obj.cv_results_['mean_test_score']
    stds = model_obj.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, model_obj.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
            % (mean, std * 2, params))
    print('')

    print("Detailed classification report:")
    print('')
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print('')
    
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report,log_loss
def print_results_classification(y_true, predictions, threshold=0.5):
    auc = roc_auc_score(y_true, predictions)
    acc = accuracy_score(y_true, predictions>threshold)
    log=log_loss(y_true,predictions)
    print("AUC: %.2f %%" % auc)
    print("Accuracy: %.2f %%" % acc)
    print("Log Loss: %.2f %%" % log)
    print(classification_report(y_true, predictions>threshold))

In [4]:
dfx=pd.read_csv('data.csv', sep=',')

In [5]:
dfx=dfx[['shot_id']]

In [6]:
df=pd.read_csv('data.csv', sep=',')

In [7]:
df

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,...,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id
0,Jump Shot,Jump Shot,10,20000012,33.9723,167,72,-118.1028,10,1,...,2PT Field Goal,Right Side(R),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,1
1,Jump Shot,Jump Shot,12,20000012,34.0443,-157,0,-118.4268,10,1,...,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,2
2,Jump Shot,Jump Shot,35,20000012,33.9093,-101,135,-118.3708,7,1,...,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,3
3,Jump Shot,Jump Shot,43,20000012,33.8693,138,175,-118.1318,6,1,...,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,4
4,Driving Dunk Shot,Dunk,155,20000012,34.0443,0,0,-118.2698,6,2,...,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,5
5,Jump Shot,Jump Shot,244,20000012,34.0553,-145,-11,-118.4148,9,3,...,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,6
6,Layup Shot,Layup,251,20000012,34.0443,0,0,-118.2698,8,3,...,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,7
7,Jump Shot,Jump Shot,254,20000012,34.0163,1,28,-118.2688,8,3,...,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,8
8,Jump Shot,Jump Shot,265,20000012,33.9363,-65,108,-118.3348,6,3,...,2PT Field Goal,Left Side(L),In The Paint (Non-RA),8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,9
9,Running Jump Shot,Jump Shot,294,20000012,33.9193,-33,125,-118.3028,3,3,...,2PT Field Goal,Center(C),In The Paint (Non-RA),8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,10


In [8]:
df.dtypes

action_type            object
combined_shot_type     object
game_event_id           int64
game_id                 int64
lat                   float64
loc_x                   int64
loc_y                   int64
lon                   float64
minutes_remaining       int64
period                  int64
playoffs                int64
season                 object
seconds_remaining       int64
shot_distance           int64
shot_made_flag        float64
shot_type              object
shot_zone_area         object
shot_zone_basic        object
shot_zone_range        object
team_id                 int64
team_name              object
game_date              object
matchup                object
opponent               object
shot_id                 int64
dtype: object

In [9]:
d=df.isnull().sum()
e=df.count()
f=pd.concat([d,e], axis=1)
f['%missing']=(100*(f[0]/f[1])).astype(int)
f.sort_values("%missing",ascending=False)

Unnamed: 0,0,1,%missing
shot_made_flag,5000,25697,19
action_type,0,30697,0
shot_distance,0,30697,0
opponent,0,30697,0
matchup,0,30697,0
game_date,0,30697,0
team_name,0,30697,0
team_id,0,30697,0
shot_zone_range,0,30697,0
shot_zone_basic,0,30697,0


In [10]:
df.drop(['shot_id','game_event_id','game_id','team_name','team_id','game_date','matchup'],inplace=True, axis=1)

In [11]:
d1 = pd.crosstab(index=df["playoffs"], 
                          columns=df["shot_made_flag"])

d1['%goal']=(100*d1[1]/(d1[1]+d1[0])).astype(int)
d1.sort_values('%goal',ascending=False)

shot_made_flag,0.0,1.0,%goal
playoffs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,12145,9794,44
1,2087,1671,44


In [12]:
df.dtypes

action_type            object
combined_shot_type     object
lat                   float64
loc_x                   int64
loc_y                   int64
lon                   float64
minutes_remaining       int64
period                  int64
playoffs                int64
season                 object
seconds_remaining       int64
shot_distance           int64
shot_made_flag        float64
shot_type              object
shot_zone_area         object
shot_zone_basic        object
shot_zone_range        object
opponent               object
dtype: object

In [13]:
df_num=df[['shot_made_flag','loc_x','loc_y','lat','lon','minutes_remaining','period','playoffs','seconds_remaining','shot_distance']]

In [14]:
df_cat=df[['action_type','combined_shot_type','season','shot_type','shot_zone_area', 'shot_zone_basic', 'shot_zone_range', 'opponent']]
df_cat_dum= pd.get_dummies(df_cat)

In [15]:
dfim=pd.concat([df_num,df_cat_dum],axis=1)

In [16]:
d=dfim[dfim.shot_made_flag.notnull()]

In [17]:
d.shape

(25697, 146)

In [18]:
#next step is to standardize our data - using MinMaxScaler
from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler()
scaler.fit(d)
d = pd.DataFrame(scaler.transform(d), index=d.index, columns=d.columns)
d.iloc[4:10]

Unnamed: 0,shot_made_flag,loc_x,loc_y,lat,lon,minutes_remaining,period,playoffs,seconds_remaining,shot_distance,...,opponent_PHI,opponent_PHX,opponent_POR,opponent_SAC,opponent_SAS,opponent_SEA,opponent_TOR,opponent_UTA,opponent_VAN,opponent_WAS
5,0.0,0.210843,0.039521,0.960479,0.210843,0.818182,0.333333,0.0,0.542373,0.177215,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.0,0.502008,0.052695,0.947305,0.502008,0.727273,0.333333,0.0,0.881356,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.0,0.371486,0.182036,0.817964,0.371486,0.545455,0.333333,0.0,0.20339,0.151899,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.435743,0.202395,0.797605,0.435743,0.272727,0.333333,0.0,0.610169,0.151899,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.313253,0.337725,0.662275,0.313253,0.090909,0.333333,0.0,0.949153,0.316456,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,1.0,0.74498,0.20479,0.79521,0.74498,1.0,0.0,0.0,0.0,0.21519,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### Random Forest

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score

In [20]:
X=d.drop('shot_made_flag', axis=1)
y=d.shot_made_flag
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [359]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

tuned_parameters = [{'n_estimators': [100,150],
                     'max_depth' : [20,30],
                     'min_samples_split': [10,20]}]

m3 = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=5, scoring='neg_log_loss')
m3.fit(X_train, y_train)
pred_train = m3.predict_proba(X_train)
pred_test = m3.predict_proba(X_test)

print_results_cv(m3)

score_train = log_loss(y_train, pred_train)
score_test = log_loss(y_test, pred_test)

print(score_train)
print(score_test)

#print_results_classification(y_train, pred_train)
#print_results_classification(y_test, pred_test)

Best parameters set found on development set:

{'max_depth': 20, 'min_samples_split': 20, 'n_estimators': 150}

Grid scores on development set:

-0.608 (+/-0.008) for {'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 100}
-0.608 (+/-0.009) for {'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 150}
-0.608 (+/-0.008) for {'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 200}
-0.608 (+/-0.007) for {'max_depth': 20, 'min_samples_split': 20, 'n_estimators': 100}
-0.607 (+/-0.006) for {'max_depth': 20, 'min_samples_split': 20, 'n_estimators': 150}
-0.607 (+/-0.006) for {'max_depth': 20, 'min_samples_split': 20, 'n_estimators': 200}
-0.611 (+/-0.008) for {'max_depth': 30, 'min_samples_split': 10, 'n_estimators': 100}
-0.611 (+/-0.007) for {'max_depth': 30, 'min_samples_split': 10, 'n_estimators': 150}
-0.611 (+/-0.008) for {'max_depth': 30, 'min_samples_split': 10, 'n_estimators': 200}
-0.610 (+/-0.008) for {'max_depth': 30, 'min_samples_split': 20, 'n_estimators': 

In [21]:
from sklearn.ensemble import RandomForestClassifier

m4 = RandomForestClassifier(max_depth=20,min_samples_split=20,n_estimators=150)
m4.fit(X_train, y_train)

pred_train = m4.predict_proba(X_train)
pred_test = m4.predict_proba(X_test)

score_train = log_loss(y_train, pred_train)
score_test = log_loss(y_test, pred_test)

print(score_train)
print(score_test)
#print_results_classification(y_train, pred_train)
#print_results_classification(y_test, pred_test)

0.5188189388378135
0.609815111750984


In [22]:
estimator = m4.estimators_[4]
estimator

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=20,
            max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=20,
            min_weight_fraction_leaf=0.0, presort=False,
            random_state=686327219, splitter='best')

In [25]:
from sklearn import tree
import collections
import pydotplus

In [26]:
# Visualize data
dot_data = tree.export_graphviz(estimator,
                                feature_names=X.columns,
                                out_file=None,
                                filled=True,
                                rounded=True)
graph = pydotplus.graph_from_dot_data(dot_data)

colors = ('turquoise', 'orange')
edges = collections.defaultdict(list)

for edge in graph.get_edge_list():
    edges[edge.get_source()].append(int(edge.get_destination()))

for edge in edges:
    edges[edge].sort()    
    for i in range(2):
        dest = graph.get_node(str(edges[edge][i]))[0]
        dest.set_fillcolor(colors[i])



In [361]:
feat_labels =X.columns

In [362]:
t=[]
for feature in zip(feat_labels, 100*m4.feature_importances_):
    t.append(feature)

In [387]:
ft=(pd.DataFrame(t, columns=['feature','importance'])).sort_values("importance",ascending=False)
ft.head(18)

Unnamed: 0,feature,importance
35,action_type_Jump Shot,10.332347
7,seconds_remaining,5.940133
1,loc_y,5.583861
2,lat,5.337471
3,lon,5.212772
0,loc_x,4.942025
8,shot_distance,4.37106
4,minutes_remaining,3.80858
36,action_type_Layup Shot,3.356162
67,combined_shot_type_Dunk,2.504722


In [395]:
ft1=(ft.head(18))
var=ft1.feature.unique()
var

array(['action_type_Jump Shot', 'seconds_remaining', 'loc_y', 'lat',
       'lon', 'loc_x', 'shot_distance', 'minutes_remaining',
       'action_type_Layup Shot', 'combined_shot_type_Dunk', 'period',
       'action_type_Running Jump Shot', 'action_type_Driving Layup Shot',
       'combined_shot_type_Jump Shot', 'shot_zone_basic_Restricted Area',
       'action_type_Pullup Jump shot', 'opponent_HOU', 'playoffs'],
      dtype=object)

In [396]:
X=d[var]
y=d.shot_made_flag
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [404]:
for i in var:
    print(i)
    print(X_train.groupby(y_train)[i].describe())
    print("")
    print("")

action_type_Jump Shot
                 count      mean       std  min  25%  50%  75%  max
shot_made_flag                                                     
0.0             9928.0  0.751914  0.431924  0.0  1.0  1.0  1.0  1.0
1.0             8059.0  0.448939  0.497417  0.0  0.0  0.0  1.0  1.0


seconds_remaining
                 count      mean       std  min       25%       50%       75%  \
shot_made_flag                                                                  
0.0             9928.0  0.471092  0.298308  0.0  0.203390  0.474576  0.728814   
1.0             8059.0  0.490386  0.294178  0.0  0.237288  0.491525  0.745763   

                max  
shot_made_flag       
0.0             1.0  
1.0             1.0  


loc_y
                 count      mean       std       min       25%       50%  \
shot_made_flag                                                             
0.0             9928.0  0.176140  0.110865  0.000000  0.071557  0.164072   
1.0             8059.0  0.143893  0.0

### Boosting

In [398]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [399]:
tuned_parameters = [{'n_estimators': [200],
                     'max_depth' : [3, 5, 10],
                     'min_samples_split': [10, 20, 50],
                     'learning_rate': [0.1], 
                     'subsample': [0.5]}]

m5 = GridSearchCV(GradientBoostingClassifier(), tuned_parameters, cv=5, scoring='neg_log_loss')
m5.fit(X_train, y_train)
pred_train = m5.predict_proba(X_train)
pred_test = m5.predict_proba(X_test)

print_results_cv(m5)

score_train = log_loss(y_train, pred_train)
score_test = log_loss(y_test, pred_test)

print(score_train)
print(score_test)

#print_results_classification(y_train, pred_train)
#print_results_classification(y_test, pred_test)

Best parameters set found on development set:

{'learning_rate': 0.1, 'max_depth': 3, 'min_samples_split': 50, 'n_estimators': 200, 'subsample': 0.5}

Grid scores on development set:

-0.615 (+/-0.008) for {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_split': 10, 'n_estimators': 200, 'subsample': 0.5}
-0.615 (+/-0.010) for {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_split': 20, 'n_estimators': 200, 'subsample': 0.5}
-0.615 (+/-0.009) for {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_split': 50, 'n_estimators': 200, 'subsample': 0.5}
-0.631 (+/-0.011) for {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_split': 10, 'n_estimators': 200, 'subsample': 0.5}
-0.630 (+/-0.008) for {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_split': 20, 'n_estimators': 200, 'subsample': 0.5}
-0.627 (+/-0.007) for {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_split': 50, 'n_estimators': 200, 'subsample': 0.5}
-0.739 (+/-0.017) for {'learning_rate': 0.1, 'max_depth': 10, 'min

In [367]:
dfim.shape

(30697, 146)

In [368]:
dff=pd.merge(left=dfx,right=dfim,how='left', left_index=True, right_index=True)

In [369]:
dff

Unnamed: 0,shot_id,shot_made_flag,loc_x,loc_y,lat,lon,minutes_remaining,period,playoffs,seconds_remaining,...,opponent_PHI,opponent_PHX,opponent_POR,opponent_SAC,opponent_SAS,opponent_SEA,opponent_TOR,opponent_UTA,opponent_VAN,opponent_WAS
0,1,,167,72,33.9723,-118.1028,10,1,0,27,...,0,0,1,0,0,0,0,0,0,0
1,2,0.0,-157,0,34.0443,-118.4268,10,1,0,22,...,0,0,1,0,0,0,0,0,0,0
2,3,1.0,-101,135,33.9093,-118.3708,7,1,0,45,...,0,0,1,0,0,0,0,0,0,0
3,4,0.0,138,175,33.8693,-118.1318,6,1,0,52,...,0,0,1,0,0,0,0,0,0,0
4,5,1.0,0,0,34.0443,-118.2698,6,2,0,19,...,0,0,1,0,0,0,0,0,0,0
5,6,0.0,-145,-11,34.0553,-118.4148,9,3,0,32,...,0,0,1,0,0,0,0,0,0,0
6,7,1.0,0,0,34.0443,-118.2698,8,3,0,52,...,0,0,1,0,0,0,0,0,0,0
7,8,,1,28,34.0163,-118.2688,8,3,0,5,...,0,0,1,0,0,0,0,0,0,0
8,9,1.0,-65,108,33.9363,-118.3348,6,3,0,12,...,0,0,1,0,0,0,0,0,0,0
9,10,0.0,-33,125,33.9193,-118.3028,3,3,0,36,...,0,0,1,0,0,0,0,0,0,0


In [370]:
k=dff[dff.shot_made_flag.isnull()]

In [371]:
ktest=k.iloc[:,2:]

### predict final

In [376]:
pred_k = m5.predict_proba(ktest)

In [377]:
k['shot_made_flag']=pred_k

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [378]:
k_final=k[['shot_id','shot_made_flag']]

In [379]:
k_final.to_csv("sample_submission.csv",index=False)