# Meta-Kaggle ML example

Goal: predict user ranking from
 - first and last submission date

Features to add:
 - number of submissions made to the contest
 - number of team members
 - number of contests participated in (by leader or whole team?)
 - length of team name

Other things to add:
 - improve prediction: instead of raw ranking normalize by total number after dropping inactives (single submissions?), try predicting medaled or not (too imbalanced?)
 - drop dublicates
 
Currently it always predicts the same thing... That is not great.

In [2]:
import pandas as pd
import glob
import sklearn.ensemble as ske
import meta_kaggle_utils as utils
import numpy as np

In [3]:
data_location = 'meta-kaggle/'
submission_file_name = 'Submissions.csv'
team_file_name = 'Teams.csv'

## load submissions and teams files

In [4]:
# load the submission file
submissions = utils.load_kaggle_csv(data_location + submission_file_name)
# convert scores to numeric values
submissions['PublicScoreFullPrecision'] = pd.to_numeric(submissions['PublicScoreFullPrecision'], errors='coerce')
submissions['PrivateScoreFullPrecision'] = pd.to_numeric(submissions['PrivateScoreFullPrecision'], errors='coerce')

  if (yield from self.run_code(code, result)):
  mask |= (ar1 == a)


The file contains 4837705 rows.
The table contains the following columns: 
['SubmittedUserId' 'TeamId' 'SourceKernelVersionId' 'SubmissionDate'
 'ScoreDate' 'IsAfterDeadline' 'PublicScoreLeaderboardDisplay'
 'PublicScoreFullPrecision' 'PrivateScoreLeaderboardDisplay'
 'PrivateScoreFullPrecision']


In [5]:
# load the teams file
teams = utils.load_kaggle_csv(data_location + team_file_name)
# drop teams that never submitted anything
teams = teams.dropna(axis=0, how='any')

  if (yield from self.run_code(code, result)):


The file contains 1656073 rows.
The table contains the following columns: 
['CompetitionId' 'TeamLeaderId' 'TeamName' 'ScoreFirstSubmittedDate'
 'LastSubmissionDate' 'PublicLeaderboardSubmissionId'
 'PrivateLeaderboardSubmissionId' 'IsBenchmark' 'Medal' 'MedalAwardDate'
 'PublicLeaderboardRank' 'PrivateLeaderboardRank']


In [6]:
# convert dates to datetime objects
print('type of dates before and after conversion to timestamps: ')
print(type(teams.ScoreFirstSubmittedDate[497]))
teams.ScoreFirstSubmittedDate = pd.to_datetime(teams.ScoreFirstSubmittedDate)
print(type(teams.ScoreFirstSubmittedDate[497]))

# repeate with last date
teams.LastSubmissionDate = pd.to_datetime(teams.LastSubmissionDate)


type of dates before and after conversion to timestamps: 
<class 'str'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [7]:
def date_to_day(dates):
    """ calculate a numeric value for a pandas series of dates"""
    dayofyear = dates.dt.dayofyear
    year = dates.dt.year

    # subtract off the first year and calculate the days
    year = year - min(year)
    day = dayofyear + year * 365
    return day

In [8]:
# convert date to a number of days after Jan 1st of the first year of data in the dataset
days = date_to_day(teams.ScoreFirstSubmittedDate)
teams['first_date_as_day'] = days

print('dates as timestamps:')
print(teams.ScoreFirstSubmittedDate[0:5])
print('dates as numbers:')
print(days[0:5])

# repeate for the last date
teams['last_date_as_day'] = date_to_day(teams.LastSubmissionDate)

dates as timestamps:
Id
497   2010-04-30
500   2010-05-02
503   2010-05-05
504   2010-05-11
505   2010-05-19
Name: ScoreFirstSubmittedDate, dtype: datetime64[ns]
dates as numbers:
Id
497    120
500    122
503    125
504    131
505    139
Name: ScoreFirstSubmittedDate, dtype: int64


In [9]:
print('teams matrix shape: ', teams.shape)
teams.head()

teams matrix shape:  (22234, 14)


Unnamed: 0_level_0,CompetitionId,TeamLeaderId,TeamName,ScoreFirstSubmittedDate,LastSubmissionDate,PublicLeaderboardSubmissionId,PrivateLeaderboardSubmissionId,IsBenchmark,Medal,MedalAwardDate,PublicLeaderboardRank,PrivateLeaderboardRank,first_date_as_day,last_date_as_day
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
497,2435,619.0,jonp,2010-04-30,2010-04-30,2182.0,2182.0,False,3.0,07/15/2016,41.0,25.0,120,120
500,2435,673.0,Thylacoleo,2010-05-02,2010-07-10,2187.0,2187.0,False,3.0,07/15/2016,31.0,23.0,122,191
503,2435,672.0,Fontanelles,2010-05-05,2010-05-08,2199.0,2199.0,False,3.0,07/15/2016,6.0,31.0,125,128
504,2435,727.0,IFM_bioinformatics,2010-05-11,2010-05-12,2203.0,2246.0,False,1.0,07/15/2016,13.0,9.0,131,132
505,2435,728.0,Amsterdam,2010-05-19,2010-05-19,2306.0,2308.0,False,2.0,07/15/2016,18.0,11.0,139,139


## merge submission and teams files

In [10]:
# add a joining column to the submissions matrix
submissions['join_teams_submissions'] = submissions.index
submissions['PublicLeaderboardScore'] = submissions.PublicScoreFullPrecision
submissions['PrivateLeaderboardScore'] = submissions.PrivateScoreFullPrecision

In [11]:
# extract the column to add (and keep column created for the join)
public_leaderboard = submissions[['join_teams_submissions', 'PublicLeaderboardScore']]
private_leaderboard = submissions[['join_teams_submissions', 'PrivateLeaderboardScore']]

In [12]:
# create a column with the same name in the teams matrix
teams['join_teams_submissions'] = teams.PublicLeaderboardSubmissionId

In [13]:
# perform the merge
teams_with_score = teams.join(public_leaderboard, on='join_teams_submissions', rsuffix='_j1')
teams_with_score = teams_with_score.join(private_leaderboard, on='join_teams_submissions', rsuffix='_j2')
print('shape of the team, public_leaderboard, and new teams_with_score data frames:')
print(teams.shape, public_leaderboard.shape, teams_with_score.shape)

# drop the columns added for joining
drop_cols = ['join_teams_submissions_j1', 'join_teams_submissions_j2', 'join_teams_submissions']
teams_with_score = teams_with_score.drop(drop_cols, axis=1)
print('shape of teams_with_score matrix after dropping redundant columns: ')
print(teams_with_score.shape)

shape of the team, public_leaderboard, and new teams_with_score data frames:
(22234, 15) (4837705, 2) (22234, 19)
shape of teams_with_score matrix after dropping redundant columns: 
(22234, 16)


## set up the matrix for the ML model

In [14]:
print('columns available for use: ')
print(teams_with_score.columns.values)


columns available for use: 
['CompetitionId' 'TeamLeaderId' 'TeamName' 'ScoreFirstSubmittedDate'
 'LastSubmissionDate' 'PublicLeaderboardSubmissionId'
 'PrivateLeaderboardSubmissionId' 'IsBenchmark' 'Medal' 'MedalAwardDate'
 'PublicLeaderboardRank' 'PrivateLeaderboardRank' 'first_date_as_day'
 'last_date_as_day' 'PublicLeaderboardScore' 'PrivateLeaderboardScore']


In [15]:
# select feature column. Must be in the list above
predict_col = 'PrivateLeaderboardRank'
print('selected feature column: ' + predict_col)

# select fetture columns to use. These must be selected from the list above
feature_cols = ['CompetitionId', 'first_date_as_day', 'last_date_as_day']
feature_matrix = teams_with_score[feature_cols]

# make sure things are the right shapes
print(feature_matrix.shape, 'shape of feature matrix')
prediction = teams_with_score[predict_col]
print(len(prediction), 'length of predictions')
print('Two of the numbers above should be the same.')

selected feature column: PrivateLeaderboardRank
(22234, 3) shape of feature matrix
22234 length of predictions
Two of the numbers above should be the same.


In [16]:
# TODO: FIX THIS - CURRENTLY IF ANYTHING IS DROPPED IT WILL BREAK because it is not also dropping from the 
#   predictions so they won't be the same length.


# nan's will break the regression, so drop them
print('feature matrix shape before and after droping missing values')
# If it gets much smaller, something is not working well. You might need to impute 
#   missing values (or look for a bug in your code)
print(feature_matrix.shape)
feature_matrix = feature_matrix.dropna(how='any')
print(feature_matrix.shape)

feature matrix shape before and after droping missing values
(22234, 3)
(22234, 3)


In [17]:
print('start of feature matrix:')
print(feature_matrix.head())
print('start of prediction matrix')
print(prediction.head())

start of feature matrix:
     CompetitionId  first_date_as_day  last_date_as_day
Id                                                     
497           2435                120               120
500           2435                122               191
503           2435                125               128
504           2435                131               132
505           2435                139               139
start of prediction matrix
Id
497    25.0
500    23.0
503    31.0
504     9.0
505    11.0
Name: PrivateLeaderboardRank, dtype: float64


## Create the ML model

In [18]:
# run a random forest regression
regr = ske.RandomForestRegressor(max_depth=2, random_state=0,
                             n_estimators=100)
regr.fit(feature_matrix, prediction)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

## Look at the results

In [19]:
print('features and their importance')
print(feature_matrix.columns.values)
print(regr.feature_importances_)

a = [2435, 130, 132]
print('prediction for', a)
print(regr.predict([a]))
a = [2435, 140, 250]
print('prediction for', a)
print(regr.predict([a]))
a = [4495, 130, 132]
print('prediction for', a)
print(regr.predict([a]))

features and their importance
['CompetitionId' 'first_date_as_day' 'last_date_as_day']
[0.99238626 0.00344092 0.00417282]
prediction for [2435, 130, 132]
[46.23803527]
prediction for [2435, 140, 250]
[46.23803527]
prediction for [4495, 130, 132]
[81.42070571]


## Create a model for the single largest competition
It should be an easier problem if all of the data is from the same competition

In [27]:
# find the most common competition
num_occur = feature_matrix.CompetitionId.value_counts()
print(num_occur.iloc[0:5])
competition_use = num_occur.index.values[0]

7082    515
4986    512
8076    455
6565    383
6649    377
Name: CompetitionId, dtype: int64
7082


In [38]:
# select the data from that commetition
features_competition = feature_matrix[feature_matrix.CompetitionId == competition_use]
features_competition = features_competition.drop('CompetitionId', axis=1)
predict_competition = prediction[feature_matrix.CompetitionId == competition_use]

# make sure things are the right shapes
print(features_competition.shape, 'shape of feature matrix')
print(len(predict_competition), 'length of predictions')
print('Two of the numbers above should be the same.')

(515, 2) shape of feature matrix
515 length of predictions
Two of the numbers above should be the same.


In [39]:
# run a random forest regression
regr2 = ske.RandomForestRegressor(max_depth=2, random_state=0,
                             n_estimators=100)
regr2.fit(features_competition, predict_competition)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [45]:
# look at the results
print('features and their importance')
print(features_competition.columns.values)
print(regr2.feature_importances_)

a = [2888, 2889]
print('prediction for', a)
print(regr2.predict([a]))
a = [2888, 2905]
print('prediction for', a)
print(regr2.predict([a]))
a = [2950, 2982]
print('prediction for', a)
print(regr2.predict([a]))


features and their importance
['first_date_as_day' 'last_date_as_day']
[0.58257537 0.41742463]
prediction for [2888, 2889]
[231.44241639]
prediction for [2888, 2905]
[231.44241639]
prediction for [2950, 2982]
[231.44241639]


In [47]:
print(features_competition.iloc[0:10])
print(features_competition.iloc[-10:])

        first_date_as_day  last_date_as_day
Id                                         
953151               2883              2888
953171               2888              2888
953176               2888              2888
953179               2887              2887
953184               2888              2888
953190               2885              2888
953217               2888              2888
953260               2884              2884
953303               2888              2888
953351               2888              2888
         first_date_as_day  last_date_as_day
Id                                          
1120859               2887              2888
1121170               2888              2888
1121904               2888              2888
1121907               2885              2885
1122260               2885              2886
1122497               2888              2888
1123362               2888              2888
1123509               2888              2888
1125527               

## Adding more features
I would like to add 
- the number of submissions
- the number of teammembers
- length of team name
- number of contests participated in (by leader or whole team?)