# Meta-Kaggle ML example

Goal - predict user ranking from:
 - first and last submission date
 - team leader tier (experience level)


In [1]:
def date_to_day(dates):
    """
    Convert a pandas series of datetimes into a numeric day count.

    The count is 1-based and starts on Jan 1st of the earliest year found in
    ``dates`` (so Jan 1st of that year maps to 1). Leap years are taken into
    account, which keeps consecutive calendar days one unit apart across year
    boundaries.

    Parameters
    ----------
    dates : pandas.Series
        Series with a datetime dtype (anything supporting the ``.dt`` accessor).

    Returns
    -------
    pandas.Series
        Day numbers aligned with the index of ``dates``. Missing dates (NaT)
        yield NaN; an empty input yields an empty series.
    """
    day_of_year = dates.dt.dayofyear
    year = dates.dt.year
    # Series.min() skips NaT/NaN entries and does not raise on empty input,
    # unlike the builtin min()
    first_year = year.min()

    def leap_days_through(y):
        # number of Gregorian leap years in the range 1..y (inclusive)
        return y // 4 - y // 100 + y // 400

    # days elapsed between Jan 1st of the first year and Jan 1st of each
    # date's own year: 365 per year plus one for every leap year in between
    days_before_year = (
        (year - first_year) * 365
        + leap_days_through(year - 1)
        - leap_days_through(first_year - 1)
    )
    return day_of_year + days_before_year

In [2]:
def ml_drop(feature, predict):
    """
    Remove incomplete and duplicated samples before model fitting.

    The feature dataframe and the prediction series are placed side by side,
    every row with at least one missing value is removed, exact duplicate
    rows are removed, and the two parts are split apart again. Because both
    parts are filtered together they stay aligned row for row.

    Returns a tuple ``(features, predictions)``: all but the last column of
    the cleaned table, and its last column.
    """
    cleaned = (
        pd.concat([feature, predict], axis=1)
        .dropna(how='any')
        .drop_duplicates()
    )
    # the prediction values were appended last, so they sit in the final column
    kept_predict = cleaned.iloc[:, -1]
    kept_feature = cleaned.iloc[:, :-1]
    return kept_feature, kept_predict

In [3]:
def add_scores_to_teams(submissions, teams, verbose=False):
    """
    Adds the public and private leaderboard scores to the teams matrix. Assumes
    the columns have the names provided in the meta-kaggle csv files.

    Each team's ``PublicLeaderboardSubmissionId`` is looked up in the index of
    ``submissions``; the ``PublicScoreFullPrecision`` and
    ``PrivateScoreFullPrecision`` values of that submission are attached as the
    new columns ``PublicLeaderboardScore`` and ``PrivateLeaderboardScore``.
    Teams whose submission id is not found get NaN scores (left join).

    Neither input dataframe is modified.

    Parameters
    ----------
    submissions : pandas.DataFrame
        Submissions table, indexed by the submission id.
    teams : pandas.DataFrame
        Teams table containing a ``PublicLeaderboardSubmissionId`` column.
    verbose : bool, optional
        When True, print the shapes of the tables involved.

    Returns
    -------
    pandas.DataFrame
        A new dataframe: ``teams`` plus the two score columns.
    """
    # named series can be joined directly, so no helper column has to be
    # written into the caller's dataframes
    public_leaderboard = submissions['PublicScoreFullPrecision'].rename('PublicLeaderboardScore')
    private_leaderboard = submissions['PrivateScoreFullPrecision'].rename('PrivateLeaderboardScore')

    # perform the merge: team submission id -> submissions index
    # (the suffixes only matter if teams already has a column of the same name)
    teams_with_score = teams.join(public_leaderboard, on='PublicLeaderboardSubmissionId', rsuffix='_j1')
    teams_with_score = teams_with_score.join(private_leaderboard, on='PublicLeaderboardSubmissionId', rsuffix='_j2')
    if verbose:
        print('shape of the team, public_leaderboard, and new teams_with_score data frames:')
        print(teams.shape, public_leaderboard.shape, teams_with_score.shape)
        print('shape of the final teams_with_score matrix: ')
        print(teams_with_score.shape)
    return teams_with_score

In [4]:
def add_leader_tier(users, teams, verbose=False):
    """
    Adds the tier (measure of kaggle experience) of the leader to the teams matrix.
    Assumes the columns have the names provided in the meta-kaggle csv files.

    Each team's ``TeamLeaderId`` is looked up in the index of ``users`` and the
    user's columns are attached to the team row (left join, so teams without a
    matching user get NaN). The ``UserName`` and ``DisplayName`` columns are
    removed from the result. A user column whose name already exists in
    ``teams`` is kept with the suffix ``_j1``.

    Neither input dataframe is modified.

    Parameters
    ----------
    users : pandas.DataFrame
        Users table, indexed by the user id.
    teams : pandas.DataFrame
        Teams table containing a ``TeamLeaderId`` column.
    verbose : bool, optional
        When True, print the shapes and final columns of the tables involved.

    Returns
    -------
    pandas.DataFrame
        A new dataframe: ``teams`` plus the remaining user columns.
    """
    # perform the merge directly on the existing key column; no helper column
    # needs to be written into the caller's dataframes
    teams_with_tier = teams.join(users, on='TeamLeaderId', rsuffix='_j1')
    if verbose:
        print('shape of the teams, users, and new teams_with_tier data frames:')
        print(teams.shape, users.shape, teams_with_tier.shape)
    # drop the user name columns, which are not useful as model features
    drop_cols = ['UserName', 'DisplayName']
    teams_with_tier = teams_with_tier.drop(drop_cols, axis=1)
    if verbose:
        print('shape of teams_with_tier matrix after dropping redundant columns: ')
        print(teams_with_tier.shape)
        print('columns in final dataframe: ')
        print(teams_with_tier.columns.values)
    return teams_with_tier

In [5]:
import pandas as pd
import glob
import sklearn.ensemble as ske
import meta_kaggle_utils as utils
import numpy as np

## Set up file paths
If you unzipped the meta-kaggle data file into a different folder, change `data_location` below.

In [6]:
# folder holding the unzipped meta-kaggle csv files; it is concatenated directly
# with the file names below, so keep the trailing slash
data_location = 'meta-kaggle/'

# csv files read by this notebook
users_file = 'Users.csv'
team_file_name = 'Teams.csv'
submission_file_name = 'Submissions.csv'

## load files

In [7]:
# load the users table with the project helper
# NOTE(review): add_leader_tier looks TeamLeaderId up in users.index, so
#   load_kaggle_csv presumably indexes the table by the user id - confirm
users = utils.load_kaggle_csv(data_location + users_file)

  mask |= (ar1 == a)


The file contains 2919592 rows.
The table contains the following columns: 
['UserName' 'DisplayName' 'RegisterDate' 'PerformanceTier']


In [None]:
# read the submissions table
submissions = utils.load_kaggle_csv(data_location + submission_file_name)
# make both score columns numeric; entries that cannot be parsed become NaN
for score_col in ('PublicScoreFullPrecision', 'PrivateScoreFullPrecision'):
    submissions[score_col] = pd.to_numeric(submissions[score_col], errors='coerce')

  if self.run_code(code, result):
  mask |= (ar1 == a)


In [None]:
# read the teams table and keep only rows without any missing value
# (this removes, among others, the teams that never submitted anything)
teams = (
    utils.load_kaggle_csv(data_location + team_file_name)
    .dropna(axis=0, how='any')
)

In [None]:
# convert the date columns to datetime objects
print('type of dates before and after conversion to timestamps: ')
# inspect the first remaining row by position: a fixed index label is not
# guaranteed to exist once rows with missing values have been dropped
print(type(teams['ScoreFirstSubmittedDate'].iloc[0]))
teams['ScoreFirstSubmittedDate'] = pd.to_datetime(teams['ScoreFirstSubmittedDate'])
print(type(teams['ScoreFirstSubmittedDate'].iloc[0]))
# repeat with the last submission date
teams['LastSubmissionDate'] = pd.to_datetime(teams['LastSubmissionDate'])

In [None]:
# express the first submission date as a number of days, counted from Jan 1st
# of the first year of data in the dataset
days = date_to_day(teams.ScoreFirstSubmittedDate)
teams['first_date_as_day'] = days

# compare a few timestamps with the numbers derived from them
print('dates as timestamps:')
print(teams.ScoreFirstSubmittedDate[:5])
print('dates as numbers:')
print(days[:5])

# same conversion for the last submission date
teams['last_date_as_day'] = date_to_day(teams.LastSubmissionDate)

In [None]:
# sanity check: dimensions and first rows of the teams table, including the
# first_date_as_day / last_date_as_day columns
print('teams matrix shape: ', teams.shape)
teams.head()

## Add features to teams matrix

In [None]:
# attach the public and private scores of each team's public leaderboard
# submission; verbose=True prints the table shapes
teams_with_score = add_scores_to_teams(submissions, teams, verbose=True)

In [None]:
# attach the team leader's user columns (including PerformanceTier);
# verbose=True prints the table shapes and the final column names
teams_with_tier = add_leader_tier(users, teams_with_score, verbose=True)

## set up the matrix for the ML model

In [None]:
# list the column names that can be chosen as features or as prediction target
print('columns available for use: ')
print(teams_with_tier.columns.values)


In [None]:
# select the column to predict (the target). Must be in the list above
predict_col = 'PrivateLeaderboardRank'
print('selected prediction column: ' + predict_col)

# select the feature columns to use. These must be selected from the list above
feature_cols = ['CompetitionId', 'first_date_as_day', 'last_date_as_day', 'PerformanceTier']

# split the table into the model inputs and the values to predict
feature_matrix = teams_with_tier[feature_cols]
prediction = teams_with_tier[predict_col]

In [None]:
# drop rows with missing values & duplicate rows; ml_drop filters the features
# and the prediction values together so they stay aligned
print('feature matrix shape before and after droping missing values')
# If it gets much smaller, something is not working well. You might need to impute
#   missing values (or look for a bug in your code)
print(feature_matrix.shape, prediction.shape)
feature_matrix, prediction = ml_drop(feature_matrix, prediction)
print(feature_matrix.shape, prediction.shape)

In [None]:
# show the first rows of the cleaned features and of the matching prediction values
print('start of feature matrix:')
print(feature_matrix.head())
print('start of prediction matrix')
print(prediction.head())

In [None]:
# make sure things are the right shapes: the number of rows in the feature
# matrix must equal the number of prediction values
print(feature_matrix.shape, 'shape of feature matrix')
print(len(prediction), 'length of predictions')
print('Two of the numbers above should be the same.')

## Create the ML model

In [None]:
# fit a shallow random forest regressor on all competitions;
# oob_score=True makes the out-of-bag score available after fitting
regr = ske.RandomForestRegressor(
    n_estimators=100,
    max_depth=2,
    oob_score=True,
    random_state=0,
)
regr.fit(feature_matrix, prediction)

## Look at the results

In [None]:
# out-of-bag score of the fitted forest
print(regr.oob_score_, 'oob score')
print()
# importance of each feature, printed in the same order as the column names
print('features and their importance')
print(feature_matrix.columns.values)
print(regr.feature_importances_)
print()

# example predictions; the values follow the order of feature_cols:
# CompetitionId, first_date_as_day, last_date_as_day, PerformanceTier
for a in ([2435, 130, 132, 1], [4495, 130, 132, 1]):
    print('prediction for', a)
    print(regr.predict([a]))

## Create a model for the single largest competition
The problem should be easier when all of the data comes from the same competition.

In [None]:
# find the most common competition
num_occur = feature_matrix.CompetitionId.value_counts()
print(num_occur.iloc[0:10])


In [None]:
# id of the competition to model. It is fixed by hand here; the commented-out
# line below would instead pick the competition with the most rows.
# NOTE(review): the hand-written example inputs in the results cell further
#   down presumably assume this competition's date range - confirm before changing
# competition_use = num_occur.index.values[0]
competition_use = 8076

In [None]:
# keep only the rows that belong to the selected competition
in_competition = feature_matrix.CompetitionId == competition_use
# the competition id is constant within the subset, so it is no longer a feature
features_competition = feature_matrix[in_competition].drop('CompetitionId', axis=1)
predict_competition = prediction[in_competition]

# the row count of the features must match the number of prediction values
print(features_competition.shape, 'shape of feature matrix')
print(len(predict_competition), 'length of predictions')
print('Two of the numbers above should be the same.')

In [None]:
# fit the same kind of shallow random forest on the single-competition data
regr2 = ske.RandomForestRegressor(
    n_estimators=100,
    max_depth=2,
    oob_score=True,
    random_state=0,
)
regr2.fit(features_competition, predict_competition)

In [None]:
# results of the single-competition model
print(regr2.oob_score_, 'oob score')

print('features and their importance')
print(features_competition.columns.values)
print(regr2.feature_importances_)

# example predictions; the values follow the remaining feature columns:
# first_date_as_day, last_date_as_day, PerformanceTier
for a in ([2888, 2889, 1], [2888, 2889, 5], [2950, 2982, 1]):
    print('prediction for', a)
    print(regr2.predict([a]))


In [None]:
# first and last ten rows of the competition features; useful for picking
# reasonable values when writing new example predictions
print(features_competition.head(10))
print(features_competition.tail(10))

## Adding more features

Features that could be added:
- the number of submissions
- the number of team members
- number of contests participated in
- number of forum posts
- number of kernels published
