In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm

pd.options.display.width = 180
pd.options.display.max_colwidth = 120
from pandas.testing import assert_frame_equal
from sklearn.model_selection import GroupShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer


data_dir = Path('../input/AI4Code')

In [2]:
NUM_TRAIN = 20000


def read_notebook(path):
    return (
        pd.read_json(
            path,
            dtype={'cell_type': 'category', 'source': 'str'})
        .assign(id=path.stem)
        .rename_axis('cell_id')
    )


paths_train = list((data_dir / 'train').glob('*.json'))[:NUM_TRAIN]
notebooks_train = [
    read_notebook(path) for path in tqdm(paths_train, desc='Train NBs')
]
df = (
    pd.concat(notebooks_train)
    .set_index('id', append=True)
    .swaplevel()
    .sort_index(level='id', sort_remaining=False)
)

df

Train NBs: 100%|██████████| 20000/20000 [04:00<00:00, 83.10it/s] 


Unnamed: 0_level_0,Unnamed: 1_level_0,cell_type,source
id,cell_id,Unnamed: 2_level_1,Unnamed: 3_level_1
0001bdd4021779,3fdc37be,code,import pandas as pd\nimport numpy as np \nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport warnings as...
0001bdd4021779,073782ca,code,"df = pd.read_csv(""/kaggle/input/us-police-shootings/shootings.csv"")"
0001bdd4021779,8ea7263c,code,df.head()
0001bdd4021779,80543cd8,code,df.isna().sum()
0001bdd4021779,38310c80,code,"sns.set()\nsns.countplot(df[""gender""])\nplt.title(""The Gender of Killed Person"")\nplt.show()\n"
...,...,...,...
fffcd063cda949,fb7456dd,markdown,### find a proper learning rate
fffcd063cda949,055e0d2e,markdown,### Show an image with label
fffcd063cda949,4f671884,markdown,### Show random transformations of the same image
fffcd063cda949,28f8bc15,markdown,### Create a databunch instance


In [3]:
df_orders = pd.read_csv(
    data_dir / 'train_orders.csv',
    index_col='id',
    squeeze=True,
).str.split()  # Split the string representation of cell_ids into a list

df_orders

id
00001756c60be8    [1862f0a6, 448eb224, 2a9e43d6, 7e2f170a, 038b763d, 77e56113, 2eefe0ef, 1ae087ab, 0beab1cd, 8ffe0b25, 9a78ab76, 0d136...
00015c83e2717b    [2e94bd7a, 3e99dee9, b5e286ea, da4f7550, c417225b, 51e3cd89, 2600b4eb, 75b65993, cf195f8b, 25699d02, 72b3201a, f2c75...
0001bdd4021779    [3fdc37be, 073782ca, 8ea7263c, 80543cd8, 38310c80, 073e27e5, 015d52a4, ad7679ef, 7fde4f04, 07c52510, 0a1a7a39, 0bcd3...
0001daf4c2c76d    [97266564, a898e555, 86605076, 76cc2642, ef279279, df6c939f, 2476da96, 00f87d0a, ae93e8e6, 58aadb1d, d20b0094, 986fd...
0002115f48f982                                 [9ec225f0, 18281c6c, e3b6b115, 4a044c54, 365fe576, a3188e54, b3f6e12d, ee7655ca, 84125b7a]
                                                                           ...                                                           
fffc30d5a0bc46    [09727c0c, ff1ea6a0, ddfef603, a01ce9b3, 3ba953ee, bf92a015, f4a0492a, 095812e6, 53125cfe, aa32a700, 63340e73, 06d8c...
fffc3b44869198    [978a5137, fa

In [4]:
nb_id = df.index.unique('id')[6]
nb = df.loc[nb_id, :]
# Get the correct order
cell_order = df_orders.loc[nb_id]
nb.loc[cell_order, :]

Unnamed: 0_level_0,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1
769e7067,code,import pandas as pd\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport os\nimport warnings\nwarnings.filte...
1d3c29ce,code,df.dtypes
239faa9b,code,df.head(10)
33edf629,code,df['sex'].value_counts()
e1911a5d,code,df['smoker'].value_counts()
f4600331,markdown,Looks like there are alot of non-smokers in our datset whereas for gender it is almost close. Before any analysis we...
9c8ae28c,code,df['smoker']=df['smoker'].apply(lambda x: 0 if x=='no' else 1)\ndf['sex']=df['sex'].apply(lambda x: 0 if x=='female'...
b228b07f,code,df.head()
927dbf5f,markdown,"First of all, i have less experience in visualization and learned about seaborn after writing this block"
c01cf011,code,p=np.arange(len(df['smoker'].unique()))\nsum_of_smokers=len(df['smoker'])\nnon_smokers=0\nsmokers=0\nfor x in df['sm...


In [5]:
def getRanking(base, derived):
    return [base.index(d) for d in derived]

cell_ranks = getRanking(cell_order, nb.index.tolist())
nb.insert(0, 'rank', cell_ranks)

nb

Unnamed: 0_level_0,rank,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
769e7067,0,code,import pandas as pd\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport os\nimport warnings\nwarnings.filte...
1d3c29ce,1,code,df.dtypes
239faa9b,2,code,df.head(10)
33edf629,3,code,df['sex'].value_counts()
e1911a5d,4,code,df['smoker'].value_counts()
9c8ae28c,6,code,df['smoker']=df['smoker'].apply(lambda x: 0 if x=='no' else 1)\ndf['sex']=df['sex'].apply(lambda x: 0 if x=='female'...
b228b07f,7,code,df.head()
c01cf011,9,code,p=np.arange(len(df['smoker'].unique()))\nsum_of_smokers=len(df['smoker'])\nnon_smokers=0\nsmokers=0\nfor x in df['sm...
135fd855,10,code,"percentage_of_smokers= ""{0:.2f}"".format((smokers/float(sum_of_smokers))*100)\npercentage_of_non_smokers= ""{0:.2f}"".f..."
88b80d52,11,code,"plt.bar(p,sm,color = ['r','b'])\nplt.xticks(p,[""Non-Smokers"",""Smokers""])\nplt.text(0, 500 ,percentage_of_smokers+'%'..."


In [6]:
assert nb.loc[cell_order].equals(nb.sort_values('rank'))

In [7]:
# Joining df_orders with cell_id lists based on id
df_orders_ = df_orders.to_frame().join(
    df.reset_index('cell_id').groupby('id')['cell_id'].apply(list),
    how='right'
)

# Calculating ranks for each id
ranks = {}
for id_, cell_order, cell_id in df_orders_.itertuples():
    ranks[id_] = {'cell_id': cell_id, 'rank': getRanking(cell_order, cell_id)}

# Creating DataFrame from ranks dictionary
df_ranks = pd.DataFrame.from_dict(ranks, orient='index') \
                       .rename_axis('id') \
                       .apply(pd.Series.explode) \
                       .set_index('cell_id', append=True)

df_ranks


Unnamed: 0_level_0,Unnamed: 1_level_0,rank
id,cell_id,Unnamed: 2_level_1
0001bdd4021779,3fdc37be,0
0001bdd4021779,073782ca,1
0001bdd4021779,8ea7263c,2
0001bdd4021779,80543cd8,3
0001bdd4021779,38310c80,4
...,...,...
fffcd063cda949,fb7456dd,28
fffcd063cda949,055e0d2e,13
fffcd063cda949,4f671884,22
fffcd063cda949,28f8bc15,19


In [8]:
df_ancestors = pd.read_csv(data_dir / 'train_ancestors.csv', index_col='id')
df_ancestors

Unnamed: 0_level_0,ancestor_id,parent_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1
00001756c60be8,945aea18,
00015c83e2717b,aa2da37e,317b65d12af9df
0001bdd4021779,a7711fde,
0001daf4c2c76d,090152ca,
0002115f48f982,272b483a,
...,...,...
fffc30d5a0bc46,6aed207b,
fffc3b44869198,a6aaa8d7,
fffc63ff750064,0a1b5b65,
fffcd063cda949,d971e960,


In [9]:
NVALID = 0.2  # size of validation set

splitter = GroupShuffleSplit(n_splits=1, test_size=NVALID, random_state=0)

# Splitting the data, keeping notebooks with a common origin (ancestor_id) together
ids = df.index.unique('id')
ancestors = df_ancestors.loc[ids, 'ancestor_id']

ids_train, ids_valid = next(splitter.split(ids, groups=ancestors))
ids_train, ids_valid = ids[ids_train], ids[ids_valid]

# Creating train and validation dataframes
df_train = df.loc[ids_train]
df_valid = df.loc[ids_valid]


In [10]:
# Training set
tfidf = TfidfVectorizer(min_df=0.01)
X_train = tfidf.fit_transform(df_train['source'].astype(str))
# Rank of each cell within the notebook
y_train = df_ranks.loc[ids_train].to_numpy()
# Number of cells in each notebook
groups = df_ranks.loc[ids_train].groupby('id').size().to_numpy()

In [None]:
# Add code cell ordering
code_cell_order = np.where(
    df_train['cell_type'] == 'code',
    df_train.groupby(['id', 'cell_type']).cumcount().to_numpy() + 1,
    0
).reshape(-1, 1)

X_train = sparse.hstack((X_train, code_cell_order))
print(X_train.shape)


(735729, 284)


In [12]:
from xgboost import XGBRanker

model = XGBRanker(
    min_child_weight=12,
    subsample=0.7,
    tree_method='hist',
)
model.fit(X_train, y_train, group=groups)

XGBRanker(base_score=0.5, booster='gbtree', callbacks=None, colsample_bylevel=1,
          colsample_bynode=1, colsample_bytree=1, early_stopping_rounds=None,
          enable_categorical=False, eval_metric=None, gamma=0, gpu_id=-1,
          grow_policy='depthwise', importance_type=None,
          interaction_constraints='', learning_rate=0.300000012, max_bin=256,
          max_cat_to_onehot=4, max_delta_step=0, max_depth=6, max_leaves=0,
          min_child_weight=10, missing=nan, monotone_constraints='()',
          n_estimators=100, n_jobs=0, num_parallel_tree=1, predictor='auto',
          random_state=0, reg_alpha=0, reg_lambda=1, ...)

In [13]:
# Validation set
X_valid = tfidf.transform(df_valid['source'].astype(str))
# The metric uses cell ids
y_valid = df_orders.loc[ids_valid]

X_valid = sparse.hstack((
    X_valid,
    np.where(
        df_valid['cell_type'] == 'code',
        df_valid.groupby(['id', 'cell_type']).cumcount().to_numpy() + 1,
        0,
    ).reshape(-1, 1)
))

In [14]:
y_pred = pd.DataFrame({'rank': model.predict(X_valid)}, index=df_valid.index)
y_pred = (
    y_pred
    .sort_values(['id', 'rank'])  # Sort the cells in each notebook by their rank.
                                  # The cell_ids are now in the order the model predicted.
    .reset_index('cell_id')  # Convert the cell_id index into a column.
    .groupby('id')['cell_id'].apply(list)  # Group the cell_ids for each notebook into a list.
)
y_pred.head(10)

id
0008ba887b3817    [006235ba, f643db5c, bf698053, 7503fa66, 0f8b6578, e459498c, f526bf2d, 4edde13d, aadc7e71, 0f25fcd4, f47c6acf, 8c752...
0023886d8f785a    [ba4a7d37, 7327e332, 684bf776, 10cf8639, cc785f1d, 124b2390, d68ed4f1, 1c298963, 9a5b323e, 1fab8ac2, c340bbb5, efea9...
0023fe53ace4bb    [5214fe1e, 0b04591a, 4287afca, 9fe41d6a, d91114cd, de9c3eec, 8c8b5492, 06925db7, 2572559e, ee638789, 266ea54e, 46f9d...
00275db0185bb5    [156a75c8, 3a248a59, c28a1781, 4f6569d1, 07dabe1b, 54f8f652, 2f68afa6, 3a744d01, 772c6dbd, 775f2299, e7c2268e, ed7fa...
002aed65301beb    [48716f4c, 0823d03c, 0d2f52d8, f43790ac, 0c0d4bdb, cfd059d2, 4a0a07d6, 47c64160, d6fb2a94, 7b4881a9, b2fc391b, bbae1...
002e8d4d495b57    [8eb7c9c8, fa988a31, aa0d2c3e, 21631ecf, 7fdf2bef, 381c99d0, 1fb18315, f7b9bfbc, 01ff8860, cd05a32e, 82b3c7e7, 7442b...
00478fd20787a7                                                     [5925c561, d0b0660f, 2eb29dd9, fc72c332, ece9f11a, 5aaab0a0, e57a9cdc]
006bacc7c7929b    [1b7dd7ae, 43

In [15]:
nb_id = df_valid.index.get_level_values('id').unique()[8]

display(df.loc[nb_id])
display(df.loc[nb_id].loc[y_pred.loc[nb_id]])

Unnamed: 0_level_0,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1
a2a0cfc1,code,import pandas as pd\nimport numpy as np\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom datetime impor...
4691c7a2,code,"df = pd.read_csv(""../input/tabular-playground-series-jan-2022/train.csv"", index_col = 'row_id')"
ecd67bfc,code,df.head()
cf5456ae,code,df.info()
45540435,code,df.describe()
...,...,...
39090d2e,markdown,### Increase in the selling numbers can be seen on weekends (Day 5 and Day 6)
7978769b,markdown,## Data Cleaning and feature engineering
e62bf20c,markdown,"### January, April and December saw increase in selling numbers and kaggle hat was leading the way.\n### Reason coul..."
bbe85f1c,markdown,### Norway has the highest num sold for all the products.\n### Finland has the lowest num sold\n### Kaggle hat is so...


Unnamed: 0_level_0,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1
a2a0cfc1,code,import pandas as pd\nimport numpy as np\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom datetime impor...
4691c7a2,code,"df = pd.read_csv(""../input/tabular-playground-series-jan-2022/train.csv"", index_col = 'row_id')"
ecd67bfc,code,df.head()
cf5456ae,code,df.info()
45540435,code,df.describe()
...,...,...
76661980,code,"output = np.ceil(inv_boxcox(y_pred, lam))"
2dceb6a9,code,yp = model.predict(X)
af9d2bfe,code,"data1 = pd.DataFrame({'row_id': df_test.index,\n 'num_sold': output})"
a182f341,code,y_pred = model.predict(df_test)


In [16]:
from bisect import bisect


def count_inversions(a):
    inversions = 0
    sorted_so_far = []
    for i, u in enumerate(a):
        j = bisect(sorted_so_far, u)
        inversions += i - j
        sorted_so_far.insert(j, u)
    return inversions


def kendall_tau(ground_truth, predictions):
    total_inversions = 0
    total_2max = 0  # twice the maximum possible inversions across all instances
    for gt, pred in zip(ground_truth, predictions):
        ranks = [gt.index(x) for x in pred]  # rank predicted order in terms of ground truth
        total_inversions += count_inversions(ranks)
        n = len(gt)
        total_2max += n * (n - 1)
    return 1 - 4 * total_inversions / total_2max

In [17]:
y_dummy = df_valid.reset_index('cell_id').groupby('id')['cell_id'].apply(list)
kendall_tau(y_valid, y_dummy)

0.4108294551543308

In [18]:
kendall_tau(y_valid, y_pred)

0.5912936052542466