<a href="https://colab.research.google.com/github/hungpham89/AI4Code/blob/main/AI4Code_Competition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle competitions download AI4Code
! unzip AI4Code.zip

In [None]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm

# Widen the pandas text display so long cell sources remain readable.
pd.set_option('display.width', 180)
pd.set_option('display.max_colwidth', 120)

# Root directory of the competition files; the empty path resolves to the
# current working directory.
data_dir = Path('')

In [None]:
# Number of training notebooks to load. Keep this small in the early stages to speed up iteration.
NUM_TRAIN = 100


def read_notebook(path):
    """Load one notebook JSON file into a DataFrame.

    Parameters
    ----------
    path : pathlib.Path
        Location of the notebook's JSON file. The file stem is used as the
        notebook id.

    Returns
    -------
    pandas.DataFrame
        The parsed JSON with `cell_type` requested as categorical and
        `source` as string, an extra `id` column holding `path.stem`, and
        the index axis named `cell_id`.
    """
    column_dtypes = {'cell_type': 'category', 'source': 'str'}
    cells = pd.read_json(path, dtype=column_dtypes)
    # Tag every row with the notebook it came from.
    cells = cells.assign(id=path.stem)
    return cells.rename_axis('cell_id')

# Collect the notebook files. glob() yields paths in an arbitrary,
# filesystem-dependent order, so sort before slicing to make the
# NUM_TRAIN subset reproducible across runs and machines.
paths_train = sorted((data_dir / 'train').glob('*.json'))[:NUM_TRAIN]

# Load each JSON file into its own DataFrame and keep them in a list
notebooks_train = [
    read_notebook(path) for path in tqdm(paths_train, desc='Train NBs')
]
# Concatenate into one DataFrame indexed by (id, cell_id). Only the `id`
# level is sorted, so the cell order inside each notebook stays as loaded.
df = (
    pd.concat(notebooks_train)
    .set_index('id', append=True)
    .swaplevel()
    .sort_index(level='id', sort_remaining=False)
)

df

Train NBs: 100%|██████████| 100/100 [00:00<00:00, 198.29it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,cell_type,source
id,cell_id,Unnamed: 2_level_1,Unnamed: 3_level_1
046d76ed9f0d38,22818d72,code,import os # File manipulation\nimport pandas as pd # Data manipulation\nimport numpy as np # Mathematics\nimport mat...
046d76ed9f0d38,cdacd6bc,code,file_path_list = [] # The list to contain the CSV file paths.\n# Walk through the input directory and loop through t...
046d76ed9f0d38,d244e425,code,crime_data = pd.DataFrame(data=None) # Make a new DataFrame object.\n# For each of the file paths in the file path l...
046d76ed9f0d38,38052583,code,crime_data.shape
046d76ed9f0d38,441e0ca7,code,crime_data.info()
...,...,...,...
ff2ae3b0f2a417,0752e35e,code,!pip install git+https://github.com/glmcdona/LuxPythonEnvGym.git@main
ff2ae3b0f2a417,488e48e2,code,import time\nfrom luxai2021.game.game import Game\nfrom luxai2021.game.actions import *\nfrom luxai2021.game.constan...
ff2ae3b0f2a417,80c67b3a,markdown,## Test the original LuxAI2021 engine (TypeScript driven)
ff2ae3b0f2a417,b12c720c,markdown,# Lux AI 2021 python game engine performance comparison to original\nSee https://github.com/glmcdona/LuxPythonEnvGym...


In [None]:
# Pick one loaded notebook (the one at position 6 among the ids) as a running example
notebook_ids = df.index.unique('id')
nb_id = notebook_ids[6]
print('Notebook:', nb_id)

print("The disordered notebook:")
# Selecting on the first index level leaves a frame indexed by cell_id only
nb = df.loc[nb_id, :]
display(nb)
print()

Notebook: 11255201b53ada
The disordered notebook:


Unnamed: 0_level_0,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1
4c73ee23,code,import numpy as np\nimport pandas as pd\nimport plotly.express as px\nimport plotly.graph_objs as go\nimport matplot...
9b92a04a,code,"from plotly.offline import download_plotlyjs , init_notebook_mode , plot , iplot"
2bb62912,code,init_notebook_mode(connected=True)
dd660a37,code,%matplotlib inline
c74ae584,code,df = pd.read_csv('../input/pizza-restaurants-and-the-pizza-they-sell/8358_1.csv')
5fe7ca2c,code,#df.info\n#df.head(5)
feb8871b,code,top_pizza_type = pd.DataFrame(data=df['menus.name'].value_counts().head(10))
fe667680,code,top_pizza_type['type'] = top_pizza_type.index
eeceb23a,code,"fig,ax = plt.subplots(figsize = (18,8))\nsns.barplot(x='type' , y= 'menus.name' , data=top_pizza_type , ax=ax)"
f2a7e17e,code,"top_pizza_cities = df['city'].value_counts().head(15)\nfig,ax = plt.subplots(figsize = (12,6))\ntop_pizza_cities.plo..."





In [None]:
# Load the correct cell order of every training notebook as a Series indexed
# by notebook id. `read_csv(squeeze=True)` was deprecated in pandas 1.4 and
# removed in 2.0, so squeeze the single remaining column explicitly instead.
df_orders = (
    pd.read_csv(
        data_dir / 'train_orders.csv',
        index_col='id',
    )
    .squeeze('columns')
    .str.split()  # Split the string representation of cell_ids into a list
)

df_orders

id
00001756c60be8    [1862f0a6, 448eb224, 2a9e43d6, 7e2f170a, 038b763d, 77e56113, 2eefe0ef, 1ae087ab, 0beab1cd, 8ffe0b25, 9a78ab76, 0d136...
00015c83e2717b    [2e94bd7a, 3e99dee9, b5e286ea, da4f7550, c417225b, 51e3cd89, 2600b4eb, 75b65993, cf195f8b, 25699d02, 72b3201a, f2c75...
0001bdd4021779    [3fdc37be, 073782ca, 8ea7263c, 80543cd8, 38310c80, 073e27e5, 015d52a4, ad7679ef, 7fde4f04, 07c52510, 0a1a7a39, 0bcd3...
0001daf4c2c76d    [97266564, a898e555, 86605076, 76cc2642, ef279279, df6c939f, 2476da96, 00f87d0a, ae93e8e6, 58aadb1d, d20b0094, 986fd...
0002115f48f982                                 [9ec225f0, 18281c6c, e3b6b115, 4a044c54, 365fe576, a3188e54, b3f6e12d, ee7655ca, 84125b7a]
                                                                           ...                                                           
fffc30d5a0bc46    [09727c0c, ff1ea6a0, ddfef603, a01ce9b3, 3ba953ee, bf92a015, f4a0492a, 095812e6, 53125cfe, aa32a700, 63340e73, 06d8c...
fffc3b44869198    [978a5137, fa

In [None]:
# Look up the list of cell ids, in their correct order, for the example notebook
cell_order = df_orders.loc[nb_id]

print("The ordered notebook:")
# Selecting rows by that list puts the cells in the correct order
nb.loc[cell_order]


The ordered notebook:


Unnamed: 0_level_0,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1
4c73ee23,code,import numpy as np\nimport pandas as pd\nimport plotly.express as px\nimport plotly.graph_objs as go\nimport matplot...
9b92a04a,code,"from plotly.offline import download_plotlyjs , init_notebook_mode , plot , iplot"
2bb62912,code,init_notebook_mode(connected=True)
dd660a37,code,%matplotlib inline
c74ae584,code,df = pd.read_csv('../input/pizza-restaurants-and-the-pizza-they-sell/8358_1.csv')
5fe7ca2c,code,#df.info\n#df.head(5)
feb8871b,code,top_pizza_type = pd.DataFrame(data=df['menus.name'].value_counts().head(10))
fe667680,code,top_pizza_type['type'] = top_pizza_type.index
eeceb23a,code,"fig,ax = plt.subplots(figsize = (18,8))\nsns.barplot(x='type' , y= 'menus.name' , data=top_pizza_type , ax=ax)"
f2a7e17e,code,"top_pizza_cities = df['city'].value_counts().head(15)\nfig,ax = plt.subplots(figsize = (12,6))\ntop_pizza_cities.plo..."


# Cell ranks, validation split and features

In [None]:
def get_ranks(base, derived):
    """Return, for each element of `derived`, its position in `base`.

    Parameters
    ----------
    base : iterable of hashable
        Reference sequence (e.g. the cell ids in their correct order).
    derived : iterable of hashable
        Elements to look up (e.g. the cell ids in shuffled order).

    Returns
    -------
    list of int
        Zero-based position in `base` of every element of `derived`. If an
        element occurs several times in `base`, its first position is used.

    Raises
    ------
    ValueError
        If an element of `derived` is not present in `base`.
    """
    # One pass to build the lookup table avoids a linear `list.index` scan
    # per element. `setdefault` keeps the first occurrence, like list.index.
    position = {}
    for rank, item in enumerate(base):
        position.setdefault(item, rank)
    try:
        return [position[item] for item in derived]
    except KeyError as missing:
        # Keep the exception type raised by list.index for missing elements.
        raise ValueError(f'{missing.args[0]!r} is not in list') from None

# Rank (position in the correct order) of every cell of the example notebook
cell_ranks = get_ranks(cell_order, list(nb.index))
# Drop any `rank` column left over from a previous run first: DataFrame.insert
# raises ValueError when the column already exists, which made this cell fail
# when executed a second time. drop() also returns a new frame, so the insert
# below does not touch the object selected from `df`.
nb = nb.drop(columns='rank', errors='ignore')
nb.insert(0, 'rank', cell_ranks)

nb

In [None]:
from pandas.testing import assert_frame_equal

# Sanity check: sorting by `rank` must give the same frame as selecting the
# cells in the correct order.
nb_by_order = nb.loc[cell_order, :]
nb_by_rank = nb.sort_values('rank')
assert_frame_equal(nb_by_order, nb_by_rank)

In [None]:

# For every loaded notebook, pair the correct cell order (from df_orders)
# with the list of cell ids in the order they appear in `df`.
df_orders_ = df_orders.to_frame().join(
    df.reset_index('cell_id').groupby('id')['cell_id'].apply(list),
    how='right',
)

# The loop variables deliberately avoid the name `cell_order`: reusing it
# here overwrote the example notebook's order defined in an earlier cell,
# so re-running those cells afterwards silently used the wrong notebook.
ranks = {}
for id_, true_order, shuffled_ids in df_orders_.itertuples():
    ranks[id_] = {
        'cell_id': shuffled_ids,
        'rank': get_ranks(true_order, shuffled_ids),
    }

# One row per (id, cell_id) holding that cell's rank
df_ranks = (
    pd.DataFrame
    .from_dict(ranks, orient='index')
    .rename_axis('id')
    .apply(pd.Series.explode)
    .set_index('cell_id', append=True)
)

df_ranks

In [None]:
# Ancestor table, indexed by notebook id
ancestors_csv = data_dir / 'train_ancestors.csv'
df_ancestors = pd.read_csv(ancestors_csv, index_col='id')
df_ancestors

In [None]:
from sklearn.model_selection import GroupShuffleSplit

NVALID = 0.1  # size of validation set

# Notebook ids present in `df` and the ancestor of each one
ids = df.index.unique('id')
ancestors = df_ancestors.loc[ids, 'ancestor_id']

# Grouping by ancestor_id keeps notebooks with a common origin on the same
# side of the split
splitter = GroupShuffleSplit(n_splits=1, test_size=NVALID, random_state=0)
pos_train, pos_valid = next(splitter.split(ids, groups=ancestors))

# The splitter returns positions; map them back to notebook ids
ids_train = ids[pos_train]
ids_valid = ids[pos_valid]

df_train = df.loc[ids_train, :]
df_valid = df.loc[ids_valid, :]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Training set: tf-idf features of every cell's source text
tfidf = TfidfVectorizer(min_df=0.01)
X_train = tfidf.fit_transform(df_train['source'].astype(str))

# Rank rows of the training notebooks, looked up once and reused below
ranks_train = df_ranks.loc[ids_train]
# Rank of each cell within the notebook
y_train = ranks_train.to_numpy()
# Number of cells in each notebook. sort=False keeps the groups in the order
# in which the notebooks appear in the rows, rather than relying on the
# default key sort happening to match that order.
# NOTE(review): presumably consumed as per-notebook group sizes by a ranking
# model in a later cell — confirm against the caller.
groups = ranks_train.groupby('id', sort=False).size().to_numpy()

In [None]:
# Add code cell ordering
# Start from the tf-idf columns only. The original `X_train = hstack((X_train, ...))`
# appended one more column every time this cell was re-run; slicing back to
# the vocabulary width first makes the cell idempotent.
n_text_features = len(tfidf.vocabulary_)
X_text = X_train.tocsr()[:, :n_text_features]

# 1-based position of each cell among the cells of the same type in its
# notebook; kept for code cells and set to 0 for all other cells.
is_code = (df_train['cell_type'] == 'code').to_numpy()
position_in_type = (
    df_train
    .groupby(['id', 'cell_type'], observed=True)  # observed=True: only category values that occur
    .cumcount()
    .to_numpy() + 1
)

X_train = sparse.hstack((
    X_text,
    np.where(is_code, position_in_type, 0).reshape(-1, 1),
))
print(X_train.shape)