<a href="https://colab.research.google.com/github/jasonsgraham/nlp_notes/blob/main/nlp_starter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup #

In [2]:
#@title Default title text
# Download the setup helper from a gist URL pinned to a specific revision hash and
# save it locally as colab_setup.py (-q silences wget's progress output).
!wget -q https://gist.githubusercontent.com/jasonsgraham/f63e1737121e2154ee3ad228398137e2/raw/7f0bb2c8e756e43cabcd90027dedfb79670132ae/setup_colab.py -O colab_setup.py
# Execute the downloaded script from this notebook.
# NOTE(review): this runs remotely hosted code; the recorded output suggests it
# loads a WANDB API key -- confirm the gist contents before running.
%run colab_setup.py

Loading WANDB api key.


In [4]:
%%sh
pip install -q --upgrade transformers
pip install -q --upgrade wandb
pip install -q --upgrade mlflow

Check if notebook is running in Colab or Kaggle

In [5]:
try:
    import mlflow
except ImportError as e:
    !pip install mlflow
    import mlflow

In [6]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm

# Widen pandas' text rendering so long cell sources stay readable in outputs.
pd.set_option('display.width', 180)
pd.set_option('display.max_colwidth', 120)

In [7]:
import sys

# True when the `google.colab` package has already been imported into this process.
GOOGLE_COLAB = 'google.colab' in sys.modules

if GOOGLE_COLAB:
    # Paths under /content/drive (presumably the mounted Google Drive).
    data_dir = Path('/content/drive/MyDrive/Colab Notebooks/input/AI4Code')
    output_dir = Path('/content/drive/MyDrive/Colab Notebooks/output/AI4Code')
    train_parquet_file = data_dir / 'train.parquet'
else:
    data_dir = Path('../input/AI4Code')
    output_dir = Path('./')
    # The parquet cache path must exist on this branch too; it was previously
    # left undefined here, so the loading cell raised NameError outside Colab.
    # NOTE(review): placed in output_dir on the assumption that ../input is a
    # read-only dataset mount (as on Kaggle) -- confirm.
    train_parquet_file = output_dir / 'train.parquet'

In [None]:
# Correct cell order of every training notebook, indexed by notebook id.
# `read_csv(squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0, so the
# single remaining column is squeezed into a Series explicitly instead.
df_orders = (
    pd.read_csv(data_dir / 'train_orders.csv', index_col='id')
    .squeeze('columns')
    .str.split()  # Split the string representation of cell_ids into a list
)

df_orders

In [10]:
# Maximum number of training notebooks to load.
NUM_TRAIN = 10000


def read_notebook(path):
    """Load one notebook JSON file into a DataFrame of its cells.

    Parameters
    ----------
    path : pathlib.Path
        Location of the notebook's ``.json`` file; its stem is stored in an
        ``id`` column on every row.

    Returns
    -------
    pandas.DataFrame
        The parsed cells with ``cell_type`` read as a categorical and ``source``
        as a string, an added ``id`` column, and the index named ``cell_id``.
    """
    column_dtypes = {'cell_type': 'category', 'source': 'str'}
    cells = pd.read_json(path, dtype=column_dtypes)
    cells = cells.assign(id=path.stem)
    return cells.rename_axis('cell_id')

# Build the combined cell table once and cache it as parquet; later runs reload it.
# NOTE: an existing cache is reused as-is, even if NUM_TRAIN changed since it was written.
if train_parquet_file.exists():
    df = pd.read_parquet(train_parquet_file)
else:
    # glob() yields files in arbitrary, filesystem-dependent order; sort first so
    # the NUM_TRAIN subset is the same on every run and machine.
    paths_train = sorted((data_dir / 'train').glob('*.json'))[:NUM_TRAIN]
    notebooks_train = [
        read_notebook(path) for path in tqdm(paths_train, desc='Train NBs')
    ]
    df = (
        pd.concat(notebooks_train)
        .set_index('id', append=True)  # index: (cell_id, id)
        .swaplevel()                   # index: (id, cell_id)
        # Order rows by notebook id only; cell order within a notebook is untouched.
        .sort_index(level='id', sort_remaining=False)
    )
    df.to_parquet(train_parquet_file)

In [15]:
# Size of the cell-order Series loaded from train_orders.csv.
df_orders.shape

(139256,)

In [None]:
# Display the combined cell table (pandas truncates the rendering of long frames).
df

In [19]:
# Pick one notebook (position 6 among the unique ids) as a worked example.
nb_id = df.index.unique('id')[6]
# Get the correct order
cell_order = df_orders.loc[nb_id]
# All cells of that notebook, in the order they are stored in `df`.
nb = df.loc[nb_id, :]
# NOTE(review): the label says "ordered notebook", but the value displayed below is
# the list of cell ids in their correct order; the line that would show the
# re-ordered cells is commented out.
print("The ordered notebook:")
#nb.loc[cell_order, :]
cell_order

The ordered notebook:


['3e551fb7',
 '45049ad8',
 '8bb41691',
 '123b4f4c',
 '0b92cb59',
 '5a8b6e2d',
 'df963df4',
 '3c7d19bc',
 '0f3db81b',
 'eadf5c66',
 '33ff3073',
 '6cfbe868',
 '88cc83b2',
 '818c4c15']

In [20]:
def get_ranks(base, derived):
    """Return, for each item of `derived`, its position in `base`.

    Parameters
    ----------
    base : sequence
        Items in their reference order. Items must be hashable. If an item
        occurs more than once, its first position is used.
    derived : iterable
        Items to look up in `base`.

    Returns
    -------
    list of int
        ``base`` positions, one per item of `derived`, in `derived`'s order.

    Raises
    ------
    ValueError
        If an item of `derived` does not occur in `base`.
    """
    # Build the position map once instead of scanning `base` for every item.
    position = {}
    for idx, item in enumerate(base):
        position.setdefault(item, idx)  # keep the first occurrence, like list.index
    try:
        return [position[item] for item in derived]
    except KeyError as err:
        # Preserve the exception type that list.index raised for a missing item.
        raise ValueError(f'{err.args[0]!r} is not in list') from None

# Position of each cell (in its stored order) within the correct order.
cell_ranks = get_ranks(cell_order, list(nb.index))
# DataFrame.insert raises if the column already exists, so drop any 'rank' column
# left by a previous run of this cell first; this keeps the cell re-runnable.
nb = nb.drop(columns='rank', errors='ignore')
nb.insert(0, 'rank', cell_ranks)

nb

Unnamed: 0_level_0,rank,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3e551fb7,0,code,# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/pyt...
45049ad8,1,code,"train_data = pd.read_csv(""/kaggle/input/titanic/train.csv"")\ntest_data = pd.read_csv(""/kaggle/input/titanic/test.csv"")"
123b4f4c,3,code,import plotly.express as px
0b92cb59,4,code,train_data.head(20)
df963df4,6,code,train_data.isnull().sum() #checking out which column has most no. of NaN Values
0f3db81b,8,code,"px.bar(data_frame=train_data, x='Sex', y='Survived',color='Sex',facet_row_spacing=0, title=""Relation between Gender ..."
33ff3073,10,code,"total_passengers = train_data['Sex'].count()\ncount_males = 0\ncount_females = 0\nfor i,j in zip(train_data['Sex'], ..."
818c4c15,13,code,"from sklearn.ensemble import RandomForestClassifier\n\n\ny = train_data[""Survived""]\n\nfeatures = [""Pclass"", ""Sex"", ..."
6cfbe868,11,markdown,## Survival Rate for Male Passenger is : 12.235 %\n\n## Survival Rate for Female Passenger is : 26.150 %
eadf5c66,9,markdown,## Who has more luck in here? \n\n\nFrom the above data we can find out that females had more survival rate on Titan...


In [22]:
from pandas.testing import assert_frame_equal

# Sanity check: selecting the cells by their correct order must give the same
# frame as sorting them by the computed rank.
nb_by_order = nb.loc[cell_order, :]
nb_by_rank = nb.sort_values('rank')
assert_frame_equal(nb_by_order, nb_by_rank)

In [23]:
# Pair each loaded notebook's correct cell order (from train_orders.csv) with the
# order in which its cells are stored in `df`. The right join keeps only the
# notebooks present in `df`.
cell_ids_by_notebook = df.reset_index('cell_id').groupby('id')['cell_id'].apply(list)
df_orders_ = df_orders.to_frame().join(cell_ids_by_notebook, how='right')

ranks = {}
# Loop names are kept distinct from the module-level `cell_order` of the example
# notebook: reusing that name here silently overwrote it, which broke re-running
# the earlier cells that depend on it.
for id_, nb_cell_order, nb_cell_ids in df_orders_.itertuples():
    ranks[id_] = {
        'cell_id': nb_cell_ids,
        'rank': get_ranks(nb_cell_order, nb_cell_ids),
    }

df_ranks = (
    pd.DataFrame
    .from_dict(ranks, orient='index')
    .rename_axis('id')
    .apply(pd.Series.explode)           # one row per (notebook, cell)
    .set_index('cell_id', append=True)  # index: (id, cell_id)
)

df_ranks

Unnamed: 0_level_0,Unnamed: 1_level_0,rank
id,cell_id,Unnamed: 2_level_1
00001756c60be8,1862f0a6,0
00001756c60be8,2a9e43d6,2
00001756c60be8,038b763d,4
00001756c60be8,2eefe0ef,6
00001756c60be8,0beab1cd,8
...,...,...
12b925c525495d,84762508,17
12b925c525495d,bb270083,21
12b925c525495d,473e430f,14
12b925c525495d,71181d6d,4


In [24]:
# Load the ancestry table from train_ancestors.csv, keyed by the `id` column.
ancestors_csv = data_dir / 'train_ancestors.csv'
df_ancestors = pd.read_csv(ancestors_csv, index_col='id')

In [25]:
# Display the ancestry table loaded from train_ancestors.csv.
df_ancestors

Unnamed: 0_level_0,ancestor_id,parent_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1
00001756c60be8,945aea18,
00015c83e2717b,aa2da37e,317b65d12af9df
0001bdd4021779,a7711fde,
0001daf4c2c76d,090152ca,
0002115f48f982,272b483a,
...,...,...
fffc30d5a0bc46,6aed207b,
fffc3b44869198,a6aaa8d7,
fffc63ff750064,0a1b5b65,
fffcd063cda949,d971e960,


In [26]:
from sklearn.model_selection import GroupShuffleSplit

NVALID = 0.1  # test_size handed to the splitter (a proportion, since it is a float)

# One shuffled split with a fixed seed.
splitter = GroupShuffleSplit(n_splits=1, test_size=NVALID, random_state=0)

# Notebooks sharing an ancestor_id form one group, so they are kept together on
# the same side of the split.
ids = df.index.unique('id')
ancestors = df_ancestors.loc[ids, 'ancestor_id']
# split() yields integer positions into `ids`; map them back to notebook ids.
train_pos, valid_pos = next(splitter.split(ids, groups=ancestors))
ids_train = ids[train_pos]
ids_valid = ids[valid_pos]

df_train = df.loc[ids_train, :]
df_valid = df.loc[ids_valid, :]

In [32]:
# Display the validation cells selected by the split.
df_valid

Unnamed: 0_level_0,Unnamed: 1_level_0,cell_type,source
id,cell_id,Unnamed: 2_level_1,Unnamed: 3_level_1
000757b90aaca0,8f84d7a9,code,import pandas as pd\nimport spacy\nimport networkx as nx # a really useful network analysis l...
000757b90aaca0,eb6ca769,code,nlp = spacy.load('en_core_web_lg') # A more detailed model (with higher-dimension word vectors) - 13s to l...
000757b90aaca0,bc595bc2,code,"plt.rcParams['figure.figsize'] = [10, 10] # makes the output plots large enough to be useful"
000757b90aaca0,93cceeef,code,rowlimit = 500 # this limits the tweets to a manageable number\ndata = pd.read_csv('../input/ExtractedT...
000757b90aaca0,3cb3d383,code,data.head(6)
...,...,...,...
1292c88558dbc8,15290200,markdown,# 2. Import Datasets
1292c88558dbc8,affda817,markdown,# Data Dictionary
1292c88558dbc8,0d9947c2,markdown,<b>We notice from the plot most the word frequancies are the common word. and that's will not help us to understand ...
1292c88558dbc8,af4b2ad7,markdown,## 3.5 Print Selective Rows from Non-Toxic Comments


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Training set: TF-IDF features of each cell's source text (min_df=0.01).
tfidf = TfidfVectorizer(min_df=0.01)
X_train = tfidf.fit_transform(df_train['source'].astype(str))

# Look the training ranks up once and reuse them for both targets and groups.
ranks_train = df_ranks.loc[ids_train]
# Rank of each cell within the notebook
y_train = ranks_train.to_numpy()
# Number of cells in each notebook. sort=False keeps the groups in the order their
# rows appear, so the sizes line up with y_train even if ids_train is not sorted
# (groupby would otherwise order the groups by sorted id).
groups = ranks_train.groupby('id', sort=False).size().to_numpy()

In [30]:
# Per-notebook cell counts for the training set.
groups

array([58, 93, 13, ..., 87, 31, 26])

Now let's add the code cell ordering as a feature. We'll append a column that enumerates the code cells in the correct order, like `1, 2, 3, 4, ...`, while having the dummy value `0` for all markdown cells. This feature will help the model learn to put the code cells in the correct order.

In [33]:
# Keep only the code cells of the example notebook.
is_code = nb['cell_type'] == 'code'
code = nb.loc[is_code]

In [34]:
# Display the code cells of the example notebook.
code

Unnamed: 0_level_0,rank,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3e551fb7,0,code,# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/pyt...
45049ad8,1,code,"train_data = pd.read_csv(""/kaggle/input/titanic/train.csv"")\ntest_data = pd.read_csv(""/kaggle/input/titanic/test.csv"")"
123b4f4c,3,code,import plotly.express as px
0b92cb59,4,code,train_data.head(20)
df963df4,6,code,train_data.isnull().sum() #checking out which column has most no. of NaN Values
0f3db81b,8,code,"px.bar(data_frame=train_data, x='Sex', y='Survived',color='Sex',facet_row_spacing=0, title=""Relation between Gender ..."
33ff3073,10,code,"total_passengers = train_data['Sex'].count()\ncount_males = 0\ncount_females = 0\nfor i,j in zip(train_data['Sex'], ..."
818c4c15,13,code,"from sklearn.ensemble import RandomForestClassifier\n\n\ny = train_data[""Survived""]\n\nfeatures = [""Pclass"", ""Sex"", ..."


In [38]:
import tokenize
import io

# Tokenize the source of one example code cell with Python's own tokenizer.
# generate_tokens() expects a readline-style callable, hence the StringIO wrapper.
cell_source = code.loc['33ff3073', 'source']
code_text = tokenize.generate_tokens(io.StringIO(cell_source).readline)
# Materialise the token generator so the notebook displays every token.
list(code_text)

[TokenInfo(type=1 (NAME), string='total_passengers', start=(1, 0), end=(1, 16), line="total_passengers = train_data['Sex'].count()\n"),
 TokenInfo(type=53 (OP), string='=', start=(1, 17), end=(1, 18), line="total_passengers = train_data['Sex'].count()\n"),
 TokenInfo(type=1 (NAME), string='train_data', start=(1, 19), end=(1, 29), line="total_passengers = train_data['Sex'].count()\n"),
 TokenInfo(type=53 (OP), string='[', start=(1, 29), end=(1, 30), line="total_passengers = train_data['Sex'].count()\n"),
 TokenInfo(type=3 (STRING), string="'Sex'", start=(1, 30), end=(1, 35), line="total_passengers = train_data['Sex'].count()\n"),
 TokenInfo(type=53 (OP), string=']', start=(1, 35), end=(1, 36), line="total_passengers = train_data['Sex'].count()\n"),
 TokenInfo(type=53 (OP), string='.', start=(1, 36), end=(1, 37), line="total_passengers = train_data['Sex'].count()\n"),
 TokenInfo(type=1 (NAME), string='count', start=(1, 37), end=(1, 42), line="total_passengers = train_data['Sex'].count()\

In [41]:
# Stated goal: extract only function names, variables, comments then we can join them.
# NOTE(review): this filter keeps operator (OP) and comment (COMMENT) tokens, which
# is what the numeric ids 53/55 meant on the Python version that produced the
# recorded output; selecting identifiers would need tokenize.NAME instead.
# Named constants are used because the numeric token ids differ between Python
# versions, so hard-coded numbers silently match the wrong token types.
code_text = tokenize.generate_tokens(io.StringIO(code.loc['33ff3073','source']).readline)
[tok.string for tok in code_text if tok.type in (tokenize.OP, tokenize.COMMENT)]

['=',
 '[',
 ']',
 '.',
 '(',
 ')',
 '=',
 '=',
 ',',
 '(',
 '[',
 ']',
 ',',
 '[',
 ']',
 ')',
 ':',
 '==',
 '==',
 ':',
 '+=',
 '==',
 '==',
 ':',
 '+=',
 '(',
 ')',
 '(',
 '(',
 '/',
 ')',
 '*',
 ',',
 ')',
 '(',
 ')',
 '(',
 '(',
 '/',
 ')',
 '*',
 ',',
 ')']

In [42]:
# Add code cell ordering
# 1, 2, 3, ... for the code cells of each notebook in their stored order, and the
# dummy value 0 for every markdown cell.
code_order = np.where(
    df_train['cell_type'] == 'code',
    df_train.groupby(['id', 'cell_type']).cumcount().to_numpy() + 1,
    0,
).reshape(-1, 1)

# Start from the TF-IDF columns only. Without this, re-running the cell would stack
# one more ordering column onto X_train each time. (hstack returns a COO matrix,
# which cannot be sliced, hence the CSR conversion.)
tfidf_features = sparse.csr_matrix(X_train)[:, :len(tfidf.vocabulary_)]
X_train = sparse.hstack((tfidf_features, code_order))

print(X_train.shape)

(416710, 282)
