# dataset
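Load the project texts and their project IDs, check them for missing rows, and extract training subsets of different sizes for the vectorization task.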

### import dependencies

In [1]:
!pip install -q --no-cache-dir mysqlclient

import os
import pandas as pd
import numpy as np

### settings & configuration

In [2]:
# Dataset locations, stored as absolute paths resolved against the current working directory
DATASET = './dataset'
DATASET_TARGET, DATASET_IDS_TARGET, TRAIN_TARGET = (
    os.path.abspath(os.path.join(DATASET, name))
    # 'train' is only a prefix: the sample size and file extension are appended later.
    for name in ('dataset.txt', 'dataset.ids', 'train')
)

----

## Extract Datasets of Different Sample Sizes

### clean and examine dataset
Inspect the raw dataset files, load them into dataframes, check the dataframes for empty / NaN rows, and examine the result.

In [3]:
!wc -l {DATASET_TARGET}
!wc -l {DATASET_IDS_TARGET}

568331 /home/jovyan/shared/lena/dataset/dataset.txt
568331 /home/jovyan/shared/lena/dataset/dataset.ids


In [4]:
!shuf -n 5 {DATASET_TARGET}
!shuf -n 5 {DATASET_IDS_TARGET}

_STARTSTACK_ event_whenflagclicked _NEXT_ control_forever _STARTNEST_ looks_nextcostume _NEXT_ control_wait _STARTINPUT_ numtext_input _ENDINPUT_ _ENDNEST_ _ENDSTACK_ _STARTSTACK_ event_whenflagclicked _NEXT_ control_forever _STARTNEST_ sound_playuntildone _MENU_ menu_option _MENU_ _ENDNEST_ _ENDSTACK_ _STARTSTACK_ event_whenflagclicked _NEXT_ control_forever _STARTNEST_ motion_turnright _STARTINPUT_ numtext_input _ENDINPUT_ _ENDNEST_ _ENDSTACK_ _STARTSTACK_ event_whenflagclicked _NEXT_ control_forever _STARTNEST_ motion_gotoxy _STARTINPUT_ operator_random _STARTINPUT_ numtext_input _ENDINPUT_ _STARTINPUT_ numtext_input _ENDINPUT_ _ENDINPUT_ _STARTINPUT_ operator_random _STARTINPUT_ numtext_input _ENDINPUT_ _STARTINPUT_ numtext_input _ENDINPUT_ _ENDINPUT_ _ENDNEST_ _ENDSTACK_ _STARTSTACK_ event_whenflagclicked _NEXT_ control_forever _STARTNEST_ pen_penDown _NEXT_ pen_stamp _ENDNEST_ _ENDSTACK_
_STARTSTACK_ event_whenbackdropswitchesto _MENU_ menu_option _MENU_ _NEXT_ data_setvariablet

53132408
287403141
2741557
80671956
125204462


In [6]:
# Read a line-oriented text file into a single-column dataframe
def to_df(filepath, columns=None, skip_blank_lines=True):
    """Load a text file with one record per line into a one-column DataFrame.

    The file is read line by line rather than through ``pd.read_csv`` with a
    newline separator: current pandas releases reject that separator, and CSV
    parsing would also apply quoting and NA-token rules to free text.

    Parameters
    ----------
    filepath : str
        Path of the UTF-8 encoded text file to read.
    columns : list of str, optional
        Column labels for the result (a single label). When omitted or empty,
        the default integer label ``0`` is kept.
    skip_blank_lines : bool, default True
        Drop empty lines, as ``read_csv`` does by default. Pass ``False`` to
        keep them as missing values so that row ``i`` always corresponds to
        line ``i`` of the file.

    Returns
    -------
    pandas.DataFrame
        One row per retained line. A file whose lines are all numeric (such as
        the project ids) gets a numeric dtype; anything else is kept as text.

    Raises
    ------
    ValueError
        If ``columns`` is given but does not contain exactly one label.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        lines = [raw.rstrip('\n') for raw in handle]

    if skip_blank_lines:
        lines = [line for line in lines if line]
    else:
        # Keep the row but mark it as missing so that isnull() reports it.
        lines = [line if line else None for line in lines]

    values = pd.Series(lines, dtype=object)
    if lines:
        try:
            # Mirror read_csv's type inference: id files become integer columns.
            values = pd.to_numeric(values)
        except (ValueError, TypeError):
            pass  # not purely numeric: keep the lines as text

    df = values.to_frame()
    if columns:
        df.columns = columns
    return df

projects_df = to_df(DATASET_TARGET, columns=['project_text'])
ids_df = to_df(DATASET_IDS_TARGET, columns=['project_id'])

# The two files are expected to be parallel (same line count, and sliced in lockstep
# further down), so a length mismatch means rows were dropped or the files are out of sync.
assert len(projects_df) == len(ids_df), (
    "row count mismatch: %d project texts vs %d project ids" % (len(projects_df), len(ids_df))
)

In [7]:
# Peek at five random rows of each dataframe, separated by two blank lines
print(projects_df.sample(n=5), ids_df.sample(n=5), sep="\n\n\n")

                                             project_text
385489  _STARTSTACK_ event_whenkeypressed _MENU_ menu_...
462081  _STARTSTACK_ event_whenflagclicked _NEXT_ data...
98964   _STARTSTACK_ event_whenflagclicked _NEXT_ cont...
100842  _STARTSTACK_ event_whenkeypressed _MENU_ menu_...
402207  _STARTSTACK_ event_whenflagclicked _NEXT_ pen_...


        project_id
523626   219997328
148849    43987840
186126   117514420
378562   119737070
278935    28267108


In [8]:
# A cell only echoes its last expression, so report both checks explicitly.
# Each result holds the (row, column) positions of the missing values; empty arrays mean none.
print(np.where(projects_df.isnull()))
print(np.where(ids_df.isnull()))

(array([], dtype=int64), array([], dtype=int64))

### 1000 samples

In [29]:
num_samples = 1000

In [30]:
def extract_samples(dataframe, n=1000, extension='.txt', column='project_text', target_prefix=None):
    """Write the first ``n`` values of one column to a text file, one value per line.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source dataframe; rows are taken by position, whatever its index is.
    n : int, default 1000
        Number of leading rows to export. It is also embedded in the file name
        as given, even when the dataframe holds fewer rows.
    extension : str, default '.txt'
        File extension (including the dot) appended to the output path.
    column : str, default 'project_text'
        Name of the column whose values are written.
    target_prefix : str, optional
        Path prefix of the output file. Defaults to the notebook-level
        ``TRAIN_TARGET``.

    Returns
    -------
    str
        Path of the written file: ``<target_prefix>_<n><extension>``.
    """
    if target_prefix is None:
        target_prefix = TRAIN_TARGET

    # Slice the dataframe to get the top n samples
    df = dataframe.iloc[:n]
    print(df.shape)

    # Write the selected column back into a text file. Iterating the column
    # avoids mixing index labels with positional lookups, and the context
    # manager guarantees the file is flushed and closed.
    target = target_prefix + "_" + str(n) + extension
    with open(target, 'w', encoding='utf-8') as f:
        f.writelines(str(value) + "\n" for value in df[column])

    # Return the path of the written file
    return target

In [31]:
train_target = extract_samples(projects_df, n=num_samples, extension='.txt', column='project_text')
print(train_target)

train_ids_target = extract_samples(ids_df, n=num_samples, extension='.ids', column='project_id')
print(train_ids_target)

(1000, 1)
/home/jovyan/shared/lena/dataset/train_1000.txt
(1000, 1)
/home/jovyan/shared/lena/dataset/train_1000.ids


In [32]:
!wc -l {train_target}
!wc -l {train_ids_target}

1000 /home/jovyan/shared/lena/dataset/train_1000.txt
1000 /home/jovyan/shared/lena/dataset/train_1000.ids


### 10,000 samples

In [33]:
num_samples = 10000

In [34]:
train_target = extract_samples(projects_df, n=num_samples, extension='.txt', column='project_text')
print(train_target)

train_ids_target = extract_samples(ids_df, n=num_samples, extension='.ids', column='project_id')
print(train_ids_target)

(10000, 1)
/home/jovyan/shared/lena/dataset/train_10000.txt
(10000, 1)
/home/jovyan/shared/lena/dataset/train_10000.ids


In [35]:
!wc -l {train_target}
!wc -l {train_ids_target}

10000 /home/jovyan/shared/lena/dataset/train_10000.txt
10000 /home/jovyan/shared/lena/dataset/train_10000.ids


### 100,000 samples

In [36]:
num_samples = 100000

In [37]:
train_target = extract_samples(projects_df, n=num_samples, extension='.txt', column='project_text')
print(train_target)

train_ids_target = extract_samples(ids_df, n=num_samples, extension='.ids', column='project_id')
print(train_ids_target)

(100000, 1)
/home/jovyan/shared/lena/dataset/train_100000.txt
(100000, 1)
/home/jovyan/shared/lena/dataset/train_100000.ids


In [38]:
!wc -l {train_target}
!wc -l {train_ids_target}

100000 /home/jovyan/shared/lena/dataset/train_100000.txt
100000 /home/jovyan/shared/lena/dataset/train_100000.ids


### 500,000 samples

In [39]:
num_samples = 500000

In [40]:
train_target = extract_samples(projects_df, n=num_samples, extension='.txt', column='project_text')
print(train_target)

train_ids_target = extract_samples(ids_df, n=num_samples, extension='.ids', column='project_id')
print(train_ids_target)

(500000, 1)
/home/jovyan/shared/lena/dataset/train_500000.txt
(500000, 1)
/home/jovyan/shared/lena/dataset/train_500000.ids


In [41]:
!wc -l {train_target}
!wc -l {train_ids_target}

500000 /home/jovyan/shared/lena/dataset/train_500000.txt
500000 /home/jovyan/shared/lena/dataset/train_500000.ids
