# Download Data from URL

In [1]:
!curl -Lo /content/goodnotes_submission.csv https://lp-prod-resources.s3.amazonaws.com/757/76346/2021-10-12-00-20-56/goodnotes_submission.csv
!curl -Lo /content/competitors_reddit.csv https://lp-prod-resources.s3.amazonaws.com/757/76346/2021-10-12-00-21-08/competitors_reddit.csv


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1585k  100 1585k    0     0  4039k      0 --:--:-- --:--:-- --:--:-- 4033k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 4537k  100 4537k    0     0  7839k      0 --:--:-- --:--:-- --:--:-- 7850k


# Import dependencies

In [2]:
import pandas as pd
import ast
from sklearn.model_selection import train_test_split

# Data Preprocessing

In [3]:
# read csv data
data = pd.read_csv("/content/goodnotes_submission.csv")
data.head(1)

Unnamed: 0,submission_id,submission_title,submission_selftext,submission_link_flair_text,reply_body,all_text
0,aglcrj,Goodnotes 4 vs. Goodnotes 5 right now,I have used Goodnotes 4 for work a ton. And I...,,"[""I'm getting a ton of bugs with 5 as well (sn...",Goodnotes 4 vs. Goodnotes 5 right nowI have us...


**What is literal_eval in Python?**

literal_eval() method is a useful tool for processing data that is stored in a string format

In [4]:
reply_body_str_list = [ast.literal_eval(sent) for sent in data["reply_body"]]

Create a new column by combining (submission_title, submission_selftext and first 10 values of reply_body)

In [5]:
data["all_text"] = data["submission_title"] + " " + data["submission_selftext"] + " " + data["reply_body"].str.split("\n").str[:10].str.join(" ")

In [6]:
data.shape

(679, 6)

## Filter data

**unlabelled data** has submission_link_flair_text that has nan values

**labelled data** has submission_link_flair_text with no null values

In [7]:
unlabelled_df = data[data['submission_link_flair_text'].isna() | (data['submission_link_flair_text'] == '')]
unlabelled_df.shape

(438, 6)

In [8]:
labelled_df = data[data['submission_link_flair_text'].notna() & (data['submission_link_flair_text'] != '')]
labelled_df.shape

(241, 6)

### Check unique values for both labelled and unlabelled data

In [9]:
unlabelled_df["submission_link_flair_text"].unique()

array([nan], dtype=object)

In [10]:
labelled_df["submission_link_flair_text"].unique()

array(['Review', 'Question - iPad', 'Question - Other', 'Question - Mac',
       'Stylus problems', 'Templates', 'Question - iPhone'], dtype=object)

In [11]:
labelled_df.head()

Unnamed: 0,submission_id,submission_title,submission_selftext,submission_link_flair_text,reply_body,all_text
2,agprta,Finally... GoodNotes 5! Did I miss anything in...,,Review,['Man i imagined you would be a lot more popul...,
22,b27oag,I haven't had GN4 and I‘m wondering if I shoul...,Hey there! I've been hearing a lot about GoodN...,Question - iPad,"[""Probably a better option to go with Notabili...",I haven't had GN4 and I‘m wondering if I shoul...
27,ba4xz3,Colors,Is there a way to find the color used? Meaning...,Question - Other,['Can be done if your have an app like Artstud...,Colors Is there a way to find the color used? ...
30,bjgb96,How do I make detailed outlines in GN5?,"I know how to add pages to outlines, but I’m w...",Question - iPad,"['I hope you get a reply, I’m wondering the sa...",How do I make detailed outlines in GN5? I know...
34,byrrfc,Writing to text conversion?,Wondering if there is any other method of conv...,Question - Other,"['He’s right though, mando content is a sequel...",Writing to text conversion? Wondering if there...


### Change unique values

see values in submission_link_flair_text that has Question in it

In [12]:
labelled_df[labelled_df["submission_link_flair_text"].str.contains("Question")]["submission_link_flair_text"]

22      Question - iPad
27     Question - Other
30      Question - iPad
34     Question - Other
35     Question - Other
             ...       
670    Question - Other
673     Question - iPad
675     Question - iPad
677     Question - iPad
678    Question - Other
Name: submission_link_flair_text, Length: 180, dtype: object

In [13]:
question_x = labelled_df[labelled_df["submission_link_flair_text"].str.contains("Question")]["submission_link_flair_text"].unique()
# can use startswith() or contains() both are same

In [14]:
labelled_df[labelled_df["submission_link_flair_text"].str.startswith("Question")]

Unnamed: 0,submission_id,submission_title,submission_selftext,submission_link_flair_text,reply_body,all_text
22,b27oag,I haven't had GN4 and I‘m wondering if I shoul...,Hey there! I've been hearing a lot about GoodN...,Question - iPad,"[""Probably a better option to go with Notabili...",I haven't had GN4 and I‘m wondering if I shoul...
27,ba4xz3,Colors,Is there a way to find the color used? Meaning...,Question - Other,['Can be done if your have an app like Artstud...,Colors Is there a way to find the color used? ...
30,bjgb96,How do I make detailed outlines in GN5?,"I know how to add pages to outlines, but I’m w...",Question - iPad,"['I hope you get a reply, I’m wondering the sa...",How do I make detailed outlines in GN5? I know...
34,byrrfc,Writing to text conversion?,Wondering if there is any other method of conv...,Question - Other,"['He’s right though, mando content is a sequel...",Writing to text conversion? Wondering if there...
35,byyigs,Digital Notebook,Trying to make a good digital notebook that I ...,Question - Other,['I think you are correct. It works wonderfull...,Digital Notebook Trying to make a good digital...
...,...,...,...,...,...,...
670,o4tiv2,Question about iCloud backups,Does goodnotes backup your element stickers an...,Question - Other,['Everything added to the elements are saved i...,Question about iCloud backups Does goodnotes b...
673,o4y0ss,iPad stylus rubber tip,Do you know a good rubber tip for the ipad pen...,Question - iPad,"[""From the pack I bought, one tip has lasted a...",iPad stylus rubber tip Do you know a good rubb...
675,o87r7d,"GoodNotes on iPadOS 15 Dev Beta 2, is it worki...",Really wanna try out the new features brought ...,Question - iPad,"[""I updated to beta 2 yesterday and have been ...","GoodNotes on iPadOS 15 Dev Beta 2, is it worki..."
677,o8c9m0,Is it possible to insert blank pages OF THE SA...,I recently tried Notability for a short while ...,Question - iPad,['Something I also wish to know. I usually put...,Is it possible to insert blank pages OF THE SA...


replace all values that starts with Question into Question only in **submission_link_flair_text** column

In [15]:
labelled_df["submission_link_flair_text"].replace(to_replace = question_x, value = "Question", inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labelled_df["submission_link_flair_text"].replace(to_replace = question_x, value = "Question", inplace = True)


check to see results

In [16]:
labelled_df["submission_link_flair_text"].unique()

array(['Review', 'Question', 'Stylus problems', 'Templates'], dtype=object)

In [17]:
data.head(1)

Unnamed: 0,submission_id,submission_title,submission_selftext,submission_link_flair_text,reply_body,all_text
0,aglcrj,Goodnotes 4 vs. Goodnotes 5 right now,I have used Goodnotes 4 for work a ton. And I...,,"[""I'm getting a ton of bugs with 5 as well (sn...",Goodnotes 4 vs. Goodnotes 5 right now I have u...


In [18]:
print(len(labelled_df["all_text"]), len(labelled_df["submission_link_flair_text"]))

241 241


Convert into list to prepare data for model training

In [19]:
X = list(labelled_df["all_text"])
Y = list(labelled_df["submission_link_flair_text"])
train_tag, val_tag, train_tag_label, val_tag_label = train_test_split(X, Y, test_size = 0.2, stratify=Y)

In [20]:
print(len(train_tag), len(train_tag_label))

192 192


In [21]:
category_train = ['tag classification']*len(train_tag_label)
# category_train

['tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag class

In [22]:
category_val = ['tag classification']*len(val_tag_label)
# category_val

['tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag classification',
 'tag class

In [23]:
print(len(category_train), len(train_tag), len(train_tag_label))

192 192 192


This is the format :

**prefix :**  A string indicating the task to perform.

**input_text :** The input text sequence. prefix is automatically prepended to form the full input.

**target_text :** The target sequence

In [24]:
train_tag_df = pd.DataFrame({
    "prefix" : category_train,
    "input_text" : train_tag,
    "target_text" : train_tag_label
})

In [25]:
val_tag_df = pd.DataFrame({
    "prefix" : category_val,
    "input_text" : val_tag,
    "target_text" : val_tag_label
})

Convert data into CSV

In [26]:
train_tag_df.to_csv("/content/singletask_noupsampling_train.csv", index=False)
val_tag_df.to_csv("/content/singlatask_noupsampling_val.csv", index=False)
unlabelled_df.to_csv("/content/unlabelled_df.csv", index=False)

In [27]:
training_data = pd.read_csv("/content/singletask_noupsampling_train.csv")
training_data.head()

Unnamed: 0,prefix,input_text,target_text
0,tag classification,,Review
1,tag classification,,Review
2,tag classification,,Question
3,tag classification,,Question
4,tag classification,Added an A3 template but it’s added as A4 in m...,Question


In [28]:
validation_data = pd.read_csv("/content/singlatask_noupsampling_val.csv")
validation_data.head()

Unnamed: 0,prefix,input_text,target_text
0,tag classification,,Question
1,tag classification,"handwriting recognition Dumb question, but doe...",Question
2,tag classification,,Review
3,tag classification,How can I avoid excessive margins when importi...,Question
4,tag classification,,Question


In [29]:
unlabel_data = pd.read_csv("/content/unlabelled_df.csv")
unlabel_data.head()

Unnamed: 0,submission_id,submission_title,submission_selftext,submission_link_flair_text,reply_body,all_text
0,aglcrj,Goodnotes 4 vs. Goodnotes 5 right now,I have used Goodnotes 4 for work a ton. And I...,,"[""I'm getting a ton of bugs with 5 as well (sn...",Goodnotes 4 vs. Goodnotes 5 right now I have u...
1,agoowm,The bundle is available !,,,['Thank you. I have been waiting!\n\n&amp;#x20...,
2,agpzxb,What happened to the pen (Goodnotes 5)?,I just got Goodnotes 5 and I was so excited fo...,,"[""Have you tried the ball pen? That was the cl...",What happened to the pen (Goodnotes 5)? I just...
3,agq8qv,Non Apple Pencil styluses on GOodnotes 5?,I've been using a Wacom Bamboo stylus with Goo...,,['According to the [review at Macstories](http...,Non Apple Pencil styluses on GOodnotes 5? I've...
4,agqksi,Text/typing in Goodnotes 5,"Notability user here, but trying out Goodnotes...",,"[""That's neat af"", 'Seattle, hopefully not too...",Text/typing in Goodnotes 5 Notability user her...


In [30]:
training_data["target_text"].value_counts()

Question           143
Templates           31
Review              14
Stylus problems      4
Name: target_text, dtype: int64

# Upsampling

In [31]:
from sklearn.utils import resample

convert labels into list for preprocessing

In [32]:
tags = list(training_data["target_text"].unique())
tags

['Review', 'Question', 'Templates', 'Stylus problems']

This function takes dataframe as input along with labels and n means how many times to upsample particular label.

In [33]:
def upsampling(training_data, tag, n):
  selected_rows =  training_data[training_data["target_text"] == tag]
  update_df = resample(selected_rows, replace=True, n_samples=len(selected_rows)*n, random_state=42)
  data = pd.concat([training_data, update_df])
  return data

In [34]:
training_data = upsampling(training_data, "Review", 3)
training_data = upsampling(training_data, "Stylus problems", 3)
training_data = upsampling(training_data, "Templates", 2)

In [35]:
training_data["target_text"].value_counts()

Question           143
Templates           93
Review              56
Stylus problems     16
Name: target_text, dtype: int64

In [36]:
training_data.to_csv("/content/singletask_train.csv", index=False)

In [37]:
trained_data = pd.read_csv("/content/singletask_train.csv")
trained_data["target_text"].value_counts()

Question           143
Templates           93
Review              56
Stylus problems     16
Name: target_text, dtype: int64