In [1]:
import pandas as pd

## Read dataset

In [2]:
# Load the WikiHow CSV dump; fields are comma-separated.
df = pd.read_csv('../../WikiHow-Dataset/wikihowAll.csv', sep=',')

In [3]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,headline,title,text
0,"\nKeep related supplies in the same area.,\nMa...",How to Be an Organized Artist1,"If you're a photographer, keep all the necess..."
1,\nCreate a sketch in the NeoPopRealist manner ...,How to Create a Neopoprealist Art Work,See the image for how this drawing develops s...
2,"\nGet a bachelor’s degree.,\nEnroll in a studi...",How to Be a Visual Effects Artist1,It is possible to become a VFX artist without...
3,\nStart with some experience or interest in ar...,How to Become an Art Investor,The best art investors do their research on t...
4,"\nKeep your reference materials, sketches, art...",How to Be an Organized Artist2,"As you start planning for a project or work, ..."


In [4]:
# Build the summary as title immediately followed by headline (no separator),
# then drop every row that still has a missing value in any column.
df = df.assign(summary=df['title'].str.cat(df['headline'], sep='')).dropna()

In [5]:
# Keep only the two columns used from here on: summary and text.
prepare_df = pd.DataFrame({
    'summary': df['summary'],
    'text': df['text'],
})

prepare_df.head()

Unnamed: 0,summary,text
0,How to Be an Organized Artist1\nKeep related s...,"If you're a photographer, keep all the necess..."
1,How to Create a Neopoprealist Art Work\nCreate...,See the image for how this drawing develops s...
2,How to Be a Visual Effects Artist1\nGet a bach...,It is possible to become a VFX artist without...
3,How to Become an Art Investor\nStart with some...,The best art investors do their research on t...
4,How to Be an Organized Artist2\nKeep your refe...,"As you start planning for a project or work, ..."


## Clean the data

In [6]:
import re

# Text normalisation (data cleaning)

# Ordered substitution passes; each match is replaced by a single space.
# They are applied one after another because a later pass can match text that
# an earlier pass produced (e.g. "-. x" -> "- x" -> " x").
_SPACE_PASSES = tuple(
    re.compile(pattern)
    for pattern in (
        r"[\t\r\n]",                     # tabs, carriage returns, newlines
        r"([_\-~+.])\1+",                # two or more consecutive _ - ~ + or .
        r"[<>()|&©ø\[\]\'\",;?~*!]",     # individual unwanted characters
        r"\.\s+",                        # '.' at the end of a word
        r"-\s+",                         # '-' at the end of a word
        r":\s+",                         # ':' at the end of a word
    )
)

# A URL: scheme, optional slashes, the domain (captured), then the rest of the
# non-whitespace run. The trailing part is optional so that a bare
# "http://example.com" keeps its full domain.
_URL_RE = re.compile(r"https*:/*([^/\s]+)\S*")
_WHITESPACE_RE = re.compile(r"\s+")
_LONE_CHAR_RE = re.compile(r"\s+.\s+")


def _clean_text(value):
    """Return the lower-cased, cleaned string form of a single value."""
    # Lower-case first: the character class above lists 'ø' only in lower case
    # and the URL pattern only matches a lower-case scheme.
    cleaned = str(value).lower()

    for pattern in _SPACE_PASSES:
        cleaned = pattern.sub(" ", cleaned)

    # Replace every URL with its own domain. A callable replacement is used so
    # the domain is inserted literally (backslashes in it are not interpreted
    # as a replacement template) and each URL gets its own domain rather than
    # the domain of the first URL in the row.
    cleaned = _URL_RE.sub(lambda match: match.group(1), cleaned)

    # Collapse whitespace runs, then drop single characters left standing
    # between two spaces. Leading/trailing spaces are intentionally kept.
    cleaned = _WHITESPACE_RE.sub(" ", cleaned)
    return _LONE_CHAR_RE.sub(" ", cleaned)


def text_strip(column):
    """Lazily clean every entry of ``column``.

    Parameters
    ----------
    column : iterable
        Values to clean. Each value is converted with ``str()`` first, so
        non-string entries are cleaned as their string representation.

    Yields
    ------
    str
        The cleaned, lower-cased text for each entry, in input order.
    """
    for row in column:
        yield _clean_text(row)

In [7]:
# Clean each column under the matching name (these were previously swapped,
# which put summaries into `text` and articles into `summary` downstream).
# text_strip is a generator, so nothing is computed until the results are consumed.
processed_text = text_strip(prepare_df['text'])
processed_summary = text_strip(prepare_df['summary'])

In [8]:
# Preview the first five rows of the summary/text frame.
prepare_df.head(n=5)

Unnamed: 0,summary,text
0,How to Be an Organized Artist1\nKeep related s...,"If you're a photographer, keep all the necess..."
1,How to Create a Neopoprealist Art Work\nCreate...,See the image for how this drawing develops s...
2,How to Be a Visual Effects Artist1\nGet a bach...,It is possible to become a VFX artist without...
3,How to Become an Art Investor\nStart with some...,The best art investors do their research on t...
4,How to Be an Organized Artist2\nKeep your refe...,"As you start planning for a project or work, ..."


In [9]:
import spacy
from time import time

nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

# Only the text of each Doc is kept below, so run just the tokenizer rather
# than the remaining pipeline components: whatever annotations they add were
# discarded, and computing them for every row is likely why the recorded run
# of this cell ended in KeyboardInterrupt. Docs are yielded in input order.
text = [doc.text for doc in nlp.tokenizer.pipe(processed_text, batch_size=5000)]

# Wrap each entry in the _START_ / _END_ sequence markers.
summary = ['_START_ ' + doc.text + ' _END_' for doc in nlp.tokenizer.pipe(processed_summary, batch_size=5000)]

KeyboardInterrupt: 