# In [None]:
import pandas as pd
import spacy
import os
import gensim
import pickle
import numpy as np

# DEFINE SETTINGS

# On-disk caches: the partially processed dataframe and the trained phraser.
PROCESSED_DF_FILENAME = 'my_processing.pkl'
PHRASER_FILENAME = 'phraser.pkl'

# Batch controls.
SAMPLE_SIZE = 800        # max number of unprocessed rows handled per run
REPORT_EVERY_N = 10      # print a progress line every N documents
NUMBER_OF_CORES = 1      # forwarded to nlp.pipe as n_process

# DEFINE FUNCTIONS

def filter_text(spacy_doc, phraser, stopwords):
    """Return the cleaned token list for one parsed document.

    Each sentence of ``spacy_doc`` is reduced to its lower-cased,
    purely alphabetic lemmas, that list is passed through ``phraser`` by
    indexing it, and the results of all sentences are concatenated. Tokens
    whose lower-cased form is in ``stopwords`` are dropped at the very end,
    i.e. after the phraser has seen them.

    Args:
        spacy_doc: Object exposing ``.sents``; every sentence iterates over
            tokens that have a ``lemma_`` string attribute.
        phraser: Object supporting ``phraser[list_of_tokens]`` and returning
            an iterable of tokens.
        stopwords: Container of lower-case words to remove.

    Returns:
        list: The remaining tokens, in document order.
    """
    phrased_tokens = []
    for sent in spacy_doc.sents:
        lemmas = []
        for tok in sent:
            lemma = tok.lemma_.lower()
            # Keep only alphabetic lemmas (drops numbers, punctuation, etc.).
            if lemma.isalpha():
                lemmas.append(lemma)
        phrased_tokens += phraser[lemmas]
    return [tok for tok in phrased_tokens if tok.lower() not in stopwords]

def train_phraser(spacy_generator, stopwords):
    """Train a gensim ``Phrases`` bigram model on lemmatised sentences.

    Every sentence of every document is reduced to its lower-cased, purely
    alphabetic lemmas before being handed to gensim.

    Args:
        spacy_generator: Iterable of parsed documents exposing ``.sents``;
            it is consumed completely.
        stopwords: Collection of words passed to gensim as the words that
            may sit inside a phrase without breaking it.

    Returns:
        gensim.models.Phrases: The trained phrase model.
    """
    import inspect  # local import: only needed to probe the gensim signature

    sentences = []
    for doc in spacy_generator:
        for sentence in doc.sents:
            lemmas = (token.lemma_.lower() for token in sentence)
            sentences.append([lemma for lemma in lemmas if lemma.isalpha()])

    # gensim 4 renamed the ``common_terms`` keyword to ``connector_words``;
    # choose whichever the installed version accepts so the call does not
    # raise TypeError on either major version.
    phrases_params = inspect.signature(gensim.models.Phrases).parameters
    if 'connector_words' in phrases_params:
        stopword_kwarg = 'connector_words'
    else:
        stopword_kwarg = 'common_terms'

    bigram_phraser = gensim.models.Phrases(sentences, **{stopword_kwarg: stopwords})
    return bigram_phraser

# LOAD SPACY

nlp = spacy.load('en_core_web_md')
# Stop-word collection shipped with the loaded pipeline's language defaults.
stopwords = nlp.Defaults.stop_words

# LOAD IN DATA - CHECKING FOR PICKLE

if not os.path.exists(PROCESSED_DF_FILENAME):
    # No saved progress yet: start from the raw CSV and add an empty
    # 'cleaned' column (zero-length values mark rows still to be processed).
    filename = 'df6.csv'
    df7 = pd.read_csv(filename)
    df7['cleaned'] = ""
else:
    # Resume from the dataframe pickled by an earlier run.
    df7 = pd.read_pickle(PROCESSED_DF_FILENAME)

# LOAD IN OR TRAIN AND SAVE PHRASER

if not os.path.exists(PHRASER_FILENAME):
    # No cached phraser: train one on every 'Body' text, then cache it.
    print('Training Phraser')
    docs = nlp.pipe(df7['Body'], n_process=NUMBER_OF_CORES)
    phraser = train_phraser(docs, stopwords)
    with open(PHRASER_FILENAME, 'wb') as phraser_file:
        pickle.dump(phraser, phraser_file)
else:
    # Reuse the phraser pickled by an earlier run.
    with open(PHRASER_FILENAME, 'rb') as phraser_file:
        phraser = pickle.load(phraser_file)

# SAMPLE THE DATASET

# Rows whose 'cleaned' value has length zero have not been filled in yet;
# take at most SAMPLE_SIZE of them, in dataframe order.
to_process_filter = df7['cleaned'].apply(len).eq(0)
todays_sample = df7.loc[to_process_filter].head(SAMPLE_SIZE)

# PROCESS THE SAMPLE

cleaned_docs = []
docs = nlp.pipe(todays_sample['Body'], n_process=NUMBER_OF_CORES)
for i, spacy_doc in enumerate(docs, start=1):
    # Progress line on every REPORT_EVERY_N-th document.
    if not i % REPORT_EVERY_N:
        print(i, "documents processed")
    cleaned = filter_text(spacy_doc, phraser, stopwords)
    cleaned_docs.append(cleaned)

# INSERT PROCESSED DOCS INTO DATAFRAME

# Mask of rows that already held a non-empty 'cleaned' value before this
# run's results are written.
# NOTE(review): not read again in this section - confirm whether it is still needed.
check_filter = df7['cleaned'].apply(len) != 0

# Store one token list per row through an index-aligned object Series.
# np.array(cleaned_docs, dtype='object') is only 1-D when the lists differ
# in length; if every list in the batch has the same length (a single
# document, or all documents cleaning to nothing) NumPy builds a 2-D array
# and the column assignment breaks.
if cleaned_docs:
    df7.loc[todays_sample.index, 'cleaned'] = pd.Series(
        cleaned_docs, index=todays_sample.index, dtype='object')

# Count rows whose 'cleaned' value is now non-empty.
number_of_processed_rows = int((df7['cleaned'].apply(len) != 0).sum())

# SAVE TO DISK

df7.to_pickle(PROCESSED_DF_FILENAME)
print(f"Processed total: {number_of_processed_rows}/{len(df7)}")