In [16]:

import os
import pickle
import pandas as pd
import spacy
import numpy as np
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline
import gc
import torch

def lemmatize(phrase):
    """Return lematized words"""
    spa = spacy.load("en_core_web_sm")
    return " ".join([word.lemma_ for word in spa(phrase)])

def reading_csv(path_to_csv):
    """Return text column in csv"""
    data = pd.read_csv(path_to_csv)
    ctx_paragraph = []
    for txt in data['text']:
        if not pd.isna(txt):
            ctx_paragraph.append(txt)
    return ctx_paragraph

def lemmatizer(paragraphs, domain_lemma_cache, domain_pickle):
    lemma_cache = domain_lemma_cache
    if not os.path.isfile(lemma_cache):
        lemmas = [lemmatize(par) for par in tqdm(paragraphs)]
        df = pd.DataFrame(data={'context': paragraphs, 'lemmas': lemmas})
        df.to_feather(lemma_cache)
        
    if not os.path.isfile(VEC_PICKLE_LOC):
        vectorizer = TfidfVectorizer(
            stop_words='english', min_df=5, max_df=.5, ngram_range=(1, 3))
        pickle.dump(vectorizer, open(VEC_PICKLE_LOC, "wb"))
        
    if not os.path.isfile(domain_pickle):
        tfidf = vectorizer.fit_transform(lemmas)
        pickle.dump(tfidf, open(domain_pickle, "wb"))

        
domains_choices = {
    'auckland':('./files/data/auckland_uni/auckland.csv',
          './files/data/auckland_uni/auckland.feather',
          './files/data/auckland_uni/auckland_tfidf.pickle'),
    'otago':('./files/data/otago_uni/otago.csv',
          './files/data/otago_uni/otago.feather',
          './files/data/otago_uni/otago_tfidf.pickle'),
    'canterbury':('./files/data/canterbury_uni/canterbury.csv',
          './files/data/canterbury_uni/canterbury.feather',
          './files/data/canterbury_uni/canterbury.pickle'),
    'massey':('./files/data/massey_uni/massey.csv',
          './files/data/massey_uni/massey.feather',
          './files/data/massey_uni/massey_tfidf.pickle'),
    'wgtn':('./files/data/wgtn/wgtn.csv',
          './files/data/wgtn/wgtn.feather',
          './files/data/wgtn/wgtn_tfidf.pickle')
}      
        

for i in domains_choices:   
    LEMMA_CACHE = domains_choices[i][1]
    csvpath = domains_choices[i][0]
    pickle_cache = domains_choices[i][2]
    paragraphs = reading_csv(csvpath)
    lemmatizer(paragraphs, LEMMA_CACHE, pickle_cache)

  0%|          | 3/385740 [00:02<89:40:02,  1.19it/s] 

KeyboardInterrupt: 