# PreProcessing

In [None]:
# Mount Google Drive at /content/drive; every dataset path used below lives
# under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Import required library
import pprint
import os
import re
import json
import gzip
import logging
import nltk
import ast
import json
import string
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd
import pickle
from smart_open import open, register_compressor
from sklearn.feature_extraction.text import TfidfVectorizer
# import pyterrier as pt

# Only the 'punkt' NLTK resource is downloaded; the other two downloads are
# intentionally left disabled.
# nltk.download('wordnet')
nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

# Timestamped log lines at INFO level and above.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Input files on the shared drive. The corpus archive is the file parsed by
# GujaratiFIRECorpus; topics/qrels are presumably the query topics and the
# relevance judgements for topics 126-175 (inferred from the file names).
DATASET_PATH = "/content/drive/Shareddrives/IR project _ GRAS/dataset/gu_GujaratSamachar.tgz"
TOPICS_PATH = "/content/drive/Shareddrives/IR project _ GRAS/dataset/gu.topics.126-175.2011.txt"
QRELS_PATH = "/content/drive/Shareddrives/IR project _ GRAS/dataset/gu.qrels.126-175.2011.txt"
# Token CSV: this is SAVE_PATH + "/guj_corpus_processed.csv", i.e. the file
# GujaratiFIRECorpus writes when save_processed_corpus=True.
TOKEN_CSV_PATH = "/content/drive/Shareddrives/IR project _ GRAS/dataset/guj_corpus_processed.csv"
# Folder that receives every generated artefact.
SAVE_PATH = "/content/drive/Shareddrives/IR project _ GRAS/dataset"

# The Gujarati stopwords are taken from
# https://github.com/gujarati-ir/Gujarati-Stop-Words and modified by Chandrakant Bhogayata and by us also.
#
# The file holds one stopword per line. Decoding with "utf-8-sig" drops a
# leading byte-order mark when there is one (and is a no-op otherwise), so no
# manual fix-up of the first entry is needed and an empty file simply yields an
# empty list. Blank lines are skipped so that "" never ends up as a stopword.
with open(
    "/content/drive/Shareddrives/IR project _ GRAS/dataset/Gujarati.Stop.Words.txt",
    encoding="utf-8-sig",
) as stopw_file:
    stripped_lines = (line.strip() for line in stopw_file)
    guj_stopwords = [word for word in stripped_lines if word]

print(guj_stopwords)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
['અથવા', 'અને', 'અમને', 'અમારું', 'અમે', 'અહીં', 'આ', 'આગળ', 'આથી', 'આનું', 'આને', 'આપણને', 'આપણું', 'આપણે', 'આપી', 'આવી', 'આવે', 'ઉપર', 'ઊંચે', 'ઉભા', 'એ', 'એક', 'એના', 'એનાં', 'એની', 'એનું', 'એને', 'એનો', 'એમ', 'એવા', 'એવાં', 'એવી', 'એવું', 'એવો', 'ઓછું', 'અંગે', 'અંદર', 'કઈ', 'કયું', 'કયો', 'કરવું', 'કરતાં', 'કરી', 'કરીએ', 'કરું', 'કરે', 'કરેલું', 'કર્યા', 'કર્યાં', 'કર્યું', 'કર્યો', 'કંઈક', 'કાંઈ', 'કે', 'કેટલું', 'કેમ', 'કેવી', 'કેવું', 'કોઈ', 'કોઈક', 'કોણ', 'કોણે', 'કોને', 'ક્યારે', 'ક્યાં', 'ખૂબ', 'ગઈ', 'ગયા', 'ગયાં', 'ગયું', 'ગયો', 'ઘણું', 'છ', 'છતાં', 'છીએ', 'છું', 'છે', 'છેક', 'છો', 'જ', 'જાય', 'જી', 'જે', 'જેટલું', 'જેને', 'જેમ', 'જેવી', 'જેવું', 'જેવો', 'જો', 'જોઈએ', 'જ્યારે', 'જ્યાં', 'ઝાઝું', 'તને', 'તમને', 'તમારું', 'તમે', 'તારાથી', 'તારામાં', 'તારું', 'તું', 'તે', 'તેઓ', 'તેથી', 'તેણે', 'તેના', 'તેની', 'તેનું', 'તેને', 'તેમ', 'તેમનું', 'તેમને', 'તેવી', 'તેવું', 'તે

In [None]:
# Sanity check of tokenisation + stopword removal on one sample sentence.
sentence = "સંબંધિત દસ્તાવેજો માં 51 Abcd સ્વાઈન ફ્લૂ માટે* ની ૩૪ સ્વદેશી-રસી બનાવા ની.રીત, મનુષ્ય અને પ્રાણીયો ઉપર રસી નો ઉપયોગ, રસી ની અચત દુર કરવા માટે એ જગ્યા પરની વ્યવસ્થાઓ અને લોકોના જીવન બચાવ માટે રસી ની ભૂમિકા વિશે ની માહિતી હોવી જોઇયે."

# A token is a maximal run of characters that are NOT delimiters. Delimiters:
# whitespace, the punctuation * . - / : ; < = > ? @, Latin letters, ASCII
# digits and Gujarati digits (so numbers never become tokens).
# The pattern is a raw string ("\s" is not a valid Python string escape) and
# the hyphen is escaped so it cannot form an accidental character range.
tokenizer = nltk.RegexpTokenizer(r'[^\s*.\-/:;<=>?@A-Za-z0-9૦-૯]+')
words = tokenizer.tokenize(sentence)

# Build the lookup sets once instead of once per word.
punctuation_marks = set(string.punctuation)
stopword_set = set(guj_stopwords)
new_words = [word for word in words
             if word not in punctuation_marks
             and word not in stopword_set]

print(new_words)

['સંબંધિત', 'દસ્તાવેજો', 'સ્વાઈન', 'ફ્લૂ', 'સ્વદેશી', 'રસી', 'બનાવા', 'રીત,', 'મનુષ્ય', 'પ્રાણીયો', 'રસી', 'નો', 'ઉપયોગ,', 'રસી', 'અચત', 'દુર', 'કરવા', 'જગ્યા', 'પરની', 'વ્યવસ્થાઓ', 'લોકોના', 'જીવન', 'બચાવ', 'રસી', 'ભૂમિકા', 'વિશે', 'માહિતી', 'હોવી', 'જોઇયે']


In [None]:
class GujaratiFIRECorpus:
    """Read a gzip-compressed corpus file and tokenise every document.

    The file is scanned line by line. A line starting with ``<DOCNO>`` gives a
    document id; the lines between a ``<TEXT>`` line and the next ``</TEXT>``
    line form that document's text. Parsing happens in the constructor; the
    results are exposed through ``docno_list`` and ``tokens_list``.

    Args:
        path: Location of the corpus archive (opened through ``open`` with the
            ``.tgz`` extension registered as gzip).
        stopwords: Optional iterable of words to drop. Defaults to the
            module-level ``guj_stopwords`` list.
        **kwargs: Forwarded to ``_process_corpus`` (``save_processed_corpus``,
            ``save_path``).

    Raises:
        ValueError: If ``save_processed_corpus`` is true but no ``save_path``
            is given.
    """

    # A token is a maximal run of non-delimiter characters. Delimiters are
    # whitespace, the punctuation * . - / : ; < = > ? @, Latin letters, ASCII
    # digits and Gujarati digits. The hyphen is escaped so that it cannot form
    # an accidental range. Matching with ``findall`` on a group-free pattern
    # returns the same list nltk.RegexpTokenizer.tokenize would.
    _TOKEN_PATTERN = re.compile(r"[^\s*.\-/:;<=>?@A-Za-z0-9૦-૯]+")
    # Characters removed from inside a token after tokenisation.
    _STRIP_PATTERN = re.compile("[()',‘’A-Za-z]")
    _PUNCTUATION = frozenset(string.punctuation)
    # Total passed to the progress bar (only affects the display).
    _EXPECTED_DOC_COUNT = 313163
    # A progress line is printed every this many document ids.
    _REPORT_EVERY = 10000

    def __init__(self, path, *, stopwords=None, **kwargs):
        self._docno_list = []
        self._tokens_list = []
        self._dataset_path = path
        if stopwords is None:
            stopwords = guj_stopwords
        # Frozen set: O(1) membership tests instead of scanning a list for
        # every token of every document.
        self._stopwords = frozenset(stopwords)
        self._process_corpus(**kwargs)

    @property
    def docno_list(self):
        """Document ids in the order they were read."""
        return self._docno_list

    @property
    def tokens_list(self):
        """One token list per completed ``<TEXT>`` section, in file order."""
        return self._tokens_list

    def _handle_tgz(self, fileobj, mode):
        """Compressor callback for ``.tgz``: wrap ``fileobj`` in a gzip stream.

        NOTE(review): only the gzip layer is undone here; if the archive is a
        real tarball its tar framing presumably stays in the stream and is
        skipped by the tag-based line parsing - confirm.
        """
        return gzip.GzipFile(fileobj=fileobj, mode=mode)

    def _preprocess_guj_string(self, text):
        """Split ``text`` into cleaned tokens.

        Each raw token has the characters ``( ) ' , ‘ ’`` and Latin letters
        removed first; the cleaned token is then dropped when it is empty, is
        a stopword, or consists only of ASCII punctuation. Cleaning before the
        stopword test matters: a stopword followed by a comma is otherwise not
        recognised as a stopword.

        Args:
            text: Raw document text.

        Returns:
            List of surviving tokens in their original order.
        """
        kept = []
        for raw_token in self._TOKEN_PATTERN.findall(text):
            token = self._STRIP_PATTERN.sub("", raw_token)
            if not token or token in self._stopwords:
                continue
            if all(char in self._PUNCTUATION for char in token):
                continue
            kept.append(token)
        return kept

    def _process_corpus(self, save_processed_corpus=False, save_path=None):
        """Parse the corpus file and fill ``docno_list`` / ``tokens_list``.

        Args:
            save_processed_corpus: When true, also write a CSV named
                ``guj_corpus_processed.csv`` with the columns ``docno`` and
                ``tokens`` (the token list rendered with ``str``).
            save_path: Folder for that CSV; required when saving.

        Raises:
            ValueError: If saving is requested without ``save_path``.
        """
        import contextlib
        import csv

        if save_processed_corpus and save_path is None:
            raise ValueError("Please provide a folder path to save the processed corpus.")

        register_compressor('.tgz', self._handle_tgz)

        self._docno_list = []
        self._tokens_list = []
        docno = None        # id of the document currently being read
        text_lines = []     # stripped lines of the current <TEXT> section
        in_text = False
        csv_writer = None

        # ExitStack closes the CSV file, the corpus file and the progress bar
        # even when parsing stops with an exception.
        with contextlib.ExitStack() as stack:
            if save_processed_corpus:
                # newline='' is what the csv module asks for on text files.
                save_file = stack.enter_context(
                    open(os.path.join(save_path, 'guj_corpus_processed.csv'), 'w',
                         encoding='utf-8', newline=''))
                csv_writer = csv.writer(save_file, delimiter=',')
                csv_writer.writerow(['docno', 'tokens'])

            corpus_file = stack.enter_context(
                open(self._dataset_path, encoding='utf-8'))
            pbar = stack.enter_context(
                tqdm(total=self._EXPECTED_DOC_COUNT, desc="Preprocessing"))

            for line in corpus_file:
                if line.startswith('<DOCNO>'):
                    docno = line.replace('<DOCNO>', '').replace('</DOCNO>', '').strip()
                    self._docno_list.append(docno)
                    if len(self._docno_list) % self._REPORT_EVERY == 0:
                        print(f"Processed {len(self._docno_list)} documents.")
                    continue

                if line.startswith('<TEXT>'):
                    in_text = True
                    continue

                if not in_text:
                    continue

                if not line.startswith('</TEXT>'):
                    text_lines.append(line.strip())
                    continue

                # End of the text section: tokenise what was collected.
                in_text = False
                text_tokens = self._preprocess_guj_string(" ".join(text_lines))
                text_lines = []
                self._tokens_list.append(text_tokens)
                if csv_writer is not None:
                    csv_writer.writerow([docno, str(text_tokens)])
                pbar.update(1)

        print("Corpus preprocessed.")


In [None]:
# Parse the whole archive and also write guj_corpus_processed.csv into
# SAVE_PATH - the same file TOKEN_CSV_PATH points at - so the vocabulary step
# can read the tokens back without re-parsing the corpus.
guj_corpus = GujaratiFIRECorpus(
    DATASET_PATH,
    save_processed_corpus=True,
    save_path=SAVE_PATH)

HBox(children=(FloatProgress(value=0.0, description='Preprocessing', max=313163.0, style=ProgressStyle(descrip…

Processed 10000 documents.
Processed 20000 documents.
Processed 30000 documents.
Processed 40000 documents.
Processed 50000 documents.
Processed 60000 documents.
Processed 70000 documents.
Processed 80000 documents.
Processed 90000 documents.
Processed 100000 documents.
Processed 110000 documents.
Processed 120000 documents.
Processed 130000 documents.
Processed 140000 documents.
Processed 150000 documents.
Processed 160000 documents.
Processed 170000 documents.
Processed 180000 documents.
Processed 190000 documents.
Processed 200000 documents.
Processed 210000 documents.
Processed 220000 documents.
Processed 230000 documents.
Processed 240000 documents.
Processed 250000 documents.
Processed 260000 documents.
Processed 270000 documents.
Processed 280000 documents.
Processed 290000 documents.
Processed 300000 documents.
Processed 310000 documents.

Corpus preprocessed.


In [None]:
# The corpus object keeps the token list of every document in memory. Drop the
# reference so that memory can be reclaimed before the next step; otherwise
# RAM fills up.
del guj_corpus

In [None]:
class Preprocessing:
    """Collect the set of unique tokens found in a processed-corpus CSV.

    The CSV must have a ``tokens`` column whose cells are Python list literals
    of strings (they are parsed with ``ast.literal_eval``).

    Args:
        csv_file_path: Path of the CSV file to read.
        token: Set that receives the tokens; it is updated in place and stored
            as ``unique_token``. A new empty set is used when omitted.
    """

    # Characters removed from every token before it is stored.
    _STRIP_PATTERN = re.compile("[()',‘’A-Za-z]")
    # A progress line is printed each time the vocabulary grows past another
    # multiple of this value.
    _REPORT_EVERY = 10000

    def __init__(self, csv_file_path, token=None):
        self.unique_token = set() if token is None else token
        self.csv_file_path = csv_file_path

    def fit(self):
        """Read the CSV and add every non-empty cleaned token to ``unique_token``."""
        print("Loading data...")
        # Only the 'tokens' column is used, so do not load the others.
        dataset = pd.read_csv(self.csv_file_path, usecols=['tokens'])
        print("Data loaded successfully.")

        vocabulary = self.unique_token
        strip_chars = self._STRIP_PATTERN.sub
        step = self._REPORT_EVERY
        # First milestone strictly above the current size. Testing for an
        # exact multiple at document boundaries would skip most milestones
        # and could repeat the same line.
        next_report = (len(vocabulary) // step + 1) * step

        for serialized_tokens in tqdm(dataset['tokens']):
            for token in ast.literal_eval(serialized_tokens):
                cleaned = strip_chars("", token)
                if cleaned:
                    vocabulary.add(cleaned)

            if len(vocabulary) >= next_report:
                print(f"{len(vocabulary)} tokens found so far")
                next_report = (len(vocabulary) // step + 1) * step

In [None]:
# Build the vocabulary from the token CSV and store it as a pickle.
preprocessing_obj = Preprocessing(TOKEN_CSV_PATH, set())
preprocessing_obj.fit()
# Context manager so the output file is flushed and closed even if
# pickle.dump raises.
with open(os.path.join(SAVE_PATH, "unique_token"), "wb") as vocab_file:
    pickle.dump(preprocessing_obj.unique_token, vocab_file)

Loading data...
Data loaded successfully.


HBox(children=(FloatProgress(value=0.0, max=313163.0), HTML(value='')))

90000 tokens found so far
190000 tokens found so far
260000 tokens found so far
380000 tokens found so far
510000 tokens found so far
520000 tokens found so far
630000 tokens found so far
630000 tokens found so far
640000 tokens found so far
680000 tokens found so far
800000 tokens found so far
800000 tokens found so far
850000 tokens found so far
850000 tokens found so far
910000 tokens found so far
970000 tokens found so far
1010000 tokens found so far
1020000 tokens found so far
1090000 tokens found so far
1210000 tokens found so far
1210000 tokens found so far
1340000 tokens found so far
1400000 tokens found so far
1420000 tokens found so far
1520000 tokens found so far
1520000 tokens found so far
1570000 tokens found so far
1590000 tokens found so far
1640000 tokens found so far
1670000 tokens found so far
1740000 tokens found so far
1780000 tokens found so far
1810000 tokens found so far
1820000 tokens found so far
1830000 tokens found so far
1870000 tokens found so far
1890000 t