### Imports

In [1]:
import re
import os
import pickle
import pandas as pd

from tqdm import tqdm

from nltk import download
from nltk.corpus import stopwords, gutenberg
from nltk.stem import WordNetLemmatizer

In [2]:
# download('gutenberg')
# download('stopwords')
# download('punkt')

stop_words = set(stopwords.words('english'))
stop_words.remove("very")
stop_words.add("th")
stop_words.add("[")
stop_words.add("]")

tqdm.pandas()
lemmatizer = WordNetLemmatizer()

In [3]:
DATA_DIR = './../data/pickle1/'
OUTPUT_DIR = './../data/processed/'

### Functions

In [4]:
def remove_special_chars(text):
    """
    This function returns a string with filtered special case characters
    """
    return re.sub('[^A-Za-z0-9 ]+', '', text)

def remove_n(text):
    """
    This function returns a string after removing new line char(\n) from a string
    """
    clean1 = re.compile('\n')
    return re.sub(clean1,'', text)

def remove_numbers(text):
    """
    This function returns a string with filtered numbers
    """
    return re.sub('[^A-Za-z ]+', '', text)

### Fetch Data

In [5]:
sentences = []
for file in os.listdir(DATA_DIR):
    if(file.endswith('.pickle')):
#         if(file!='gutenberg_sentences_large.pickle'):
        with open(DATA_DIR + file, 'rb') as f:
            data = pickle.load(f)
        print("sentences: " + str(len(data)))
        sentences += data
        
df = pd.DataFrame(sentences, columns=['sentences'])

sentences: 9930
sentences: 716
sentences: 2039
sentences: 57822
sentences: 283
sentences: 98552


In [6]:
len(sentences)

169342

### Convert to lower case

In [7]:
df['sentences'] = df['sentences'].apply(lambda x : x.lower())

### Removing Stopwords

In [8]:
df['sentences'] = df['sentences'].progress_apply(lambda sent : ' '.join([word for word in sent.split() if word not in stop_words]))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 169342/169342 [00:00<00:00, 406922.73it/s]


### Lemmatization

In [86]:
df['sentences'] = df['sentences'].progress_apply(lambda sent: ' '.join([lemmatizer.lemmatize(word) for word in sent.split()]))

100%|████████████████████████████████| 169342/169342 [00:04<00:00, 35175.83it/s]


### Special Characters Removal

In [9]:
df['sentences'] = df['sentences'].progress_apply(lambda sent: remove_special_chars(sent))
df['sentences'] = df['sentences'].progress_apply(lambda sent: remove_n(sent))
df['sentences'] = df['sentences'].progress_apply(lambda sent: remove_numbers(sent))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 169342/169342 [00:00<00:00, 460557.53it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 169342/169342 [00:00<00:00, 838065.01it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 169342/169342 [00:00<00:00, 605983.31it/s]


### words with length above threshold removal

In [10]:
df['sentences'] = df['sentences'].progress_apply(lambda x: ' '.join([item for item in x.split() if len(item)<25]))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 169342/169342 [00:00<00:00, 661717.02it/s]


In [11]:
df = df[df['sentences']!=''].reset_index(drop=True)

### Save Preprocessed Data

In [19]:
df.to_csv(OUTPUT_DIR + 'processed_data_small_lemma.csv',index=False)

# Exploratory Data Analysis

In [185]:
df['sentences'] = df['sentences'].apply(lambda x : x.split())

from collections import Counter
text_counter = Counter()
_ = df['sentences'].apply(text_counter.update)

vocab_old = len(text_counter)
words_old = str(sum(text_counter.values()))

print("No. of unique words present in old processed train data = ",vocab_old)
print("Total no of words : " + words_old)

No. of unique words present in old processed train data =  60709
Total no of words : 1385629


### Current Vocab size : 60757

##### Removing Redundant words which act as stopwords

In [127]:
text_counter.most_common()

[('shall', 11693),
 ('said', 9474),
 ('unto', 9010),
 ('lord', 8692),
 ('one', 7342),
 ('thou', 6759),
 ('retrieved', 5959),
 ('god', 5690),
 ('man', 5675),
 ('thy', 5609),
 ('day', 4813),
 ('thee', 4807),
 ('ye', 4677),
 ('upon', 4606),
 ('mr', 4217),
 ('would', 4191),
 ('very', 4155),
 ('come', 4053),
 ('archived', 3995),
 ('son', 3986),
 ('thing', 3974),
 ('may', 3882),
 ('original', 3846),
 ('king', 3785),
 ('city', 3772),
 ('time', 3709),
 ('could', 3697),
 ('like', 3685),
 ('hand', 3480),
 ('came', 3457),
 ('also', 3411),
 ('people', 3406),
 ('house', 3342),
 ('know', 3301),
 ('great', 3145),
 ('little', 3114),
 ('go', 3108),
 ('see', 3102),
 ('say', 3084),
 ('good', 3048),
 ('world', 3015),
 ('every', 3005),
 ('made', 2953),
 ('u', 2945),
 ('father', 2927),
 ('child', 2897),
 ('well', 2807),
 ('first', 2794),
 ('men', 2789),
 ('two', 2788),
 ('let', 2701),
 ('israel', 2632),
 ('make', 2568),
 ('even', 2566),
 ('land', 2555),
 ('hath', 2535),
 ('much', 2483),
 ('word', 2455),
 ('

In [50]:
df['sentences'] = df['sentences'].apply(lambda x : x.split())

from collections import Counter
text_counter = Counter()
_ = df['sentences'].apply(text_counter.update)


# print("No. of unique words present in old processed train data = ",vocab_old)
# print("Total no of words : " + words_old)

print("No. of unique words present in processed train data = ",len(text_counter))
print("Total no of words : " + str(sum(text_counter.values())))

No. of unique words present in processed train data =  61525
Total no of words : 1384113


In [12]:
remove_words = ['shall', 'ye', 'mr', 'u', 'shalt', 'pdf', 'hast', 'b', 'th', 'oh', 'in', 'de', 'c', 'le']

df['sentences'] = df['sentences'].progress_apply(lambda sent : ' '.join([word for word in sent.split() if word not in remove_words]))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 168074/168074 [00:00<00:00, 430061.29it/s]


### Save New Preprocessed data

In [13]:
df.to_csv(OUTPUT_DIR + 'processed_data.csv',index=False)

In [59]:
result = ''
for row in df['sentences']:
    row += ' '
    result += row

In [60]:
df = pd.read_csv('./../data/Validation.txt', sep=' ', names=['w1','w2','w3','w4'])
df2 = df[['w3','w4']].rename(columns={'w3':'w1', 'w4':'w2'})
df = pd.concat([df[['w1','w2']],df2], ignore_index=True)
df = df.drop_duplicates()

df3 = df[['w2']].rename(columns={'w2':'w1'})
df = pd.concat([df[['w1']],df2], ignore_index=True)
df = df.drop_duplicates()

df['w1'] = df['w1'].apply(lambda x : x.lower())
# df['w2'] = df['w2'].apply(lambda x : x.lower())

df['w1_present'] = df['w1'].apply(lambda x : x in result)
# df['w2_present'] = df['w2'].apply(lambda x : x in result)

print(df['w1_present'].value_counts())
# print(df['w2_present'].value_counts())

True     573
False      2
Name: w1_present, dtype: int64
