# For the purpose of developing baseline models, a bag-of-words (BOW) approach is also used

In [40]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import nltk
import re

In [41]:
data = pd.read_csv('../datasets/train_shortened.csv')
data.shape

(6079, 42)

In [42]:
# keep only the question title, question body, answer and the rating columns
# (every column from position 12 onwards)

cols = ['question_title', 'question_body', 'answer', *data.columns[12:]]
data = data[cols]

In [43]:
data.shape

(6079, 33)

In [44]:
# independent numpy copies of the three text columns
question_titles, questions, answers = (
    data[column].values.copy()
    for column in ('question_title', 'question_body', 'answer')
)

In [45]:
# one document per row: title, question body and answer joined by single spaces

X = [
    title + ' ' + body + ' ' + answer
    for title, body, answer in zip(question_titles, questions, answers)
]

In [46]:
def text_preprocessor(texts):
    """Normalise raw texts into lowercase, space-separated tokens.

    For every text:
      1. runs of whitespace are collapsed to a single space;
      2. words containing non-ASCII characters or starting with ``http``
         are dropped entirely;
      3. every character listed in ``filters`` is replaced by a space;
      4. ``?``, ``.`` and ``!`` get a space inserted in front of them so
         they end up as separate tokens;
      5. the result is lowercased and its whitespace collapsed again.

    Args:
        texts: iterable of str. It is only read, never modified.

    Returns:
        list of str: one cleaned text per input text, in input order.
    """
    # chars to remove from text (each one is replaced by a space)
    filters = ['\'', '\"', '$', '%', '&', '(', ')', '*', ',', '+', '-', '/', ':', ';', '<', '=', '>',
               '[', ']', '^', '_', '`', '{', '|', '}', '~', '\t', '\n', '@', '\\']

    # a single translation table handles all per-character rewriting in one pass
    char_map = {ch: ' ' for ch in filters}
    char_map.update({ch: ' ' + ch for ch in '?.!'})
    table = str.maketrans(char_map)

    # cleaning whitespace: only one (' ') remains
    whitespace_cleaner = re.compile(r'\s+')

    texts_prep = []
    for text in texts:
        # work on a local copy so the caller's sequence is left untouched
        collapsed = whitespace_cleaner.sub(' ', text).strip()
        kept = [
            word.translate(table).lower()
            for word in collapsed.split(' ')
            # keep ASCII-only words, drop urls
            if word.isascii() and not word.startswith('http')
        ]
        # filtered chars leave extra spaces behind, so collapse once more
        texts_prep.append(whitespace_cleaner.sub(' ', ' '.join(kept)).strip())

    return texts_prep

In [47]:
X = text_preprocessor(X)

In [48]:
# POS tagging all words in the texts, and keeping tags only for nouns, verbs, adjectives and adverbs

def pos_tagger(texts, tagger=None):
    """POS-tag every text and reduce each tag to a single-letter word class.

    Tags starting with 'N', 'V', 'J' and 'R' (nouns, verbs, adjectives and
    adverbs) become 'n', 'v', 'a' and 'r'; every other tag becomes None.

    Args:
        texts: iterable of str whose tokens are separated by spaces.
        tagger: optional callable that takes a list of tokens and returns
            ``(token, tag)`` pairs. Defaults to ``nltk.pos_tag``.

    Returns:
        1-D numpy object array with one entry per text; each entry is a list
        of ``(token, pos)`` tuples where ``pos`` is 'n', 'v', 'a', 'r' or None.
    """
    if tagger is None:
        tagger = nltk.pos_tag

    # first letter of the tag -> letter kept for that word class
    pos_by_initial = {'N': 'n', 'V': 'v', 'J': 'a', 'R': 'r'}

    texts_tagged = []
    for text in texts:
        # splitting by whitespace; consecutive spaces leave '' entries, drop them
        tokens = [tok for tok in text.split(' ') if tok]
        texts_tagged.append(
            [(word, pos_by_initial.get(tag[:1])) for word, tag in tagger(tokens)]
        )

    # Texts have different lengths, so np.array(texts_tagged) would be ragged
    # (an error on recent NumPy) or, for equal lengths, silently 3-D.
    # Filling an object array slot by slot always gives one entry per text.
    result = np.empty(len(texts_tagged), dtype=object)
    for idx, pairs in enumerate(texts_tagged):
        result[idx] = pairs
    return result

In [49]:
X = pos_tagger(X)

In [50]:
def lemmatize(texts):
    """Lemmatize tagged texts with NLTK's WordNet lemmatizer.

    ``texts`` is an iterable of texts, each an iterable of ``(word, pos)``
    pairs. A word whose ``pos`` is None is kept as it is; any other word is
    lemmatized using its ``pos``. The words of each text are then joined back
    with single spaces.

    Returns a numpy array holding one string per text.
    """
    from nltk.stem import WordNetLemmatizer

    wordnet = WordNetLemmatizer()

    def lemma_of(word, pos):
        # words without a valid tag pass through untouched
        if pos is None:
            return word
        return wordnet.lemmatize(word, pos=pos)

    joined_texts = [
        ' '.join(lemma_of(word, pos) for word, pos in tagged_text)
        for tagged_text in texts
    ]

    return np.array(joined_texts)

In [51]:
X = lemmatize(X)

In [52]:
# collecting every token of every text (repeats included);
# the displayed length is the total token count

all_words = [word for text in X for word in text.split(' ')]

len(all_words)

1741104

In [53]:
# number of unique words

vocabulary = list(set(all_words))
len(vocabulary)

44125

In [54]:
freq_dist = nltk.FreqDist(all_words)
# keep as tokens only the words that occur at least 10 times in the texts

tokens = {word: count for word, count in freq_dist.items() if count >= 10}

len(tokens)

8277

In [55]:
vocabulary = list(tokens.keys())

In [56]:
# making tfidf matrix

from sklearn.feature_extraction.text import TfidfVectorizer

In [57]:
# TF-IDF restricted to the fixed vocabulary built above (one column per entry).
# NOTE(review): with scikit-learn's default token_pattern (r"(?u)\b\w\w+\b") the
# analyzer only emits tokens of 2+ word characters, so vocabulary entries such as
# '?', '.', '!' or single-character words would get all-zero columns - confirm
# whether a custom token_pattern/tokenizer was intended.
vec = TfidfVectorizer(vocabulary=vocabulary)

In [58]:
X = vec.fit_transform(X)

In [59]:
X.shape

(6079, 8277)

In [62]:
# dropping same 3 rows from x as in notebook 2
# (to_drop holds positional row indices; X.toarray() converts the TF-IDF
# matrix to a dense numpy array before np.delete removes those rows)

to_drop = [1297, 2094, 2748]

X = np.delete(X.toarray(), to_drop, axis = 0)

X.shape

(6076, 8277)

In [63]:
# pickling

import pickle

with open('../datasets/tfidf_inputs.pickle', 'wb') as f:
    pickle.dump(X, f)