#### Imports

In [1]:
#downloading data imports
from __future__ import print_function, division
import requests
import re
import wget
import pandas as pd
import numpy as np
import xmltodict
import warnings
import itertools
from bs4 import BeautifulSoup
warnings.filterwarnings('ignore')
#LSTM model imports
import os
import nltk
import gensim
from gensim import corpora, models, similarities
from nltk.tokenize import word_tokenize, wordpunct_tokenize, WhitespaceTokenizer
import pickle
from nltk.stem.wordnet import WordNetLemmatizer

#### Helper functions

In [3]:
def get_urls(site, keyword_string):
    """Collect archive.org links from the anchors of a page.

    Parameters
    ----------
    site : str
        URL of the page to scrape; it is fetched and parsed with ``to_soup``.
    keyword_string : str
        Regular expression matched against each anchor's CSS class.

    Returns
    -------
    list of str
        ``'https://archive.org'`` prepended to the ``href`` of every matching
        anchor, in the order ``find_all`` returned them.
    """
    soup = to_soup(site)
    anchors = soup.find_all('a', class_=re.compile(keyword_string))
    # An anchor without an href attribute is skipped rather than raising KeyError.
    return ['https://archive.org' + anchor['href']
            for anchor in anchors if anchor.has_attr('href')]

def to_soup(url, timeout=30):
    """Fetch a page and parse it with BeautifulSoup's lxml parser.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a stalled connection would block forever.

    Returns
    -------
    bs4.BeautifulSoup
        Parsed document built from the response text.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so an error page is
        never parsed as if it were the requested content.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, "lxml")
def postsxmltodf(path):
    """Split a Stack Exchange ``Posts.xml`` file into question and answer frames.

    Parameters
    ----------
    path : str
        Location of the XML file; a leading ``~`` is expanded to the home
        directory.

    Returns
    -------
    qdf : pandas.DataFrame
        One row per post whose ``PostTypeId`` is ``'1'``; columns ``ID``,
        ``PostTypeID``, ``Title``, ``Body``.
    adf : pandas.DataFrame
        One row per post whose ``PostTypeId`` is ``'2'``; columns ``ID``,
        ``PostTypeID``, ``parentID``, ``Body``.

    Posts of any other type are ignored. ``Title`` and ``Body`` are reduced
    to their text content (markup stripped).
    """
    def plain_text(markup):
        # Name the parser explicitly (the same one to_soup uses) so the result
        # does not depend on which parsers happen to be installed.
        return BeautifulSoup(markup, "lxml").text

    questionlist = []
    answerlist = []
    # open() does not expand '~' by itself, and binary mode lets the XML parser
    # decode the file instead of relying on the platform's default encoding.
    with open(os.path.expanduser(path), 'rb') as fd:
        doc = xmltodict.parse(fd)

    rows = doc['posts']['row']
    if isinstance(rows, dict):
        # A file with a single <row> is parsed to one mapping, not a list.
        rows = [rows]

    for row in rows:
        PostTypeId = row['@PostTypeId']
        ID = row['@Id']
        if PostTypeId == '1':
            questionlist.append(
                (ID, PostTypeId, plain_text(row['@Title']), plain_text(row['@Body'])))
        elif PostTypeId == '2':
            answerlist.append(
                (ID, PostTypeId, row['@ParentId'], plain_text(row['@Body'])))

    qdf = pd.DataFrame(questionlist, columns=['ID','PostTypeID', 'Title', 'Body'])
    adf = pd.DataFrame(answerlist, columns=['ID','PostTypeID', 'parentID', 'Body'])
    return qdf, adf


## Getting Download Links and Retrieving Data

#### Script to download individual data dump files

In [22]:
# Landing page of the Stack Exchange data dump on archive.org.
url = 'https://archive.org/details/stackexchange'

# get_urls already returns a list, so it is used directly.
urllist = get_urls(url, 'stealth download-pill')

In [24]:
# Preview the first ten scraped links.
urllist[:10]

['https://archive.org/download/stackexchange/3dprinting.stackexchange.com.7z',
 'https://archive.org/download/stackexchange/academia.stackexchange.com.7z',
 'https://archive.org/download/stackexchange/ai.stackexchange.com.7z',
 'https://archive.org/download/stackexchange/android.stackexchange.com.7z',
 'https://archive.org/download/stackexchange/anime.stackexchange.com.7z',
 'https://archive.org/download/stackexchange/apple.stackexchange.com.7z',
 'https://archive.org/download/stackexchange/arabic.stackexchange.com.7z',
 'https://archive.org/download/stackexchange/arduino.stackexchange.com.7z',
 'https://archive.org/download/stackexchange/askubuntu.com.7z',
 'https://archive.org/download/stackexchange/astronomy.stackexchange.com.7z']

In [25]:
# Substring used to pick the wanted site out of the download links.
searchterm = 'lifehack'

In [29]:
# Keep the links that mention the search term, leaving out the "meta" archives.
my_link = [
    link for link in urllist
    if searchterm in link and 'meta' not in link
]

['https://archive.org/download/stackexchange/lifehacks.stackexchange.com.7z']

In [None]:
# Download the first matching archive.
wget.download(my_link[0])

#### Download all data dump files

In [None]:
# Every download link except the "meta" archives.
non_meta_dl_links = [link for link in urllist if 'meta' not in link]

In [None]:
# urllist is a plain list of URL strings (it has no .URL column), so iterate
# over the already filtered non_meta_dl_links instead of DataFrame-style masking.
for link in non_meta_dl_links:
    wget.download(link)

## Processing of downloaded xml file

#### Set the path to the downloaded XML file

In [3]:
# open() does not expand '~', so resolve the home directory up front.
lifehacks = os.path.expanduser('~/lifehacks.stackexchange.com/Posts.xml')

In [4]:
# Parse the posts file into a questions frame and an answers frame.
q_lh_df, a_lh_df = postsxmltodf(lifehacks)

In [5]:
# Suffix the ID / post-type columns with _q / _a so the two frames do not
# share column names; the answers' parent column becomes 'parentqID'.
q_lh_df.columns = ['ID_q', 'PostTypeID_q', 'Title', 'Body']
a_lh_df.columns = ['ID_a', 'PostTypeID_a', 'parentqID', 'Body']

#### Word2vec import and preprocessing

* Using Google word2vec for encoding word vectors

* Splitting the data into two lists (questions and answers) in order to attempt an implementation of the Keras LSTM model by https://github.com/shreyans29/, available here: https://github.com/shreyans29/thesemicolon


In [13]:
# NOTE(review): hard-coded, machine-specific absolute path. After this chdir,
# every relative path used by cells run afterwards resolves against this directory.
os.chdir("/Users/RGD/Downloads/")
# Load the word vectors stored in binary word2vec format from the current directory.
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)  

In [7]:
# Question lookup table keyed by question ID, without the question body.
questions_without_body = q_lh_df.drop('Body', axis=1)
q2df = questions_without_body.set_index('ID_q')

In [8]:
# Attach to each answer the columns of its parent question by looking up
# 'parentqID' in q2df's index.
matcheddf = a_lh_df.join(q2df, on='parentqID')

In [9]:
# After dropping 'parentqID' into the index, position 4 is the question
# 'Title' and position 2 is the answer 'Body'; select them by name instead.
qlist = matcheddf['Title'].tolist()
alist = matcheddf['Body'].tolist()

In [14]:
# Show how many questions and answers were collected.
print(len(qlist),len(alist))

6226 6226


In [20]:
# Holds the answers once their line breaks have been removed.
alistclean = []

In [21]:
# Collapse each answer onto a single line. Non-empty lines are joined with a
# space so the last word of one line is not fused with the first word of the
# next. Rebuilding the list (rather than appending to it) keeps this cell
# safe to re-run.
alistclean = [
    ' '.join(line for line in answer.split('\n') if line)
    for answer in alist
]

In [34]:
# Hold the tokenised questions and answers.
tok_q = []
tok_a = []

In [38]:
# Lower-case and tokenise every question and its answer. The lists are rebuilt
# from scratch, so running this cell again does not append duplicates.
tok_q = [nltk.word_tokenize(question.lower()) for question in qlist]
# Only the answers paired with a question are tokenised.
tok_a = [nltk.word_tokenize(answer.lower()) for answer in alistclean[:len(qlist)]]

In [39]:
def _embed(tokens):
    """Return the word vectors of the tokens that the embedding model knows."""
    # Membership is tested on the model itself: this works with both older and
    # newer gensim releases, whereas the `.vocab` attribute is not available
    # on gensim 4.
    return [model[w] for w in tokens if w in model]


# One list of word vectors per tokenised question / answer.
vec_x = [_embed(sent) for sent in tok_q]
vec_y = [_embed(sent) for sent in tok_a]

In [41]:
# Vector of 300 ones, appended to sentences as an end marker and as padding.
# NOTE(review): 300 presumably matches the word2vec dimensionality; confirm.
sentend=np.ones((300,),dtype=np.float32)

In [42]:
# Keep at most 14 word vectors per question, then close it with the end marker.
for question_vecs in vec_x:
    del question_vecs[14:]
    question_vecs.append(sentend)

In [43]:
# Pad every question up to 15 entries with the end marker.
for question_vecs in vec_x:
    shortfall = 15 - len(question_vecs)
    if shortfall > 0:
        question_vecs.extend([sentend] * shortfall)

In [44]:
# Keep at most 14 word vectors per answer, then close it with the end marker.
for answer_vecs in vec_y:
    del answer_vecs[14:]
    answer_vecs.append(sentend)

In [45]:
# Pad every answer up to 15 entries with the end marker.
for answer_vecs in vec_y:
    shortfall = 15 - len(answer_vecs)
    if shortfall > 0:
        answer_vecs.extend([sentend] * shortfall)

In [None]:
# Persist the question and answer vectors together as one two-element list.
with open('conversation.pickle', 'wb') as pickle_file:
    pickle.dump([vec_x, vec_y], pickle_file)

#### Writing questions and answers to a txt file as alternating pairs for TensorFlow seq2seq wrapper

In [22]:
# Interleave the two lists: questions on even slots, cleaned answers on odd slots.
total_lines = len(qlist) + len(alist)
result = [None] * total_lines
result[0::2] = qlist
result[1::2] = alistclean

In [23]:
# One entry per line: question, answer, question, answer, ...
with open('lifehacks.txt', 'w') as out_file:
    out_file.writelines(entry + '\n' for entry in result)

In [16]:
# Preview only the first few entries rather than dumping the whole list
# into the notebook output.
result[:3]

['How can I safely trim my fingernails without a fingernail clipper?',
 "As you pointed out, a knife is not a good idea. You can use a standard pair of scissors safely though.\nIf you don't have that, then (as much as I hate to say such a thing) bite them.\n",
 'How can I clean my sticky keyboard?']