#Read Data and feature engineering

In [2]:
import sqlite3
import pickle
import pandas as pd

In [17]:
conn = sqlite3.connect('../Data/crossvalidated.db')

In [18]:
# return all the records for questions posts from posts table
ques_query = "SELECT * FROM [posts] WHERE PostTypeId==2"

In [19]:
apost_df = pd.read_sql_query(ques_query, conn)

In [20]:
apost_df.shape

(74331, 21)

In [21]:
apost_df.columns

Index([u'Body', u'ViewCount', u'LastEditorDisplayName', u'ClosedDate',
       u'CommunityOwnedDate', u'LastEditorUserId', u'ParentID',
       u'LastEditDate', u'CommentCount', u'AnswerCount', u'AcceptedAnswerId',
       u'Score', u'OwnerDisplayName', u'Title', u'PostTypeId', u'OwnerUserId',
       u'Tags', u'CreationDate', u'FavoriteCount', u'Id', u'LastActivityDate'],
      dtype='object')

In [22]:
apost_df.drop(['LastEditorDisplayName','CommunityOwnedDate','LastEditorUserId','LastEditDate',
             'LastActivityDate'],axis=1,inplace=True)

In [23]:
#no closed date for answer
apost_df[apost_df.ClosedDate.isnull()].shape

(74331, 16)

In [9]:
bins = [-36, -1, 0, 2, 15, 260]
group_names = ['bad','neutral','satisfactory','good','awesome']
apost_df['AnsQuality']= pd.cut(apost_df['Score'],bins,labels=group_names)

In [10]:
apost_df.AnsQuality.value_counts()

satisfactory    32360
good            25736
neutral         13579
awesome          1759
bad               897
dtype: int64

In [8]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from bs4 import BeautifulSoup
from textblob import TextBlob, Word
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
import re
import string

In [4]:
stop_words = pd.read_csv("../Data/stoplist copy.csv",header=None)

In [6]:
stop_words = stop_words[0].tolist()

In [9]:
stop_words = set(stop_words + stopwords.words('english') + ["n't", "'s", "'m", "ca"] + list(ENGLISH_STOP_WORDS))

In [13]:
with open('../Data/stop_words.pickle', 'rb') as handle:
  stop_words = pickle.load(handle)

In [10]:
#still may want to hand craft a little bit
stop_words = stop_words.union(set(['don','le', 'isthe', 'likeif','ll','ve','cohen','se','setof','isn']))

In [11]:
#be careful , like p-value, t-distribution
stop_words = stop_words - set(['p','t'])

In [12]:
import pickle
with open('../Data/stop_words.pickle', 'wb') as handle:
    pickle.dump(stop_words, handle)

In [24]:
verb_exp = set(['VB', 'VBZ', 'VBP', 'VBD','VBN','VBG'])
#porter_stemmer = PorterStemmer()
def clean_text(row):
    soup = BeautifulSoup(row, 'html.parser')
    #remove code
    for tag in soup.find_all('code'):
        tag.replaceWith(' ')
        
    raw = soup.get_text().lower()
    #remove link
    raw_no_link = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', raw)
    #remove email
    no_link_email = re.sub(r'[\w\.-]+@[\w\.-]+[\.][com|org|ch|uk]{2,3}', "", raw_no_link)
    #remove whitespace
    tab_text = '\t\n\r\x0b\x0c'
    no_link_email_space = "".join([ch for ch in no_link_email if ch not in set(tab_text)])
    #remove fomula
    reg = '(\$.+?\$)|((\\\\begin\{.+?\})(.+?)(\\\\end\{(.+?)\}))'
    raw = re.sub(reg, "", no_link_email_space, flags=re.IGNORECASE)   
    #remove numbers
    raw = re.sub('[0-9]+?', ' ', raw) 
    # remove punctuation
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    raw = regex.sub(' ', raw)
    #clean out the characters left out after the above step, like we’re, I’m, It’s, i.e., isn't
    raw = re.sub('( s )|( re )|( m )|( i e )|(n t )',' ',raw) 
    
    # lementize
    row_t = TextBlob(raw)
    raw = []
    for word, pos in row_t.tags:
        if pos in verb_exp:
            word = Word(word)
            word = word.lemmatize("v")
        else:
            word = Word(word)
            word = word.lemmatize()
        raw.append(word)
    clean = ' '.join(raw)   
    
    # remove stop words
    cleaned_text = " ".join([word for word in word_tokenize(clean) if word not in stop_words]) 
     
    return(cleaned_text)

In [56]:
apost_df['Body'][0]

u'<p>The R-project</p>\n\n<p><a href="http://www.r-project.org/">http://www.r-project.org/</a></p>\n\n<p>R is valuable and significant because it was the first widely-accepted Open-Source alternative to big-box packages.  It\'s mature, well supported, and a standard within many scientific communities.</p>\n\n<ul>\n<li><a href="http://www.inside-r.org/why-use-r">Some reasons why it is useful and valuable</a> </li>\n<li>There are some nice tutorials <a href="http://gettinggeneticsdone.blogspot.com/search/label/ggplot2">here</a>.</li>\n</ul>\n'

In [57]:
clean_text(apost_df['Body'][0])

u'r projectr valuable significant widely accept open source alternative big box package mature support standard scientific community reason valuable nice tutorial'

In [25]:
# get the cleaned body by removing stopwords and punctuation 
body_clean_sto_pun = apost_df['Body'].map(lambda i: clean_text(i))

In [67]:
body_clean_sto_pun[6]

u'project spring mind bug pain bayesian statistic user focus model bit mcmc bioconductor popular statistical tool bioinformatics r repository large number people learn r bioconductor number package cut edge analysis make'

In [59]:
type(body_clean_sto_pun)

pandas.core.series.Series

In [26]:
import pickle
with open('../Data/ans_clean_text.pickle', 'wb') as handle:
    pickle.dump(body_clean_sto_pun, handle)

In [62]:
ans_quality = apost_df['AnsQuality'] 

In [63]:
import pickle
with open('../Data/ans_quality.pickle', 'wb') as handle:
    pickle.dump(ans_quality, handle)