In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Load the cleaned Stack Overflow questions export into a DataFrame.
csv_path = 'data/clean_stackoverflow_big.csv'
big_stack = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows to check how the columns were read.
big_stack.head()

Unnamed: 0,question,summary,tags,user_id,no_of_votes,no_of_answers,date
0,multiple json objects into a list,"{""Id"": 1,""product"": ""Mobile"",""price"": 32889,""d...","['python', 'json']",diveinsky,0.0,1.0,2019-08-02
1,How to implement like button concept in viewpa...,I have a viewpager which displays images and b...,"['android', 'android-viewpager']",Marat Zangiev,1.0,1.0,2019-08-02
2,Is it a good practice using Observable with as...,I am using angular 2 common http that return a...,"['javascript', 'angular', 'typescript', 'promi...",Ashwin J Chhetri,48.0,2.0,2019-08-02
3,UBSan: Store to misaligned address; what is th...,"I've been running some code under UBSan, and f...",['ubsan'],Flamefire,0.0,1.0,2019-08-02
4,Django-Filter Form Displaying All Filters?,I've just started working with Django Filter. ...,"['django-filter', 'django-filters']",udeep shrestha,0.0,2.0,2019-08-02


In [4]:
# (rows, columns) of the raw frame, before any rows are dropped.
big_stack.shape

(16955385, 7)

In [5]:
# Number of missing values in each column.
big_stack.isna().sum()

question            0
summary             2
tags                0
user_id          1235
no_of_votes       594
no_of_answers     594
date             1236
dtype: int64

In [6]:
# Discard every row that has a missing value in any column.
big_stack = big_stack.dropna(axis=0, how='any')

In [7]:
# Target: 1 when the question received at least one answer, otherwise 0.
big_stack['answers'] = big_stack['no_of_answers'].ge(1).astype(int)

In [8]:
# Preview the frame again to confirm the new 'answers' column is present.
big_stack.head()

Unnamed: 0,question,summary,tags,user_id,no_of_votes,no_of_answers,date,answers
0,multiple json objects into a list,"{""Id"": 1,""product"": ""Mobile"",""price"": 32889,""d...","['python', 'json']",diveinsky,0.0,1.0,2019-08-02,1
1,How to implement like button concept in viewpa...,I have a viewpager which displays images and b...,"['android', 'android-viewpager']",Marat Zangiev,1.0,1.0,2019-08-02,1
2,Is it a good practice using Observable with as...,I am using angular 2 common http that return a...,"['javascript', 'angular', 'typescript', 'promi...",Ashwin J Chhetri,48.0,2.0,2019-08-02,1
3,UBSan: Store to misaligned address; what is th...,"I've been running some code under UBSan, and f...",['ubsan'],Flamefire,0.0,1.0,2019-08-02,1
4,Django-Filter Form Displaying All Filters?,I've just started working with Django Filter. ...,"['django-filter', 'django-filters']",udeep shrestha,0.0,2.0,2019-08-02,1


In [9]:
# Train Test Split

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# List the column names available at this point.
big_stack.columns

Index(['question', 'summary', 'tags', 'user_id', 'no_of_votes',
       'no_of_answers', 'date', 'answers'],
      dtype='object')

In [12]:
# Split the row labels (not the rows themselves) 75% / 25%; the fixed
# random_state makes the shuffle repeatable.
X = big_stack.index
train, test = train_test_split(X, random_state=101, test_size=0.25)

In [13]:
# Record each row's side of the split in a 'split' column, train rows first.
for split_name, row_labels in (('train', train), ('test', test)):
    big_stack.loc[row_labels, 'split'] = split_name

In [14]:
# Model & Pipeline

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [16]:
# Boolean row masks selecting each side of the split.
split_labels = big_stack['split']
idx_train, idx_test = split_labels.eq('train'), split_labels.eq('test')

In [17]:
# Compiled once up front; tokenizer() is called for every document.
_TOKEN_PATTERN = re.compile(r'[a-z0-9]+')


def tokenizer(text, max_len=15):
    """Split text into lowercase alphanumeric tokens.

    The text is lowercased, each maximal run of the characters a-z and 0-9
    becomes one token, and tokens longer than ``max_len`` characters are
    discarded.

    Parameters
    ----------
    text : str
        Raw text to tokenize.
    max_len : int, optional
        Longest token, in characters, that is kept. Defaults to 15.

    Returns
    -------
    list of str
        The kept tokens, in order of appearance.
    """
    return [token for token in _TOKEN_PATTERN.findall(text.lower())
            if len(token) <= max_len]

In [18]:
# Features are the raw 'question' strings; labels are the 0/1 'answers' flag.
X_train, X_test = (big_stack.loc[mask, 'question'].values
                   for mask in (idx_train, idx_test))
y_train, y_test = (big_stack.loc[mask, 'answers'].values
                   for mask in (idx_train, idx_test))

In [19]:
# Two-step text classifier: TF-IDF features built with the custom tokenizer
# (English stop words removed), followed by logistic regression using
# class_weight='balanced'.
tfidf_step = TfidfVectorizer(tokenizer=tokenizer, stop_words='english')
logreg_step = LogisticRegression(class_weight='balanced')
pipeline = Pipeline(steps=[('vectorizer', tfidf_step),
                           ('logreg', logreg_step)])

In [20]:
%%time
# Fit both pipeline steps on the training questions and their labels.
pipeline.fit(X_train, y_train)



CPU times: user 20min 24s, sys: 36.1 s, total: 21min
Wall time: 16min 42s


Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token...\\w+\\b',
                                 tokenizer=<function tokenizer at 0x16b994048>,
                                 use_idf=True, vocabulary=None)),
                ('logreg',
                 LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                                    fit_

In [21]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [22]:
%%time
# Predict the 0/1 'answers' label for every held-out test question.
y_pred = pipeline.predict(X_test)

CPU times: user 1min 9s, sys: 4.96 s, total: 1min 14s
Wall time: 1min 20s


In [23]:
# Fraction of test questions whose label was predicted correctly.
accuracy_score(y_test, y_pred)

0.6212976272159025

In [24]:
# Counts of true class (rows) versus predicted class (columns) on the test set.
confusion_matrix(y_test, y_pred)

array([[ 408958,  248183],
       [1356844, 2224243]])

In [25]:
# Per-class precision, recall and F1 on the test set; print() keeps the
# report's text layout.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.23      0.62      0.34    657141
           1       0.90      0.62      0.73   3581087

    accuracy                           0.62   4238228
   macro avg       0.57      0.62      0.54   4238228
weighted avg       0.80      0.62      0.67   4238228



### Model Demo

In [26]:
# Let's try the model out: type a question and see whether it predicts that the question will be answered.

In [27]:
from lime.lime_text import LimeTextExplainer
import ipywidgets as widgets
from IPython.display import display

In [31]:
# Free-text input box for the question to score, rendered below the cell.
text = widgets.Text(
    description='Question:',
    placeholder='Type your Question',
    value='',
    disabled=False,
)
display(text)


def answer_probability(wdgt):
    """Explain the model's prediction for the question typed into a widget.

    Reads ``wdgt.value``, asks a LIME text explainer to explain
    ``pipeline.predict_proba`` for that text using 6 features, and renders
    the explanation in the notebook. An empty or whitespace-only value is
    ignored.

    Parameters
    ----------
    wdgt : object with a ``value`` attribute
        The widget holding the question text.

    Returns
    -------
    None
    """
    new_text = wdgt.value

    # An empty submission contains no words for the explainer to work with,
    # so skip the explanation instead of passing it on.
    if not new_text or not new_text.strip():
        return

    # Class names are listed in label order: 0 -> 'No answer', 1 -> 'Answer'.
    explainer = LimeTextExplainer(class_names=['No answer','Answer'])
    exp = explainer.explain_instance(new_text,
                                     pipeline.predict_proba,
                                     num_features=6)

    exp.show_in_notebook(text = True)
    
# Register answer_probability as the text box's submit callback.
text.on_submit(answer_probability)

Text(value='', description='Question:', placeholder='Type your Question')