In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
big_stack = pd.read_csv('data/clean_stackoverflow_big.csv')

In [3]:
big_stack.head()

Unnamed: 0,question,summary,tags,user_id,no_of_votes,no_of_answers,date
0,multiple json objects into a list,"{""Id"": 1,""product"": ""Mobile"",""price"": 32889,""d...","['python', 'json']",diveinsky,0.0,1.0,2019-08-02
1,How to implement like button concept in viewpa...,I have a viewpager which displays images and b...,"['android', 'android-viewpager']",Marat Zangiev,1.0,1.0,2019-08-02
2,Is it a good practice using Observable with as...,I am using angular 2 common http that return a...,"['javascript', 'angular', 'typescript', 'promi...",Ashwin J Chhetri,48.0,2.0,2019-08-02
3,UBSan: Store to misaligned address; what is th...,"I've been running some code under UBSan, and f...",['ubsan'],Flamefire,0.0,1.0,2019-08-02
4,Django-Filter Form Displaying All Filters?,I've just started working with Django Filter. ...,"['django-filter', 'django-filters']",udeep shrestha,0.0,2.0,2019-08-02


In [4]:
big_stack.shape

(16955385, 7)

In [5]:
big_stack.isna().sum()

question            0
summary             2
tags                0
user_id          1235
no_of_votes       594
no_of_answers     594
date             1236
dtype: int64

### Data Prep & Feature Extraction

In [6]:
big_stack = big_stack.dropna()

In [7]:
# Target: Determining whether the question had been answered

big_stack['answers'] = (big_stack['no_of_answers'] >= 1).astype(int)

In [8]:
# Simple Feature 1: Determining the length of the question

big_stack['question_length'] = big_stack['question'].apply(lambda x : len(x))

In [9]:
# Simple Feature 2: Determining the length of the question summary

big_stack['summary_length'] = big_stack['summary'].apply(lambda x : len(x))

In [10]:
# Simple Feature 3: Does the Summary contain a code snippet?

def code_find(summary):
    
    if '\n\n' in summary:
        return 1
    else: 
        return 0 

In [11]:
big_stack['code_found'] = big_stack['summary'].apply(code_find)

In [12]:
# Simple Feature 4: Extracting the number of tags

big_stack['tags_count'] = big_stack['tags'].apply(lambda x : x.count(',')+1)

In [13]:
big_stack = big_stack.drop(columns =['summary','tags', 'user_id', 'no_of_votes', 'no_of_answers', 'date'])

In [14]:
big_stack.head()

Unnamed: 0,question,answers,question_length,summary_length,code_found,tags_count
0,multiple json objects into a list,1,33,204,0,2
1,How to implement like button concept in viewpa...,1,50,203,1,2
2,Is it a good practice using Observable with as...,1,56,201,1,5
3,UBSan: Store to misaligned address; what is th...,1,74,202,1,1
4,Django-Filter Form Displaying All Filters?,1,42,202,1,2


### Model Set Up

In [15]:
# Train Test Split

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
big_stack.columns

Index(['question', 'answers', 'question_length', 'summary_length',
       'code_found', 'tags_count'],
      dtype='object')

In [18]:
X = big_stack.index

train, test = train_test_split(X, test_size=0.25, random_state=101)

In [19]:
big_stack.loc[train,'split'] = 'train' 
big_stack.loc[test,'split'] = 'test'

In [20]:
big_stack.head(12)

Unnamed: 0,question,answers,question_length,summary_length,code_found,tags_count,split
0,multiple json objects into a list,1,33,204,0,2,train
1,How to implement like button concept in viewpa...,1,50,203,1,2,train
2,Is it a good practice using Observable with as...,1,56,201,1,5,train
3,UBSan: Store to misaligned address; what is th...,1,74,202,1,1,train
4,Django-Filter Form Displaying All Filters?,1,42,202,1,2,train
5,Run a script from inside a shell script,1,39,201,1,2,train
6,How to show connection line in d3.js map ('Err...,0,86,197,1,2,train
7,How do I upload a R dataframe as a CSV file on...,1,66,201,0,3,train
8,How can I set the version of a NuGet nuspec <d...,0,79,200,0,2,train
9,adding a custom header to ray wenderlich Pinte...,1,57,203,0,3,train


In [21]:
# Extracting features only
# [Turning pd.DataFrame into sciPy Sparse Matrix]

In [22]:
sparse_feat = big_stack.iloc[:, 1:7]

In [23]:
sparse_feat.head()

Unnamed: 0,answers,question_length,summary_length,code_found,tags_count,split
0,1,33,204,0,2,train
1,1,50,203,1,2,train
2,1,56,201,1,5,train
3,1,74,202,1,1,train
4,1,42,202,1,2,train


In [24]:
sparse_feat_train = sparse_feat[sparse_feat['split']== 'train']
sparse_feat_test = sparse_feat[sparse_feat['split']== 'test']

In [25]:
sparse_feat_train = sparse_feat_train.drop(columns = ['answers', 'split'])
sparse_feat_test = sparse_feat_test.drop(columns = ['answers', 'split'])

In [26]:
sparse_feat_train.head()

Unnamed: 0,question_length,summary_length,code_found,tags_count
0,33,204,0,2
1,50,203,1,2
2,56,201,1,5
3,74,202,1,1
4,42,202,1,2


In [27]:
# Scaling the new features

In [28]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler().fit(sparse_feat_train)

feat_train = scaler.transform(sparse_feat_train)
feat_test = scaler.transform(sparse_feat_test)

In [29]:
# Convert Scaled features from numpy array to sparse matrix

In [30]:
from scipy.sparse import csr_matrix

sp_feat_train = csr_matrix(feat_train)
sp_feat_test = csr_matrix(feat_test)

In [31]:
sp_feat_train

<12714684x4 sparse matrix of type '<class 'numpy.float64'>'
	with 50858736 stored elements in Compressed Sparse Row format>

### Model w/o additonal features - TFIDF Vectorizer followed by Logistic Regression

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [33]:
idx_train = big_stack['split'] == 'train'
idx_test = big_stack['split'] == 'test'

In [34]:
def tokenizer(text):
    
    found_tokens = re.findall(r'[a-z0-9]+', text.lower())
    
    tokens_list = [tokens for tokens in found_tokens if len(tokens) <= 15]
    
    return tokens_list

In [35]:
pipeline = Pipeline(steps=[('vectorizer', TfidfVectorizer(tokenizer=tokenizer, stop_words = 'english')), 
                          ('logreg', LogisticRegression(class_weight = 'balanced'))])

In [36]:
X_train = big_stack.loc[idx_train, 'question'].values
X_test = big_stack.loc[idx_test, 'question'].values

y_train = big_stack.loc[idx_train, 'answers'].values
y_test = big_stack.loc[idx_test, 'answers'].values

In [37]:
%%time
pipeline.fit(X_train, y_train)



CPU times: user 17min 31s, sys: 14.2 s, total: 17min 46s
Wall time: 12min 53s


Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token...\\w+\\b',
                                 tokenizer=<function tokenizer at 0x170edc0d0>,
                                 use_idf=True, vocabulary=None)),
                ('logreg',
                 LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                                    fit_

In [38]:
%%time
y_pred = pipeline.predict(X_test)

CPU times: user 1min, sys: 2.34 s, total: 1min 2s
Wall time: 1min 2s


In [39]:
# Model Interpretation

In [40]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [41]:
accuracy_score(y_test, y_pred)

0.6212976272159025

In [42]:
confusion_matrix(y_test, y_pred)

array([[ 408958,  248183],
       [1356844, 2224243]])

In [43]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.23      0.62      0.34    657141
           1       0.90      0.62      0.73   3581087

    accuracy                           0.62   4238228
   macro avg       0.57      0.62      0.54   4238228
weighted avg       0.80      0.62      0.67   4238228



### Model w/ additonal features

In [44]:
# Adding the prepped features

In [65]:
from scipy.sparse import hstack

In [68]:
X_train = hstack((X_train, sp_feat_train), format='csr')
X_test = hstack((X_test, sp_feat_test), format='csr')

ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 12714684, expected 1.

In [53]:
X_train.shape

(12714684,)

In [54]:
sp_feat_train.shape

(12714684, 4)

In [None]:
%%time
pipeline.fit(X_train, y_train)

In [None]:
%%time
y_pred = pipeline.predict(X_test)

In [None]:
# Model Interpretation

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred))