In [12]:
import pandas as pd
import sqlite3
import csv
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import datetime as dt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.metrics import f1_score,precision_score,recall_score
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from datetime import datetime
from tqdm import tqdm
import scipy.sparse
import pickle
import scipy.sparse as sp

import warnings
warnings.filterwarnings("ignore")

In [2]:
def striphtml(data):
    """Replace every HTML/XML-style tag in ``data`` with a single space.

    Parameters
    ----------
    data : str, bytes, bytearray, or any object
        Text to clean. ``bytes``/``bytearray`` input is decoded as UTF-8
        (undecodable bytes become U+FFFD); anything else that is not already
        a string is converted with ``str()``.

    Returns
    -------
    str
        ``data`` with each ``<...>`` span replaced by ``' '``. The pattern is
        non-greedy and ``.`` does not match newlines, so a tag that spans
        several lines is left untouched.
    """
    if isinstance(data, (bytes, bytearray)):
        # str(b"...") returns the repr "b'...'", which prefixes the text with
        # b' and turns every non-ASCII character into \xNN escapes. Decode
        # instead so the real characters survive.
        data = data.decode('utf-8', errors='replace')
    cleanr = re.compile('<.*?>')
    return re.sub(cleanr, ' ', str(data))

# English stop words from the NLTK corpus, held in a set so the per-token
# membership tests in preprocess_title / preprocess_body are cheap.
stop_words = set(stopwords.words('english'))
# English Snowball stemmer applied to every token that survives filtering.
stemmer = SnowballStemmer("english")

def preprocess_title(Title):
    """Normalise a question title into a space-separated string of stems.

    Steps, in order: replace HTML tags with spaces, replace every run of
    characters outside ``[A-Za-z0-9#+.-]`` with one space, lowercase,
    tokenize with NLTK, drop English stop words and single-character tokens
    (except ``c`` and ``r``), then stem each remaining token.

    Parameters
    ----------
    Title : str
        Raw title text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    # The text is passed on as str. Encoding it to bytes first made the
    # downstream str() call produce the repr "b'...'", which turned non-ASCII
    # characters into \xNN escape fragments ("xc3", "xa9", ...) in the output.
    # NOTE(review): if the pickled vectorizer was fit on text produced by the
    # old bytes-repr path, non-ASCII titles may now yield different tokens
    # than at training time -- confirm against the training notebook.
    Title = striphtml(Title)
    Title = re.sub(r'[^A-Za-z0-9#+.\-]+', ' ', Title)
    words = word_tokenize(Title.lower())
    return ' '.join(
        str(stemmer.stem(j))
        for j in words
        if j not in stop_words and (len(j) != 1 or j == 'c' or j == 'r')
    )

def preprocess_body(Body, max_symbol_repeats=5, max_token_repeats=10):
    """Split a question body into code tokens and normalised question text.

    The contents of every ``<code>...</code>`` span are concatenated (with no
    separator) and tokenised character by character; the remaining text is
    cleaned the same way as a title.

    Parameters
    ----------
    Body : str
        Raw HTML body of the question.
    max_symbol_repeats : int, optional
        How many times each punctuation symbol from the code may be emitted
        as its own token; later occurrences only act as separators.
    max_token_repeats : int, optional
        How many times each distinct code token (word or symbol) is kept.

    Returns
    -------
    tuple of (str, str)
        ``(code_tokens, question)`` -- note the order: code first, then the
        stemmed question text. Both are space-separated strings.
    """
    code_pattern = '<code>(.*?)</code>'
    code_flags = re.MULTILINE | re.DOTALL

    # findall returns [] when there is no <code> span, so the join simply
    # yields "" and no explicit "is there code?" guard is required.
    code_str = "".join(re.findall(code_pattern, Body, flags=code_flags))
    question = re.sub(code_pattern, '', Body, flags=code_flags)

    # The text is passed on as str. Encoding it to bytes first made the
    # downstream str() call produce the repr "b'...'", which turned non-ASCII
    # characters into \xNN escape fragments in the cleaned question.
    # NOTE(review): if the pickled vectorizer was fit on text produced by the
    # old bytes-repr path, non-ASCII bodies may now yield different tokens
    # than at training time -- confirm against the training notebook.
    question = striphtml(question)
    question = re.sub(r'[^A-Za-z0-9#+.\-]+', ' ', question)
    words = word_tokenize(question.lower())
    question = ' '.join(
        str(stemmer.stem(j))
        for j in words
        if j not in stop_words and (len(j) != 1 or j == 'c' or j == 'r')
    )

    # Characters that break words in code. A space is in the set too, so it
    # always separates words (its own "token" is pure whitespace).
    symbols = frozenset(" .(){}[]_-/$&<>:;\\'?!@#%^*+=|")

    pieces = []
    symbol_counts = {}
    for raw in code_str:
        # Lowercase one character at a time: a few characters lowercase to
        # more than one code point, and those must be tested as a unit.
        ch = raw.lower()
        if ch.isalpha():
            pieces.append(ch)
        if ch in symbols:
            seen = symbol_counts.get(ch, 0)
            if seen < max_symbol_repeats:
                symbol_counts[ch] = seen + 1
                pieces.append(" " + ch + " ")
            else:
                # Over the cap: still split the surrounding letters.
                pieces.append(" ")
        # Any other character (digits, commas, double quotes, newlines, ...)
        # is dropped without a separator, so the letters on either side join.

    token_counts = {}
    kept = []
    for token in "".join(pieces).split():
        seen = token_counts.get(token, 0)
        if seen < max_token_repeats:
            token_counts[token] = seen + 1
            kept.append(token)

    return ' '.join(kept), question

In [3]:
def get_tags(title, code, question):
    """Predict the tags for one preprocessed question.

    The title is repeated three times ahead of the question text and the code
    tokens, the combined string is vectorised with the module-level
    ``tf1_new`` and classified with the module-level ``model``.

    Parameters
    ----------
    title, code, question : str
        Preprocessed title, code tokens and question text.

    Returns
    -------
    list
        Entries of the module-level ``tags`` whose predicted indicator in the
        first (only) row of the prediction equals 1, in ``tags`` order.
    """
    document = " ".join([title, title, title, question, code])
    features = tf1_new.transform([document])
    indicators = model.predict(features).toarray()[0]
    return [tag for tag, flag in zip(tags, indicators) if flag == 1]

### Loading all models and files

In [14]:
# NOTE(security): pickle.load can execute arbitrary code while unpickling.
# Only point these paths at artifacts you produced or otherwise trust.

# Whitespace-separated tag names; only the first 100 are kept.
# NOTE(review): presumably 100 matches the number of label columns the model
# predicts -- confirm against the training notebook.
with open('model/tags_list.txt', 'r') as f:
    tags = f.read().split()
tags = tags[:100]
print("Tags Loaded!")

with open("model/LR_tfidf_3title_question_code_model.pkl",'rb') as f:
    model = pickle.load(f)
print("Model Loaded!")

# Context managers close the pickle files deterministically instead of
# leaving the handles open until garbage collection.
with open("model/x_tfidf_train_multilabel_vocal.pickle", 'rb') as f:
    tf1_vocal = pickle.load(f)
with open("model/x_tfidf_train_multilabel_idf.pickle", 'rb') as f:
    tf1_idf = pickle.load(f)

# Rebuild the vectorizer from its saved vocabulary and IDF weights rather
# than unpickling the whole object (the tokenizer is a lambda).
tf1_new = TfidfVectorizer(min_df=0.00009, max_features=400000, tokenizer = lambda x: x.split(), ngram_range=(1,4), vocabulary=tf1_vocal)
# NOTE(review): this writes a private scikit-learn attribute and therefore
# depends on the installed version's internals; verify after any upgrade.
tf1_new._tfidf._idf_diag = sp.spdiags(tf1_idf, diags = 0, m = len(tf1_idf), n = len(tf1_idf))

print("Vectorizer loaded!")

Tags Loaded!
Model Loaded!
Vectorizer loaded!


### Taking input Title and Body and Predicting Tags:

In [15]:
# Read the raw title and body. input() returns a single line, so the whole
# HTML body has to be pasted as one line.
print("Enter Title")
title = input()
print("Enter Body")
body = input()

# From here on `title` holds the preprocessed text, not the raw input.
title = preprocess_title(title)
# preprocess_body returns (code tokens, question text), in that order.
code,question = preprocess_body(body)

# Vectorise the combined text and collect the tags the model predicts.
output = get_tags(title, code, question)

print()
print("TAGS ASSOCIATED ARE:")
print(output)

Enter Title
How to fetch an XML feed using asp.net
Enter Body
<p>I've decided to convert a Windows Phone 7 app that fetches an XML feed and then parses it to an asp.net web app, using Visual Web Developer Express. I figure since the code already works for WP7, it should be a matter of mostly copying and pasting it for the C# code behind. </p>  <pre><code>HttpWebRequest request = HttpWebRequest.CreateHttp("http://webservices.nextbus.com/service/publicXMLFeed?command=routeConfig&amp;a=sf-muni&amp;r=" + line1); </code></pre>  <p>That's the first line of code from my WP7 app that fetches the XML feed, but I can't even get HttpWebRequest to work in Visual Web Developer like that. Intellisense shows a create and createdefault, but no CreateHttp like there was in Windows Phone 7. I just need to figure out how to fetch the page, I assume the parsing will be the same as on my phone app. Any help?</p>  <p>Thanks,</p>  <p>Amanda</p>

TAGS ASSOCIATED ARE:
['c#', 'asp.net']


In [28]:
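# Sample input for the prediction cell above (paste each part at its prompt).
# Title:
# How to fetch an XML feed using asp.net
#
# Body (a single line):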
# <p>I've decided to convert a Windows Phone 7 app that fetches an XML feed and then parses it to an asp.net web app, using Visual Web Developer Express. I figure since the code already works for WP7, it should be a matter of mostly copying and pasting it for the C# code behind. </p>  <pre><code>HttpWebRequest request = HttpWebRequest.CreateHttp("http://webservices.nextbus.com/service/publicXMLFeed?command=routeConfig&amp;a=sf-muni&amp;r=" + line1); </code></pre>  <p>That's the first line of code from my WP7 app that fetches the XML feed, but I can't even get HttpWebRequest to work in Visual Web Developer like that. Intellisense shows a create and createdefault, but no CreateHttp like there was in Windows Phone 7. I just need to figure out how to fetch the page, I assume the parsing will be the same as on my phone app. Any help?</p>  <p>Thanks,</p>  <p>Amanda</p>