<center><h1>Stack Overflow Search Engine</h1></center>

In [0]:
# Download NLTK's 'vader_lexicon' resource from inside the notebook (the notebook runs on Google Colab).
# NOTE(review): presumably this is the lexicon SentimentIntensityAnalyzer needs further down — confirm.
import nltk
nltk.download('vader_lexicon')

In [0]:
#importing packages
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import multiprocessing as mp
import heapq
from wordcloud import WordCloud
import gensim
from gensim.models.word2vec import Word2Vec

from google.colab import auth
from google.cloud import bigquery
from google.colab import drive

from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer

<h1>Mounting the Drive </h1>

In [0]:
# Mount Google Drive at /content/drive/ so the data and model folders below are reachable;
# force_remount=True is passed so the call also works when a mount already exists.
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


<h1>Variables</h1>

In [0]:
# Project root on the mounted Drive; the data and model folders are derived from it.
currentDirectory = "/content/drive/My Drive/pcase_study_1/"
dataDirectory = f"{currentDirectory}data/"
modelsDirectory = f"{currentDirectory}models/"

<h1>1. Featurization</h1>

In [0]:
# Load the train and test splits from dataDirectory into DataFrames.
# NOTE(review): createScores later reads a 'total_text' column from these frames —
# presumably both CSV files provide it; confirm against the data-preparation notebook.
X_train = pd.read_csv(dataDirectory + 'train_data.csv')
X_test = pd.read_csv(dataDirectory + 'test_data.csv')

<h2>1.1 Sentiment Analysis</h2>

<p>These features are taken from <a href='https://www.analyticsvidhya.com/blog/2018/02/natural-language-processing-for-beginners-using-textblob/'>Analytics Vidhya</a>.</p>

<p>These features will be used when retrieving similar results, i.e. we will give more priority to data points that have high polarity.</p>


<p>Sentiment analysis is basically the process of determining the attitude or the emotion of the writer, i.e., whether it is positive or negative or neutral.

The sentiment function of textblob returns two properties, polarity, and subjectivity.

Polarity is a float in the range [-1, 1], where 1 means a positive statement and -1 means a negative statement. Subjective sentences generally refer to personal opinion, emotion or judgment, whereas objective sentences refer to factual information. Subjectivity is also a float, in the range [0, 1].</p>

In [0]:
def CreateSentimentFeatures(text):
    """Score ``text`` with TextBlob and return ``[polarity, subjectivity]``.

    Args:
        text: The text to analyse; it is passed straight to ``TextBlob``.

    Returns:
        A two-element list holding the ``polarity`` and ``subjectivity``
        attributes of ``TextBlob(text).sentiment``, in that order.
    """
    blob_sentiment = TextBlob(text).sentiment
    return [blob_sentiment.polarity, blob_sentiment.subjectivity]

<h2>1.2 Polarity Scores</h2>

In [0]:
# we can use these 4 things as features/attributes (neg, neu, pos, compound)
# neg: 0.0, neu: 0.753, pos: 0.247, compound: 0.93
def CreatePolarityFeatures(text):
    """Return the VADER polarity scores of ``text`` as ``[neg, neu, pos, compound]``.

    The ``SentimentIntensityAnalyzer`` is created on the first call only and
    then reused. This function is called once per row of the data set, and
    building a new analyzer for every row repeats the same set-up work
    without changing the scores.

    Args:
        text: The text to score; it is passed to ``polarity_scores``.

    Returns:
        A four-element list with the ``'neg'``, ``'neu'``, ``'pos'`` and
        ``'compound'`` entries of the analyzer's result, in that order.
    """
    # Cache the analyzer on the function object so the block stays self-contained
    # (no extra module-level global is needed in the notebook).
    analyzer = getattr(CreatePolarityFeatures, '_analyzer', None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        CreatePolarityFeatures._analyzer = analyzer

    scores = analyzer.polarity_scores(text)
    return [scores['neg'], scores['neu'], scores['pos'], scores['compound']]

<h2>1.3 Creating the above features on train and test data</h2>

In [0]:
# Add the six score columns to both splits, initialised to 0.0, so they exist
# before createScores fills them in.
X_train = X_train.assign(
    sentiment_polarity=0.0, sentiment_subjectivity=0.0,
    neg=0.0, neu=0.0, pos=0.0, compound=0.0,
)
X_test = X_test.assign(
    sentiment_polarity=0.0, sentiment_subjectivity=0.0,
    neg=0.0, neu=0.0, pos=0.0, compound=0.0,
)

In [0]:
def createScores(data):
    """Fill the TextBlob sentiment and VADER polarity columns for every row of ``data``.

    Each row's ``total_text`` is scored with ``CreateSentimentFeatures`` and
    ``CreatePolarityFeatures``. The results are written to the columns
    ``sentiment_polarity``, ``sentiment_subjectivity``, ``neg``, ``neu``,
    ``pos`` and ``compound`` (overwritten if present, created otherwise).

    Args:
        data: DataFrame with a ``total_text`` column.

    Returns:
        The same DataFrame object that was passed in, modified in place.
    """
    score_columns = (
        'sentiment_polarity', 'sentiment_subjectivity',
        'neg', 'neu', 'pos', 'compound',
    )

    # Iterate over the column values instead of looking rows up with
    # data['total_text'][idx]: that lookup is label-based and raises KeyError
    # whenever the frame's index is not exactly 0..len-1.
    score_rows = [
        list(CreateSentimentFeatures(text)) + list(CreatePolarityFeatures(text))
        for text in data['total_text']
    ]

    # Assign whole columns at once. The previous element-wise chained assignment
    # (data[col][idx] = value) can end up writing to a temporary copy, which
    # pandas reports only through a warning (SettingWithCopyWarning).
    for position, column in enumerate(score_columns):
        data[column] = [row[position] for row in score_rows]

    return data

In [0]:
# Score every row's 'total_text' in both splits; createScores returns the frame it was
# given with the sentiment and polarity columns filled in.
X_train = createScores(X_train)
X_test = createScores(X_test)

In [0]:
# Write the scored frames back to the same CSV files they were loaded from
# (train_data.csv / test_data.csv), replacing the earlier versions.
# index=False keeps the DataFrame index out of the written files.
X_train.to_csv(dataDirectory + 'train_data.csv',encoding='utf-8',index=False)
X_test.to_csv(dataDirectory + 'test_data.csv',encoding='utf-8',index=False)

<h1>2. Training Word2Vec Model</h1>

In [0]:
# Reload the training split from disk; its 'total_text' column is the corpus for Word2Vec below.
data = pd.read_csv(dataDirectory + 'train_data.csv')

In [0]:
# Create the Word2Vec model. No corpus is passed here; the vocabulary is built and the
# model is trained in the explicit steps below.
# NOTE(review): `size=` and `wv.vocab` look like the gensim 3.x API (gensim 4 renamed them
# to `vector_size` and `wv.key_to_index`) — confirm the installed gensim version before re-running.
# NOTE(review): workers=16 is hard-coded; confirm it suits the machine this runs on.
w2v_model = Word2Vec(size=300,window=5,min_count=10, workers=16)

# Tokenise each document by splitting its 'total_text' on whitespace.
corpus = [_doc.split() for _doc in data['total_text'].values]

# Build the vocabulary from the tokenised corpus, then report how many words it holds.
w2v_model.build_vocab(corpus)
words = w2v_model.wv.vocab.keys()

vocab_size = len(words)
print("Vocab size", vocab_size)

# Train the word embeddings on the corpus (epochs=128).
w2v_model.train(corpus, total_examples=len(corpus), epochs=128)
# Save the trained model under modelsDirectory so it can be reloaded without retraining.
w2v_model.save(modelsDirectory+'w2v.bin')

Vocab size 346928


In [0]:
# Load the Word2Vec model from modelsDirectory + 'w2v.bin', the path the training cell saves to.
model_w2v = gensim.models.word2vec.Word2Vec.load(modelsDirectory+'w2v.bin') # pre-trained model read from disk

In [0]:
# Sanity-check the embeddings: print the nearest neighbours of a few familiar terms,
# with a row of asterisks between consecutive reports.
_probe_terms = ["python", "java", "pandas", "machine", "dictionary"]
for _position, _term in enumerate(_probe_terms):
    if _position > 0:
        print("*"*80)
    print("words similar to " + _term)
    print(model_w2v.wv.most_similar(_term))

words similar to python
[('python3', 0.7309698462486267), ('pythons', 0.6681704521179199), ('cpython', 0.6292317509651184), ('python2', 0.6141325235366821), ('ironpython', 0.5694214701652527), ('27', 0.5649034976959229), ('ipython', 0.55638587474823), ('cython', 0.554825484752655), ('interpreter', 0.5260359048843384), ('perl', 0.5202875137329102)]
********************************************************************************
words similar to java
[('jdk', 0.6340391635894775), ('javafx', 0.5775982141494751), ('scala', 0.5566918849945068), ('j2me', 0.532119631767273), ('jvm', 0.529885470867157), ('groovy', 0.5262027382850647), ('j2se', 0.5213099122047424), ('android', 0.5148731470108032), ('clojure', 0.512725830078125), ('java8', 0.5097906589508057)]
********************************************************************************
words similar to pandas
[('panda', 0.7063027024269104), ('numpy', 0.6673734188079834), ('dataframe', 0.6504719853401184), ('df', 0.6102957725524902), ('groupb