In [None]:
from google.colab import drive
drive.mount('/content/drive')
project_folder = "./drive/My Drive/csc2515-project/"

Mounted at /content/drive


**Load and Preprocess Data**

In [None]:
import pandas as pd
import os
import numpy as np

questions = pd.read_csv('/content/drive/My Drive/csc2515-project/Questions.csv',  encoding='Latin-1')
questions.head(5)


Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...


In [None]:
answers = pd.read_csv('/content/drive/My Drive/csc2515-project/Answers.csv',  encoding='Latin-1')
answers.head(5)

Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body
0,92,61.0,2008-08-01T14:45:37Z,90,13,"<p><a href=""http://svnbook.red-bean.com/"">Vers..."
1,124,26.0,2008-08-01T16:09:47Z,80,12,<p>I wound up using this. It is a kind of a ha...
2,199,50.0,2008-08-01T19:36:46Z,180,1,<p>I've read somewhere the human eye can't dis...
3,269,91.0,2008-08-01T23:49:57Z,260,4,"<p>Yes, I thought about that, but I soon figur..."
4,307,49.0,2008-08-02T01:49:46Z,260,28,"<p><a href=""http://www.codeproject.com/Article..."


In [None]:
questions.columns=["QId", "QAskerId", "QCreationDate", "QClosedDate", "QScore", "QTitle", "QBody"]
questions.head(5)

Unnamed: 0,QId,QAskerId,QCreationDate,QClosedDate,QScore,QTitle,QBody
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...


In [None]:
#Count number of answers for questions with answers
counts = answers['ParentId'].value_counts()
#Perform an inner join on question id between questions and answers for questions with 7 or more answers
questions_answers = pd.merge(questions,answers[answers['ParentId'].isin(counts.index[counts >=7])], left_on=['QId'], right_on=['ParentId'])
samples = questions_answers.shape[0]
print("Total samples numbers with 7+ answers:",samples)

Total samples numbers with 7+ answers: 120435


In [None]:
#Add Preprocessed Strings to Dataframe
cleanQuestions=[]
cleanAnswers=[]
currQ=questions_answers['QId'][0]
currString=preprocessString(questions_answers['QBody'][0])
for i in range(0,samples):
  if questions_answers['QId'][i]!=currQ:
    currQ=questions_answers['QId'][i]
    currString=preprocessString(questions_answers['QBody'][i])
  cleanQuestions.append(currString)
  cleanAnswers.append(preprocessString(questions_answers['Body'][i]))
questions_answers['CleanQuestion']=cleanQuestions
questions_answers['CleanAnswer']=cleanAnswers
questions_answers.head(10)

In [None]:
# Generate label for each answer, with True indicating the best answer
#labels = questions_answers.groupby(['QId'])['Score'].transform(max) == questions_answers['Score']
questions_answers['Labels'] = questions_answers.groupby(['QId'])['Score'].transform(max) == questions_answers['Score']
questions_answers['Best_Score'] = questions_answers[['Labels']] * 1
print(questions_answers.head(5).to_string())
# 13005

print(questions_answers['QId'].nunique())
print(questions_answers['Best_Score'].value_counts())
# Note that the number of unique True records is more than the number of questions for this part
#questions_answers.to_csv(os.path.join(project_folder,'Questions_Answers.csv'))

**MING FEATURE SELECTION**

In [None]:
#Find number of answers before and after
before=np.zeros((samples,1))
after=np.zeros((samples,1))
#Answers are already sorted by ID/date
before=questions_answers.groupby(['QId']).cumcount()
after=(questions_answers[::-1].groupby(['QId']).cumcount())[::-1]
print(before)
print(after)

0         0
1         1
2         2
3         3
4         4
         ..
120430    2
120431    3
120432    4
120433    5
120434    6
Length: 120435, dtype: int64
0         8
1         7
2         6
3         5
4         4
         ..
120430    4
120431    3
120432    2
120433    1
120434    0
Length: 120435, dtype: int64


In [None]:
#Cosine similarity with question
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
tfidf_vectorizer=TfidfVectorizer()
#Create tf_idf matrix that starts with tfidf vectors for each question, then the answers
tfidf_matrix=tfidf_vectorizer.fit_transform(pd.concat([(questions_answers.groupby('QId', as_index=False).agg('min'))['CleanQuestion'],questions_answers['CleanAnswer']]))
cs_question=np.zeros((samples,1))
currQ=questions_answers['QId'][0]
QIdx=0
QCount=questions_answers["QId"].nunique()
for i in range(samples):
  if questions_answers['QId'][i]!=currQ:
    QIdx+=1
    currQ=questions_answers['QId'][i]
  cs_question[i]=cosine_similarity(tfidf_matrix[QIdx],tfidf_matrix[QCount+i])
print(cs_question)

[[0.14024936]
 [0.16157731]
 [0.14238699]
 ...
 [0.14283855]
 [0.02836862]
 [0.01774712]]


In [None]:
#Average cosine similarity with other questions
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
tfidf_matrix=tfidf_vectorizer.fit_transform(questions_answers['CleanAnswer'])
cs_answers=np.zeros((samples,1))
for i in range(samples):
  sum=0
  count=0
  #Iterate over every answer with the same question id
  for j in questions_answers.index[questions_answers['QId'] == questions_answers['QId'][i]].tolist():
    if questions_answers['Id'][j] != questions_answers['Id'][i]:
      sum+=cosine_similarity(tfidf_matrix[i],tfidf_matrix[j])
      count+=1
  #Average the cosine similarities
  cs_answers[i]=sum/count
print(cs_answers)



[[0.14412298]
 [0.11777672]
 [0.17608415]
 ...
 [0.06607784]
 [0.08609032]
 [0.04912072]]


In [None]:
d={'Before':before, 'After':after, 'Cosine Similarity to Question':np.transpose(cs_question)[0], 'Cosine Similarity to Answers':np.transpose(cs_answers)[0]}
ming_features= pd.DataFrame(data=d)
ming_features.head(10)

Unnamed: 0,Before,After,Cosine Similarity to Question,Cosine Similarity to Answers
0,0,8,0.140249,0.144123
1,1,7,0.161577,0.117777
2,2,6,0.142387,0.176084
3,3,5,0.060784,0.086928
4,4,4,0.192515,0.079836
5,5,3,0.090677,0.111903
6,6,2,0.097741,0.094995
7,7,1,0.16902,0.10406
8,8,0,0.135062,0.118351
9,0,8,0.338867,0.062817


In [None]:
ming_features.to_csv('/content/drive/MyDrive/csc2515-project/feature_extraction_Ming.csv')

**KARTHIK FEATURE SELECTION**

In [None]:
questions_answers.reset_index(drop=True, inplace=True)
questions_answers.shape
questions_answers.to_csv(os.path.join(project_folder, 'feature_extraction_Karthik.csv'))

In [None]:
cumulative_scores_dict=(answers.groupby('OwnerUserId')['Score'].sum()).to_dict()
total_answers_dict=(answers.groupby('OwnerUserId')['Score'].count()).to_dict()
total_questions_dict=(questions.groupby('QAskerId')['QScore'].count()).to_dict()

In [None]:
print(cumulative_scores_dict[1])

232


In [None]:
# User history
# Cumulative scores/answers/questions
def add_cumulative_score(group):
  group['Cumulative Answer Score'] = cumulative_scores_dict[group['OwnerUserId'].iloc[0]]
  return group

questions_answers = questions_answers.groupby('OwnerUserId', sort=False).apply(add_cumulative_score)
questions_answers.head(5)
#df_1['total_per_color']=df_1.groupby('color')['count'].transform('sum')

'''questions_answers = questions_answers.groupby('OwnerUserId')['Score'].transform('sum')

df.reset_index(drop=True, inplace=True)
df.shape
df.to_csv(os.path.join(project_folder, "eg.csv"))'''

#questions_answers['Cumulative Answer Score'] = 

'questions_answers = questions_answers.groupby(\'OwnerUserId\')[\'Score\'].transform(\'sum\')\n\ndf.reset_index(drop=True, inplace=True)\ndf.shape\ndf.to_csv(os.path.join(project_folder, "eg.csv"))'

In [None]:
questions_answers.to_csv(os.path.join(project_folder, 'feature_extraction_Karthik.csv'))

In [None]:
def add_cumulative_answers(group):
  group['Answer Count'] = total_answers_dict[group['OwnerUserId'].iloc[0]]
  return group

questions_answers = questions_answers.groupby('OwnerUserId', sort=False).apply(add_cumulative_answers)

questions_answers.reset_index(drop=True, inplace=True)
questions_answers.shape

(120435, 17)

In [None]:
questions_answers.head(10)

Unnamed: 0,QId,QAskerId,QCreationDate,QClosedDate,QScore,QTitle,QBody,Id,OwnerUserId,CreationDate,ParentId,Score,Body,Labels,Best_Score,Cumulative Answer Score,Answer Count
0,180.0,2089740.0,2008-08-01T18:42:19Z,,53.0,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,199.0,50.0,2008-08-01T19:36:46Z,180.0,1.0,<p>I've read somewhere the human eye can't dis...,False,0.0,111.0,3.0
1,180.0,2089740.0,2008-08-01T18:42:19Z,,53.0,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,529.0,86.0,2008-08-02T18:16:07Z,180.0,3.0,<p>Isn't it also a factor which order you set ...,False,0.0,14.0,5.0
2,180.0,2089740.0,2008-08-01T18:42:19Z,,53.0,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,539.0,157.0,2008-08-02T19:03:52Z,180.0,21.0,"<p>My first thought on this is ""how generate N...",True,1.0,1685.0,8.0
3,180.0,2089740.0,2008-08-01T18:42:19Z,,53.0,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,59760.0,5845.0,2008-09-12T19:00:13Z,180.0,17.0,<p>It would be best to find colors maximally d...,False,0.0,34.0,7.0
4,180.0,2089740.0,2008-08-01T18:42:19Z,,53.0,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,93908.0,16632.0,2008-09-18T16:01:24Z,180.0,7.0,"<p>Some related resources:</p>\n\n<p><a href=""...",False,0.0,165.0,6.0
5,180.0,2089740.0,2008-08-01T18:42:19Z,,53.0,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,143966.0,16582.0,2008-09-27T16:39:09Z,180.0,4.0,<p>Here is some code to allocate RGB colors ev...,False,0.0,16.0,5.0
6,180.0,2089740.0,2008-08-01T18:42:19Z,,53.0,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,1499720.0,115432.0,2009-09-30T18:00:26Z,180.0,0.0,"<p>Last I checked <a href=""http://www.jfree.or...",False,0.0,11.0,8.0
7,180.0,2089740.0,2008-08-01T18:42:19Z,,53.0,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,7815745.0,678455.0,2011-10-19T01:58:23Z,180.0,1.0,<p>I know this an old post but I found it whil...,False,0.0,8.0,3.0
8,180.0,2089740.0,2008-08-01T18:42:19Z,,53.0,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,21634390.0,135862.0,2014-02-07T17:43:13Z,180.0,0.0,"<p>To achieve ""most distinguishable"" we need t...",False,0.0,0.0,1.0
9,260.0,91.0,2008-08-01T23:22:08Z,,49.0,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,269.0,91.0,2008-08-01T23:49:57Z,260.0,4.0,"<p>Yes, I thought about that, but I soon figur...",False,0.0,281.0,27.0


In [None]:
questions_answers.to_csv(os.path.join(project_folder, 'feature_extraction_Karthik.csv'))

In [None]:
def add_cumulative_questions(group):
  if group['OwnerUserId'].iloc[0] in total_questions_dict:
    group['Questions Count'] = total_questions_dict[group['OwnerUserId'].iloc[0]]
  else:
    group['Questions Count']=0
  return group

questions_answers = questions_answers.groupby('OwnerUserId', sort=False).apply(add_cumulative_questions)

questions_answers.reset_index(drop=True, inplace=True)
questions_answers.shape

(120435, 18)

In [None]:
questions_answers.head(10)

Unnamed: 0,QId,QAskerId,QCreationDate,QClosedDate,QScore,QTitle,QBody,Id,OwnerUserId,CreationDate,ParentId,Score,Body,Labels,Best_Score,Cumulative Answer Score,Answer Count,Questions Count
0,180.0,2089740.0,2008-08-01T18:42:19Z,,53.0,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,199.0,50.0,2008-08-01T19:36:46Z,180.0,1.0,<p>I've read somewhere the human eye can't dis...,False,0.0,145.0,22.0,1.0
1,180.0,2089740.0,2008-08-01T18:42:19Z,,53.0,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,529.0,86.0,2008-08-02T18:16:07Z,180.0,3.0,<p>Isn't it also a factor which order you set ...,False,0.0,30.0,14.0,3.0
2,180.0,2089740.0,2008-08-01T18:42:19Z,,53.0,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,539.0,157.0,2008-08-02T19:03:52Z,180.0,21.0,"<p>My first thought on this is ""how generate N...",True,1.0,1759.0,24.0,1.0
3,180.0,2089740.0,2008-08-01T18:42:19Z,,53.0,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,59760.0,5845.0,2008-09-12T19:00:13Z,180.0,17.0,<p>It would be best to find colors maximally d...,False,0.0,45.0,15.0,0.0
4,180.0,2089740.0,2008-08-01T18:42:19Z,,53.0,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,93908.0,16632.0,2008-09-18T16:01:24Z,180.0,7.0,"<p>Some related resources:</p>\n\n<p><a href=""...",False,0.0,456.0,54.0,3.0
5,180.0,2089740.0,2008-08-01T18:42:19Z,,53.0,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,143966.0,16582.0,2008-09-27T16:39:09Z,180.0,4.0,<p>Here is some code to allocate RGB colors ev...,False,0.0,69.0,62.0,6.0
6,180.0,2089740.0,2008-08-01T18:42:19Z,,53.0,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,1499720.0,115432.0,2009-09-30T18:00:26Z,180.0,0.0,"<p>Last I checked <a href=""http://www.jfree.or...",False,0.0,61.0,21.0,0.0
7,180.0,2089740.0,2008-08-01T18:42:19Z,,53.0,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,7815745.0,678455.0,2011-10-19T01:58:23Z,180.0,1.0,<p>I know this an old post but I found it whil...,False,0.0,20.0,6.0,5.0
8,180.0,2089740.0,2008-08-01T18:42:19Z,,53.0,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,21634390.0,135862.0,2014-02-07T17:43:13Z,180.0,0.0,"<p>To achieve ""most distinguishable"" we need t...",False,0.0,84.0,26.0,8.0
9,260.0,91.0,2008-08-01T23:22:08Z,,49.0,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,269.0,91.0,2008-08-01T23:49:57Z,260.0,4.0,"<p>Yes, I thought about that, but I soon figur...",False,0.0,543.0,68.0,35.0


In [None]:
# Average user score (for their answers)
def find_user_average_score(scores, num_answers):
  return lambda row: 0.0 if row[num_answers] == 0 else float(row[scores]/row[num_answers])

questions_answers.loc[:, 'Average Score'] = questions_answers.apply(find_user_average_score('Cumulative Answer Score', 'Answer Count'), axis=1)      

In [None]:
(questions_answers[['QId','Cumulative Answer Score','Answer Count','Questions Count','Average Score']]).to_csv(os.path.join(project_folder, 'feature_extraction_Karthik.csv'))

In [None]:
questions_answers.reset_index(drop=True, inplace=True)
questions_answers.shape

(120435, 17)

In [None]:
questions_answers.to_csv(os.path.join(project_folder, 'feature_extraction_Karthik.csv'))

In [None]:
max = questions_answers.groupby('QId')['Score'].transform('max')

questions_answers['Normalized_Score'] = questions_answers['Score'].div(max)

In [None]:
questions_answers.reset_index(drop=True, inplace=True)
questions_answers.shape

(120435, 18)

In [None]:
print(len(questions_answers[questions_answers['QAskerId'] == 243.0]))

0
(120435, 20)


In [None]:
questions_answers.to_csv(os.path.join(project_folder, 'feature_extraction_Karthik.csv'))

**Tony FEATURE SELECTION**

In [None]:
#Use textfeatures 
!pip install textfeatures
import textfeatures as txf

txf.word_count(questions_answers,"Body","word_cnt")
questions_answers[["Body","word_cnt"]].head()
txf.char_count(questions_answers,"Body","char_cnt")
questions_answers[["Body","char_cnt"]].head()



Unnamed: 0,Body,char_cnt
0,<p>I've read somewhere the human eye can't dis...,1674
1,<p>Isn't it also a factor which order you set ...,275
2,"<p>My first thought on this is ""how generate N...",1308
3,<p>It would be best to find colors maximally d...,370
4,"<p>Some related resources:</p>\n\n<p><a href=""...",467


In [None]:
avg_char_length = np.zeros((samples,1))
avg_char_length = questions_answers['char_cnt']/ questions_answers['word_cnt']
questions_answers['avg_char_length']=avg_char_length
questions_answers.head(5)

Unnamed: 0,QId,QAskerId,QCreationDate,QClosedDate,QScore,QTitle,QBody,Id,OwnerUserId,CreationDate,ParentId,Score,Body,word_cnt,char_cnt,avg_char_length
0,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,199,50.0,2008-08-01T19:36:46Z,180,1,<p>I've read somewhere the human eye can't dis...,279,1674,6.0
1,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,529,86.0,2008-08-02T18:16:07Z,180,3,<p>Isn't it also a factor which order you set ...,52,275,5.288462
2,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,539,157.0,2008-08-02T19:03:52Z,180,21,"<p>My first thought on this is ""how generate N...",190,1308,6.884211
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,59760,5845.0,2008-09-12T19:00:13Z,180,17,<p>It would be best to find colors maximally d...,55,370,6.727273
4,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,93908,16632.0,2008-09-18T16:01:24Z,180,7,"<p>Some related resources:</p>\n\n<p><a href=""...",46,467,10.152174


In [None]:
#preprocessing on code attribute. <code> can only be counted when it's a stand alone phrase
questions_answers['Body'] = questions_answers['Body'].str.replace('<code>','<ccccode>')

In [None]:
#count urls and code sections
import nltk
nltk.download('punkt')
from textblob import TextBlob

url = np.zeros((samples,1))
code = np.zeros((samples,1))
for i in range(samples):
  blob = TextBlob(questions_answers.iloc[i,9])
  url[i] = blob.word_counts['http'] + blob.word_counts['href']
  code[i] = blob.word_counts['ccccode']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
#append sentiment analysis result to dataframe as future training feature
questions_answers['urls']=url
questions_answers['CodeSections']=code
questions_answers.head(10)

Unnamed: 0,QId,QAskerId,QCreationDate,QClosedDate,QScore,QTitle,QBody,Id,OwnerUserId,CreationDate,ParentId,Score,Body,word_cnt,char_cnt,avg_char_length,urls,CodeSections
0,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,199,50.0,2008-08-01T19:36:46Z,180,1,<p>I've read somewhere the human eye can't dis...,279,1674,6.0,0.0,0.0
1,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,529,86.0,2008-08-02T18:16:07Z,180,3,<p>Isn't it also a factor which order you set ...,52,275,5.288462,0.0,0.0
2,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,539,157.0,2008-08-02T19:03:52Z,180,21,"<p>My first thought on this is ""how generate N...",190,1308,6.884211,0.0,0.0
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,59760,5845.0,2008-09-12T19:00:13Z,180,17,<p>It would be best to find colors maximally d...,55,370,6.727273,0.0,0.0
4,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,93908,16632.0,2008-09-18T16:01:24Z,180,7,"<p>Some related resources:</p>\n\n<p><a href=""...",46,467,10.152174,0.0,0.0
5,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,143966,16582.0,2008-09-27T16:39:09Z,180,4,<p>Here is some code to allocate RGB colors ev...,558,2591,4.643369,0.0,0.0
6,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,1499720,115432.0,2009-09-30T18:00:26Z,180,0,"<p>Last I checked <a href=""http://www.jfree.or...",49,312,6.367347,0.0,0.0
7,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,7815745,678455.0,2011-10-19T01:58:23Z,180,1,<p>I know this an old post but I found it whil...,214,969,4.528037,0.0,0.0
8,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,21634390,135862.0,2014-02-07T17:43:13Z,180,0,"<p>To achieve ""most distinguishable"" we need t...",66,433,6.560606,0.0,0.0
9,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,269,91.0,2008-08-01T23:49:57Z,260,4,"<p>Yes, I thought about that, but I soon figur...",159,969,6.09434,0.0,0.0


**Generating features for questions:**

In [None]:
txf.word_count(questions_answers,"QBody","Q_word_cnt")
questions_answers[["QBody","Q_word_cnt"]]
txf.char_count(questions_answers,"QBody","Q_char_cnt")
questions_answers[["QBody","Q_char_cnt"]]

Unnamed: 0,QBody,Q_char_cnt
0,<p>This is something I've pseudo-solved many t...,260
1,<p>This is something I've pseudo-solved many t...,260
2,<p>This is something I've pseudo-solved many t...,260
3,<p>This is something I've pseudo-solved many t...,260
4,<p>This is something I've pseudo-solved many t...,260
...,...,...
120430,<p>I have an object:</p>\n\n<pre><code>var obj...,553
120431,<p>I have an object:</p>\n\n<pre><code>var obj...,553
120432,<p>I have an object:</p>\n\n<pre><code>var obj...,553
120433,<p>I have an object:</p>\n\n<pre><code>var obj...,553


In [None]:
Q_avg_char_length = np.zeros((samples,1))
Q_avg_char_length = questions_answers['Q_char_cnt']/ questions_answers['Q_word_cnt']
questions_answers['Q_avg_char_length']=Q_avg_char_length
questions_answers.head(5)

In [None]:
#preprocessing on code attribute. <code> can only be counted when it's a stand alone phrase
questions_answers['QBody'] = questions_answers['QBody'].str.replace('<code>','<ccccode>')


In [None]:
Q_url = np.zeros((samples,1))
Q_code = np.zeros((samples,1))
for i in range(samples):
  blob = TextBlob(questions_answers.iloc[i,3])
  Q_url[i] = blob.word_counts['http'] + blob.word_counts['href']
  Q_code[i] = blob.word_counts['ccccode']

In [None]:
#append sentiment analysis result to dataframe as future training feature
questions_answers['Q_urls']=Q_url
questions_answers['Q_CodeSections']=Q_code
#questions_answers.head(10)

Preprocessing for NLP features 

In [None]:
#preprocessing Question body using beautiful soup
import re
from bs4 import BeautifulSoup
import string
#Remove urls, html tags, code, and punctuation from strings
def preprocessString(text):
  text=text.lower()
  #Remove HTML Tags and Code
  soup=BeautifulSoup(text)
  for code in soup.find_all('code'):
    code.extract()
  text=soup.get_text()
  #Remove URLS
  text=re.sub('http://\S+|https://\S+', '', text)
  #Remove punctuation
  text=text.translate(str.maketrans('', '', string.punctuation))
  #Join lines together
  text=" ".join(line.strip() for line in text.splitlines())
  return text.strip()

In [None]:
#Use spacy package to calculate readability features
!pip install spacy-readability

Collecting spacy-readability
[?25l  Downloading https://files.pythonhosted.org/packages/68/9e/e8d9cdf0d54fa5fa0c6463bc6d0385c37deb5dc65a4cfe2c612a02a06869/spacy_readability-1.4.1-py3-none-any.whl (49kB)
[K     |██████▋                         | 10kB 19.0MB/s eta 0:00:01[K     |█████████████▏                  | 20kB 25.4MB/s eta 0:00:01[K     |███████████████████▊            | 30kB 30.1MB/s eta 0:00:01[K     |██████████████████████████▎     | 40kB 21.7MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 5.7MB/s 
[?25hCollecting syllapy<1,>=0
  Downloading https://files.pythonhosted.org/packages/11/31/e13c6b0ed7a95f46c3af20df2b995877ea179b88a90ec39122e0621dae08/syllapy-0.7.1-py3-none-any.whl
Collecting ujson<2.0,>=1.35
[?25l  Downloading https://files.pythonhosted.org/packages/16/c4/79f3409bc710559015464e5f49b9879430d8f87498ecdc335899732e5377/ujson-1.35.tar.gz (192kB)
[K     |████████████████████████████████| 194kB 17.4MB/s 
Building wheels for collected packages:

In [None]:
#readability method from spacy

import spacy

from spacy_readability import Readability
nlp = spacy.load('en')
nlp.add_pipe(Readability())


grade = np.zeros((samples,1))
dale_chall = np.zeros((samples,1))
ease = np.zeros((samples,1))


p =np.zeros((samples,1))
s =np.zeros((samples,1))

for x in range(samples):
  answer_text = preprocessString(questions_answers.iloc[x,9])
  docs = nlp(answer_text)
  grade[x] =docs._.flesch_kincaid_grade_level
  dale_chall[x]= docs._.dale_chall
  ease[x]= docs._.flesch_kincaid_reading_ease
  #semantic analysis from textblob
  blob = TextBlob(questions_answers.iloc[x,9])
  p[x] = blob.sentiment.polarity
  s[x] = blob.sentiment.subjectivity

In [None]:

questions_answers['GradeLevel']=grade       #https://en.wikipedia.org/wiki/Readability#The_Flesch_formulas
questions_answers['Dale_chall']=dale_chall  #https://en.wikipedia.org/wiki/Readability#The_Dale%E2%80%93Chall_formula
questions_answers['ReadingEase']=ease

questions_answers['polarity']=p
questions_answers['subjectivity']=s

questions_answers.head(20)

Unnamed: 0,QId,QScore,QTitle,QBody,AId,AOwnerUserID,ACreationDate,ParentId,AScore,ABody,word_cnt,char_cnt,avg_char_length,urls,CodeSections,Q_word_cnt,Q_char_cnt,Q_avg_char_length,Q_urls,Q_CodeSections,GradeLevel,Dale_chall,ReadingEase,polarity,subjectivity
0,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,199,50.0,2008-08-01T19:36:46Z,180,1,<p>I've read somewhere the human eye can't dis...,279,1674,6.0,0.0,2.0,41,260,6.341463,0.0,0.0,3.168889,10.894532,87.911453,-0.132211,0.451375
1,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,529,86.0,2008-08-02T18:16:07Z,180,3,<p>Isn't it also a factor which order you set ...,52,275,5.288462,0.0,0.0,41,260,6.341463,0.0,0.0,2.216136,7.189409,103.664205,0.166667,0.5
2,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,539,157.0,2008-08-02T19:03:52Z,180,21,"<p>My first thought on this is ""how generate N...",190,1308,6.884211,5.0,0.0,41,260,6.341463,0.0,0.0,7.489091,8.396844,71.324545,0.138636,0.416667
3,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,59760,5845.0,2008-09-12T19:00:13Z,180,17,<p>It would be best to find colors maximally d...,55,370,6.727273,0.0,0.0,41,260,6.341463,0.0,0.0,16.804091,12.464864,23.566136,0.1,0.27
4,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,93908,16632.0,2008-09-18T16:01:24Z,180,7,"<p>Some related resources:</p>\n\n<p><a href=""...",46,467,10.152174,4.0,0.0,41,260,6.341463,0.0,0.0,12.963913,11.605512,29.428188,0.233333,0.366667
5,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,143966,16582.0,2008-09-27T16:39:09Z,180,4,<p>Here is some code to allocate RGB colors ev...,558,2591,4.643369,2.0,1.0,41,260,6.341463,0.0,0.0,-2.926297,16.624229,125.361203,0.113333,0.6075
6,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,1499720,115432.0,2009-09-30T18:00:26Z,180,0,"<p>Last I checked <a href=""http://www.jfree.or...",49,312,6.367347,2.0,0.0,41,260,6.341463,0.0,0.0,6.337021,7.101226,77.533333,0.1,0.575
7,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,7815745,678455.0,2011-10-19T01:58:23Z,180,1,<p>I know this an old post but I found it whil...,214,969,4.528037,0.0,1.0,41,260,6.341463,0.0,0.0,-0.48558,12.228749,107.534308,0.025,0.639286
8,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,21634390,135862.0,2014-02-07T17:43:13Z,180,0,"<p>To achieve ""most distinguishable"" we need t...",66,433,6.560606,0.0,1.0,41,260,6.341463,0.0,0.0,11.405152,9.273306,52.477727,0.183333,0.55
9,260,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,269,91.0,2008-08-01T23:49:57Z,260,4,"<p>Yes, I thought about that, but I soon figur...",159,969,6.09434,0.0,0.0,169,1190,7.04142,2.0,0.0,13.247945,8.761942,48.468195,0.088946,0.451701


**Genearting semantic features for Question Textbody**

In [None]:
#readability method from spacy
import spacy

from spacy_readability import Readability
nlp = spacy.load('en')
nlp.add_pipe(Readability())


Q_grade = np.zeros((samples,1))
Q_dale_chall = np.zeros((samples,1))
Q_ease = np.zeros((samples,1))
Q_p =np.zeros((samples,1))
Q_s =np.zeros((samples,1))

for x in range(samples):
  answer_text = preprocessString(questions_answers.iloc[x,3])
  docs = nlp(answer_text)
  Q_grade[x] =docs._.flesch_kincaid_grade_level
  Q_dale_chall[x]= docs._.dale_chall
  Q_ease[x]= docs._.flesch_kincaid_reading_ease
  #semantic analysis from textblob
  blob = TextBlob(questions_answers.iloc[x,9])
  Q_p[x] = blob.sentiment.polarity
  Q_s[x] = blob.sentiment.subjectivity

In [None]:

questions_answers['Q_GradeLevel']=grade       #https://en.wikipedia.org/wiki/Readability#The_Flesch_formulas
questions_answers['Q_Dale_chall']=dale_chall  #https://en.wikipedia.org/wiki/Readability#The_Dale%E2%80%93Chall_formula
questions_answers['Q_ReadingEase']=ease

questions_answers['Q_polarity']=p
questions_answers['Q_subjectivity']=s

questions_answers.head(5)

Unnamed: 0,QId,QScore,QTitle,QBody,AId,AOwnerUserID,ACreationDate,ParentId,AScore,ABody,word_cnt,char_cnt,avg_char_length,urls,CodeSections,Q_word_cnt,Q_char_cnt,Q_avg_char_length,Q_urls,Q_CodeSections,GradeLevel,Dale_chall,ReadingEase,polarity,subjectivity,Q_GradeLevel,Q_Dale_chall,Q_ReadingEase,Q_polarity,Q_subjectivity
0,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,199,50.0,2008-08-01T19:36:46Z,180,1,<p>I've read somewhere the human eye can't dis...,279,1674,6.0,0.0,2.0,41,260,6.341463,0.0,0.0,3.168889,10.894532,87.911453,-0.132211,0.451375,3.168889,10.894532,87.911453,-0.132211,0.451375
1,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,529,86.0,2008-08-02T18:16:07Z,180,3,<p>Isn't it also a factor which order you set ...,52,275,5.288462,0.0,0.0,41,260,6.341463,0.0,0.0,2.216136,7.189409,103.664205,0.166667,0.5,2.216136,7.189409,103.664205,0.166667,0.5
2,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,539,157.0,2008-08-02T19:03:52Z,180,21,"<p>My first thought on this is ""how generate N...",190,1308,6.884211,5.0,0.0,41,260,6.341463,0.0,0.0,7.489091,8.396844,71.324545,0.138636,0.416667,7.489091,8.396844,71.324545,0.138636,0.416667
3,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,59760,5845.0,2008-09-12T19:00:13Z,180,17,<p>It would be best to find colors maximally d...,55,370,6.727273,0.0,0.0,41,260,6.341463,0.0,0.0,16.804091,12.464864,23.566136,0.1,0.27,16.804091,12.464864,23.566136,0.1,0.27
4,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,93908,16632.0,2008-09-18T16:01:24Z,180,7,"<p>Some related resources:</p>\n\n<p><a href=""...",46,467,10.152174,4.0,0.0,41,260,6.341463,0.0,0.0,12.963913,11.605512,29.428188,0.233333,0.366667,12.963913,11.605512,29.428188,0.233333,0.366667


In [None]:
questions_answers.to_csv("feature_extract_Tony_with_Q.csv")
from google.colab import files
files.download("feature_extract_Tony_with_Q.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Feature extraction CSV files are then joined together, called Train.csv, for machine learning model trainings in Model_Training.ipynb**