In [None]:
# Mount Google Drive (Colab only): the pickled dataframe and the fastText
# language model used later are read from paths under /content/drive.
from google.colab import drive 
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# PseudoCode and Task List

1. Load the pickled pandas dataframe from 02 notebook 
2. Examine contents to assure everything transferred properly - no issues found
3. Examine the 'Body' feature, which holds the question text, to gain statistical insights and identify the cleaning tasks needed
4.  Get counts of certain special characters to add to features prior to cleaning:
>4a. Question marks
>4b. Text bolding
>4c. Number of paragraphs
>4d. Code examples
5. Cleaning tasks identified (I am skipping over misspellings because these will come out in the wash eventually)
>5a. Remove code snippets
>5b. Remove html formatting
>5c. Expand contractions
>5d. Language detection to make sure everything is in English
>5e. Remove special characters
>5f. Simple Lemmatization
>5g. Named Entity Recognition with Spacy
>5h. POS tagging
>5i. Convert to lowercase
>5j. Remove stop words
6. Export the dataframe with cleaned text and features for further analysis 


# Tasks 1 and 2 Load file and examine contents

In [None]:
'''
Install required modules
'''
!pip install contractions
!pip install fasttext



In [None]:
'''
Import all modules that are needed
'''
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus.reader.wordnet import VERB, NOUN, ADJ, ADV
from nltk import StanfordTagger
import contractions
from contractions import contractions_dict
import fasttext
import collections
import unicodedata
import textwrap
import string
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm

In [None]:
'''
Download the nltk resources requested below: the averaged perceptron POS tagger,
the punkt tokenizer models, the stop word lists and WordNet.
Each call reports whether the package is already up to date.
'''
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
'''
1. Load the pickled pandas dataframe 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 24353 entries, 0 to 24352
Data columns (total 40 columns):
 #   Column               Non-Null Count  Dtype          
---  ------               --------------  -----          
 0   Id                   24353 non-null  int64          
 1   PostTypeId           24353 non-null  int64          
 2   CreationDate         24353 non-null  datetime64[ns] 
 3   Score                24353 non-null  int64          
 4   ViewCount            24353 non-null  int64          
 5   Body                 24353 non-null  object         
 6   OwnerUserId          24238 non-null  object         
 7   LastActivityDate     24353 non-null  datetime64[ns] 
 8   Title                24353 non-null  object         
 9   Tags                 24353 non-null  object         
 10  AnswerCount          24353 non-null  int64          
 11  CommentCount         24353 non-null  int64          
 12  FavoriteCount        6708 non-null   object         
 13  ClosedDate           1416 non-null   datetime64[ns] 
 14  ContentLicense       24353 non-null  object         
 15  Tags_SpaceDelimited  24353 non-null  object         
 16  Tags_Clean           24353 non-null  object         
 17  TagCount             24353 non-null  int64          
 18  Tag1                 24353 non-null  object         
 19  Tag2                 21064 non-null  object         
 20  Tag3                 15037 non-null  object         
 21  Tag4                 8302 non-null   object         
 22  Tag5                 3687 non-null   object         
 23  Tag1_Freq            24353 non-null  int64          
 24  Tag2_Freq            21064 non-null  float64        
 25  Tag3_Freq            15037 non-null  float64        
 26  Tag4_Freq            8302 non-null   float64        
 27  Tag5_Freq            3687 non-null   float64        
 28  Total_Tag_Freqency   24353 non-null  float64        
 29  Tag1_Renamed         24353 non-null  object         
 30  Tag2_Renamed         24353 non-null  object         
 31  Tag3_Renamed         24353 non-null  object         
 32  Tag4_Renamed         24353 non-null  object         
 33  Tag5_Renamed         24353 non-null  object         
 34  TopTag               24353 non-null  category       
 35  Elapsed_Time         24353 non-null  timedelta64[ns]
 36  Elapsed_Time_Int     24353 non-null  int16          
 37  rank                 24353 non-null  int64          
 38  Tag1_Renamed2        24353 non-null  object         
 39  TopTag_Revised       24353 non-null  int64     
'''

# NOTE(review): unpickling can execute arbitrary code stored in the file. This is
# acceptable only because the pickle is the output of the previous (02) notebook;
# never point this path at a file from an untrusted source.
questions_df = pd.read_pickle('/content/drive/My Drive/Capstone2/Data/questions_df_09252020.pickle')

# Print the column summary so it can be compared with the expected schema listed
# in the docstring at the top of this cell.
questions_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24353 entries, 0 to 24352
Data columns (total 40 columns):
 #   Column               Non-Null Count  Dtype          
---  ------               --------------  -----          
 0   Id                   24353 non-null  int64          
 1   PostTypeId           24353 non-null  int64          
 2   CreationDate         24353 non-null  datetime64[ns] 
 3   Score                24353 non-null  int64          
 4   ViewCount            24353 non-null  int64          
 5   Body                 24353 non-null  object         
 6   OwnerUserId          24238 non-null  object         
 7   LastActivityDate     24353 non-null  datetime64[ns] 
 8   Title                24353 non-null  object         
 9   Tags                 24353 non-null  object         
 10  AnswerCount          24353 non-null  int64          
 11  CommentCount         24353 non-null  int64          
 12  FavoriteCount        6708 non-null   object         
 13  ClosedDate      

In [None]:
'''
2. Examine contents: show the first five rows to confirm that the columns and
values were transferred properly
'''
questions_df.head()

Unnamed: 0,Id,PostTypeId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,Tags,AnswerCount,CommentCount,FavoriteCount,ClosedDate,ContentLicense,Tags_SpaceDelimited,Tags_Clean,TagCount,Tag1,Tag2,Tag3,Tag4,Tag5,Tag1_Freq,Tag2_Freq,Tag3_Freq,Tag4_Freq,Tag5_Freq,Total_Tag_Freqency,Tag1_Renamed,Tag2_Renamed,Tag3_Renamed,Tag4_Renamed,Tag5_Renamed,TopTag,Elapsed_Time,Elapsed_Time_Int,rank,Tag1_Renamed2,TopTag_Revised
0,5,1,2014-05-13 23:58:30.457,9,708,<p>I've always been interested in machine lear...,5,2014-05-14 00:36:31.077,How can I do simple machine learning without h...,<machine-learning>,1,1,1.0,2014-05-14 14:40:25.950,CC BY-SA 3.0,machine-learning,[machine-learning],1,machine-learning,,,,,7766,,,,,7766.0,machine-learning,,,,,1,0 days 00:38:00.620000,0,21642,machine-learning,1
1,7,1,2014-05-14 00:11:06.457,4,441,"<p>As a researcher and instructor, I'm looking...",36,2014-05-16 13:45:00.237,What open-source books (or other materials) pr...,<education><open-source>,3,4,1.0,2014-05-14 08:40:54.950,CC BY-SA 3.0,education open-source,"[education, open-source]",2,education,open-source,,,,33,16.0,,,,49.0,Other,Other,,,,0,2 days 13:33:53.780000,2,16792,Other,0
2,14,1,2014-05-14 01:25:59.677,22,1717,<p>I am sure data science as will be discussed...,66,2014-06-20 17:36:05.023,Is Data Science the Same as Data Mining?,<data-mining><definitions>,4,1,6.0,NaT,CC BY-SA 3.0,data-mining definitions,"[data-mining, definitions]",2,data-mining,definitions,,,,1005,31.0,,,,1036.0,data-mining,Other,,,,1,37 days 16:10:05.346000,37,15799,data-mining,1
3,15,1,2014-05-14 01:41:23.110,2,643,<p>In which situations would one system be pre...,64,2014-05-14 01:41:23.110,What are the advantages and disadvantages of S...,<databases>,0,1,,2014-05-14 07:41:49.437,CC BY-SA 3.0,databases,[databases],1,databases,,,,,89,,,,,89.0,Other,,,,,0,0 days 00:00:00,0,21681,Other,0
4,16,1,2014-05-14 01:57:56.880,17,382,"<p>I use <a href=""http://www.csie.ntu.edu.tw/~...",63,2014-05-17 16:24:14.523,Use liblinear on big data for semantic analysis,<machine-learning><bigdata><libsvm>,2,0,,NaT,CC BY-SA 3.0,machine-learning bigdata libsvm,"[machine-learning, bigdata, libsvm]",3,machine-learning,bigdata,libsvm,,,7766,433.0,14.0,,,8213.0,machine-learning,Other,Other,,,1,3 days 14:26:17.643000,3,10411,machine-learning,1


# Task 3: Examine 'Body' Feature 
Examine the 'Body' feature, which holds the question text, to gain statistical insights and identify the cleaning tasks needed

In [None]:
'''
3. Examine the feature 'Body', which holds the question text, and identify the cleaning tasks needed;
first show the Top Tag questions with the highest and lowest rank

Notice that the lower ranked questions are oftentimes shorter, simpler, and specific to one narrow topic, whereas the higher ranked questions
tend to be broader or higher level questions
(correlated with TagCount)

4 of the top 5 ranked questions are about machine-learning

The number of questions asked is variable, as is the length of the question - in some instances there is not really a question but a statement

So a count of question marks in the question body and the length of the question body are good features to add and analyze

Cleaning items include html / xml formatting removal, special character removal, and expanding contractions
Also there are many code snippets and hyperlinks that should probably be removed
Certain acronyms like LDA, PCA, SGD are important to recognize - in a future step, must use POS tagging to identify these
'''
toptagQuestions = questions_df.loc[questions_df["TopTag_Revised"] == 1]
toptagrank = list(zip(toptagQuestions["Id"],toptagQuestions["rank"],toptagQuestions["Tags_SpaceDelimited"],toptagQuestions["Title"],toptagQuestions["Body"]))
# Ascending sort on the rank field (tuple position 1): rank 1 comes first
toptagrank.sort(key=lambda x: x[1],reverse=False)

# One loop body serves both listings; each entry is (banner label, number of banner stars, rows to print).
# The star counts are the ones the two original loops used, so the printed output is unchanged.
for label, star_count, rows in (
    ("HighestRank", 148, toptagrank[:10]),
    ("LowestRank", 143, toptagrank[-10:]),
):
    # `qid` instead of `id`: a loop variable at cell level is a notebook global, and naming it
    # `id` would replace the builtin id() for every later cell.
    for qid, rank_value, tags, title, body in rows:
        print(label, "*" * star_count, '\n')
        print("Question id:", qid)
        print("Rank :", rank_value)
        print("Question Tags\t:", tags)
        print("Question Title\t:", title)
        print("Question Body\t:" '\n')
        print(textwrap.fill(body, 160))
        print("*" * 160, '\n')

HighestRank **************************************************************************************************************************************************** 

Question id: 22
Rank : 1
Question Tags	: data-mining clustering octave k-means categorical-data
Question Title	: K-Means clustering for mixed numeric and categorical data
Question Body	:

<p>My data set contains a number of numeric attributes and one categorical.</p>  <p>Say, <code>NumericAttr1, NumericAttr2, ..., NumericAttrN,
CategoricalAttr</code>, </p>  <p>where <code>CategoricalAttr</code> takes one of three possible values: <code>CategoricalAttrValue1</code>,
<code>CategoricalAttrValue2</code> or <code>CategoricalAttrValue3</code>.</p>  <p>I'm using default k-means clustering algorithm implementation for Octave <a
href="https://blog.west.uni-koblenz.de/2012-07-14/a-working-k-means-code-for-octave/">https://blog.west.uni-koblenz.de/2012-07-14/a-working-k-means-code-for-
octave/</a>. It works with numeric data only.</p>  

In [None]:
'''
3a. Now look at questions with less frequent tags in the same fashion (not on TopTag List)

Additional cleaning items - what to do about misspellings - "bot" used instead of "but", but "bot" is an important word in Data science
'''
btmtagQuestions = questions_df.loc[questions_df["TopTag_Revised"] == 0]

btmtagvcrank = list(zip(btmtagQuestions["Id"],btmtagQuestions["rank"],btmtagQuestions["Tags_SpaceDelimited"],btmtagQuestions["Title"],btmtagQuestions["Body"]))
# Ascending sort on the rank field (tuple position 1): the best ranked question comes first
btmtagvcrank.sort(key=lambda x: x[1],reverse=False)

# One loop body serves both listings; each entry is (banner label, number of banner stars, rows to print).
# The two copies of this loop had drifted apart (the title was followed by a blank line in only
# one of them); both listings now use the same layout, with a blank line after tags and title.
for label, star_count, rows in (
    ("HighestRank", 125, btmtagvcrank[:10]),
    ("LowestRank", 126, btmtagvcrank[-10:]),
):
    # `qid` instead of `id`: a loop variable at cell level is a notebook global, and naming it
    # `id` would replace the builtin id() for every later cell.
    for qid, rank_value, tags, title, body in rows:
        print(label, "*" * star_count, '\n')
        print("Question id:", qid)
        print("Rank:", rank_value)
        print("Question Tags\t:", tags, '\n')
        print("Question Title\t:", title, '\n')
        print("Question Body\t:" '\n')
        print(textwrap.fill(body, 160))
        print("*" * 160, '\n')

HighestRank ***************************************************************************************************************************** 

Question id: 29851
Rank: 105
Question Tags	: data-cleaning preprocessing word-embeddings encoding embeddings 

Question Title	: One Hot Encoding vs Word Embeding - When to choose one or another?
Question Body	:

<p>A colleague of mine is having an interesting situation, he has quite a large set of possibilities for a defined categorical feature (+/- 300 different
values)</p>  <p>The usual data science approach would be to perform a One-Hot Encoding. However, wouldn't it be a bit extreme to perform some One-Hot Encoding
with a dictionary quite large (+/- 300 values) ? Is there any best practice on when to choose Embedding vectors or One-Hot Encoding ?</p>  <hr>  <p>Additional,
information: how would you handle the previous case if new values can be added to the dictionary. Re-training seems the only solution, however with One-Hot
Encoding, the data 

# Task 4 Get counts of certain special characters from body
Get counts of certain special characters to add to features prior to cleaning: 4a. Question marks 4b. Text bolding 4c. Number of paragraphs 4d. Code examples

In [None]:
'''
4. The following insights have been gained in reviewing some of the questions:
there may be a correlation between number of questions asked ("?"), bolding of text <strong>, number of paragraphs <p>, and code examples <code>
in the body of the text; let's get a count of these prior to cleaning and removing them and keep them as features

4a. Starting with count of questions - mean is 1.5 - some have no questions at all; max is 58 wow; looking at it, the high question count comes from
an error log that was provided as part of the question, so this one will probably show up with a high character and word count in the body, as well
as a hit for a long code example
'''
# Counted on the raw HTML body, so every '?' is included, also those inside links and code / log excerpts
questions_df["NumQuestions"] = questions_df["Body"].map(lambda body: body.count('?'))
#questions_df["NumQuestions"].describe()

# Take the maximum from the data instead of hard-coding it (it was 58 when this was written),
# so the example below is still found if the input data changes
max_num_questions = questions_df["NumQuestions"].max()
maxquestion = questions_df.loc[questions_df["NumQuestions"] == max_num_questions]

maxquestionlist = list(zip(maxquestion["Id"],maxquestion["Tags_SpaceDelimited"],maxquestion["Title"],maxquestion["Body"]))

# `qid` instead of `id`: a cell-level loop variable is a notebook global and would replace the builtin id()
for qid, tags, title, body in maxquestionlist:
    print("Question id:", qid)
    print("Question Tags\t:", tags, '\n')
    print("Question Title\t:", title, '\n')
    print("Question Body\t:" '\n')
    print(textwrap.fill(body, 160))

Question id: 74666
Question Tags	: tensorflow gpu generative-models 

Question Title	: Is it possible to train stylegan2 with a custom dataset using a graphics card that only has 6GB of VRAM (GeForce GTX 1660)? 

Question Body	:

<p>I'm attempting to train <a href="https://github.com/NVlabs/stylegan2" rel="nofollow noreferrer">stylegan2</a> using a custom dataset, but no matter what
settings I use I see the same error:</p>  <pre><code>2020-05-22 11:15:05.261933: W tensorflow/core/common_runtime/bfc_allocator.cc:305] Garbage collection:
deallocate free memory regions (i.e., allocations) so that we can re-allocate a larger region to avoid OOM due to memory fragmentation. If you see this message
frequently, you are running near the threshold of the available device memory and re-allocation may incur great performance overhead. You may try smaller batch
sizes to observe the performance impact. Set TF_ENABLE_GPU_GARBAGE_COLLECTION=false if you'd like to disable this feature. 2020-05-22 11:1

In [None]:
'''
4b. Next let's get a count of bolded text <strong>
mean is .4 which means that most questions do not have bold text; max is 42 wow;

looking at this one, the question poser has accentuated distinctions in his variable name examples and highlighted his main question at the end;
does use of these bold text items help to predict tags at all? we shall see
'''
# Number of opening <strong> tags in the raw HTML body
questions_df["BodyBoldCount"] = questions_df["Body"].map(lambda body: body.count('<strong>'))
#questions_df["BodyBoldCount"].describe()

# Take the maximum from the data instead of hard-coding it (it was 42 when this was written)
max_bold_count = questions_df["BodyBoldCount"].max()
boldtextquestions = questions_df.loc[questions_df["BodyBoldCount"] == max_bold_count]

boldtextquestionslist = list(zip(boldtextquestions["Id"],boldtextquestions["Tags_SpaceDelimited"],boldtextquestions["Title"],boldtextquestions["Body"]))

# `qid` instead of `id`: a cell-level loop variable is a notebook global and would replace the builtin id()
for qid, tags, title, body in boldtextquestionslist:
    print("Question id:", qid)
    print("Question Tags\t:", tags, '\n')
    print("Question Title\t:", title, '\n')
    print("Question Body\t:" '\n')
    print(textwrap.fill(body, 160))

Question id: 5367
Question Tags	: machine-learning predictive-modeling ranking 

Question Title	: Best way to format data for supervised machine learning ranking predictions 

Question Body	:

<p>I'm fairly new to machine learning, but I'm doing my best to learn as much as possible.</p>  <p>I am curious about how predicting athlete performance
(runners in particular) in a race of a specific starting lineup. For instance, if RunnerA, RunnerB, RunnerC, and RunnerD are all racing a 400 meter race, I want
to best predict whether <strong>RunnerA</strong> will beat <strong>RunnerB</strong> based on past race result information (which I have at my disposal).
However, I have many cases where <strong>RunnerA</strong> has never raced against <strong>RunnerB</strong>; yet I do have data showing <strong>RunnerA</strong>
has beat <strong>RunnerC</strong> in the past, and <strong>RunnerC</strong> has beat <strong>RunnerB</strong> in the past. This logic extends deeper as well.
So, it would seem that

In [None]:
'''
4c. Next let's get a count of the number of paragraphs in each question body
mean is 4, some have no paragraphs whatsoever; most have 2-6 paragraphs, and max is 57 wow;

let's look at examples of zero and of the maximum;
those without any paragraphs are limited to either code or lines without paragraphs;
in the one with 57 the question poser has formatted his code examples
into paragraphs, thus accounting for the high count.
'''
# Counts the literal text '<p>' in the raw HTML body, i.e. opening paragraph tags without attributes
questions_df["ParagraphCount"] = questions_df["Body"].map(lambda body: body.count('<p>'))
#questions_df["ParagraphCount"].describe()

# The two extremes to look at: no paragraphs at all and the maximum, which is taken from the
# data instead of being hard-coded (it was 57 when this was written)
lst = [0, questions_df["ParagraphCount"].max()]
# Descending sort puts the maximum first; head() then keeps five example rows in total
paragraphquestions = questions_df.loc[questions_df["ParagraphCount"].isin(lst)].sort_values(by = "ParagraphCount", ascending=False).head()

paragraphcountlist = list(zip(paragraphquestions["Id"],paragraphquestions["Tags_SpaceDelimited"],paragraphquestions["Title"],paragraphquestions["Body"]))

# `qid` instead of `id`: a cell-level loop variable is a notebook global and would replace the builtin id()
for qid, tags, title, body in paragraphcountlist:
    print("Examples of Paragraph Counts", "*" * 131, '\n')
    print("Question id:", qid)
    print("Question Tags\t:", tags, '\n')
    print("Question Title\t:", title, '\n')
    print("Question Body\t:" '\n')
    print(textwrap.fill(body, 160))
    print("*" * 160, '\n')

Examples of Paragraph Counts *********************************************************************************************************************************** 

Question id: 41795
Question Tags	: cnn 

Question Title	: Value of loss and accuracy does not change over Epochs 

Question Body	:

<p>I am trying to work on a CNN model for churn. Here is my code. NO matter what optimizer I choose, change the learning rate, learning decay, loss function
etc, losses and accuracy do not change over epoch. I am feeding following array as input to model, which are encoded ( label encoded and then CSC) x_train.shape
=  (27999, 1, 500, 10) y_train.shape =  (27999,) x_test.shape =  (57540, 1, 500, 10) y_test.shape =  (57540,)</p>  <p>Original input is a CSV file of shape
(28770155, 11)</p>  <p>Code is as follows:</p>  <h2>label Encoder</h2>  <p>lab=LabelEncoder()</p>  <p>lab1=LabelEncoder()</p>
<p>train_label=train_table.apply(lab.fit_transform)</p>  <p>test_label=test_table.apply(lab1.fit_transfor

In [None]:
'''
4d. Next let's get a count of the number of code examples in the question

mean is 1 code example per question, max is 36 wow; let's look at the 36 ones - there are 2;

yup lots of code examples in those two, but in the first one it looks like the code should have been
formatted differently, since it is a powerbi question....; as we would expect most of the time,
where code examples are given tags are specific to a programming language; this would be something to explore
'''
# Number of opening <code> tags in the raw HTML body
questions_df["CodeCount"] = questions_df["Body"].map(lambda body: body.count('<code>'))
# Left commented out like in the sibling cells: as a statement in the middle of the cell its
# result was computed and then thrown away without being displayed
#questions_df["CodeCount"].describe()

# Take the maximum from the data instead of hard-coding it (it was 36 when this was written)
max_code_count = questions_df["CodeCount"].max()
codeinquestions = questions_df.loc[questions_df["CodeCount"] == max_code_count]
codeinquestionslist = list(zip(codeinquestions["Id"],codeinquestions["Tags_SpaceDelimited"],codeinquestions["Title"],codeinquestions["Body"]))

# `qid` instead of `id`: a cell-level loop variable is a notebook global and would replace the builtin id()
for qid, tags, title, body in codeinquestionslist:
    print("Examples of High Code Counts", "*" * 131, '\n')
    print("Question id:", qid)
    print("Question Tags\t:", tags, '\n')
    print("Question Title\t:", title, '\n')
    print("Question Body\t:" '\n')
    print(textwrap.fill(body, 160))
    print("*" * 160, '\n')

Examples of High Code Counts *********************************************************************************************************************************** 

Question id: 54035
Question Tags	: powerbi 

Question Title	: (SOLVED) Power BI, Page level filter not working with many to one relation 

Question Body	:

<p>EDIT: Solution found.</p>  <p>Turns out the relations were set to <code>Cross filter direction: Single</code> on all relations (and the one from <code>Month
table.Month</code> to <code>A.Month</code> was not set to active).</p>  <p>Setting <code>Cross filter direction: Both</code> fixed the problem.</p>  <hr>  <p>I
have four tables <code>A</code>, <code>B</code>, <code>Month table</code> and <code>Quarter table</code></p>  <p><code>A</code> has a column
<code>A.Month</code>, and many other columns with irrelevant data</p>  <p><code>B</code> has a column <code>B.Quarter</code>, and many other columns with
irrelevant data</p>  <p><code>Month table</code> has columns <code

# Task 5 Clean Body Text - Misspellings Will Come Out in the Wash
>5a. Remove code snippets
>5b. Remove html formatting
>5c. Expand contractions
>5d. Language detection to make sure everything is in English
>5e. Remove special characters
>5f. Simple Lemmatization
>5g. Named Entity Recognition with Spacy
>5h. POS tagging
>5i. Convert to lowercase
>5j. Remove stop words

In [None]:
'''
5. Now we are ready to clean; from the examples above we see that the following cleaning tasks are required:
5a. Remove code snippets using Beautiful Soup
'''
# Parse every raw HTML body with the lxml parser; the parsed trees are stored in the "Soup" column
questions_df["Soup"] = [BeautifulSoup(text,'lxml') for text in questions_df["Body"]]

def remove_code(soup):
    """Replace every <code> element of a parsed question body with an empty string.

    Parameters
    ----------
    soup : parsed document offering ``find_all`` (a BeautifulSoup tree here)

    Returns
    -------
    The same ``soup`` object. The tree is modified in place, so the caller's
    object no longer contains the <code> elements afterwards.
    """
    for code_tag in soup.find_all('code'):
        # replace_with is the current bs4 method name; replaceWith is the legacy
        # Beautiful Soup 3 spelling and is deprecated
        code_tag.replace_with('')
    return soup
    
# remove_code returns the object it was given, so "NoCode" holds the very same
# (now code-free) trees as the "Soup" column
questions_df['NoCode'] = questions_df["Soup"].apply(remove_code)

# QC of result with question identified above

codeinquestions = questions_df.loc[questions_df["Id"] == 55690]

codeinquestionslist = list(zip(codeinquestions["Id"],codeinquestions["Tags_SpaceDelimited"],codeinquestions["Title"],codeinquestions["Body"],codeinquestions["NoCode"]))

# `qid` instead of `id`: a cell-level loop variable is a notebook global and would replace the builtin id()
for qid, tags, title, body, no_code in codeinquestionslist:
    print("Examples of High Code Counts", "*" * 131, '\n')
    print("Question id:", qid)
    print("Question Tags\t:", tags, '\n')
    print("Question Title\t:", title, '\n')
    print("Question Body\t:" '\n')
    print(textwrap.fill(body, 160))
    print("*" * 160, '\n')
    print("Question Body WO Code\t:", no_code, '\n')


Examples of High Code Counts *********************************************************************************************************************************** 

Question id: 55690
Question Tags	: python dataset bigdata numpy 

Question Title	: How to do numpy matmul broadcasting between two numpy tensors? 

Question Body	:

<p>I have the Pauli matrices which are (2x2) and complex</p>  <pre class="lang-py prettyprint-override"><code>II = np.identity(2, dtype=complex) X =
np.array([[0, 1], [1, 0]], dtype=complex) Y = np.array([[0, -1j], [1j, 0]], dtype=complex) Z = np.array([[1, 0], [0, -1]], dtype=complex) </code></pre>  <p>and
a <code>depolarizing_error</code> function which takes in a normally distributed random number <code>param</code>, generated by
<code>np.random.normal(noise_mean, noise_sd)</code></p>  <pre class="lang-py prettyprint-override"><code>def depolarizing_error(param):     XYZ =
np.sqrt(param/3)*np.array([X, Y, Z])     return np.array([np.sqrt(1-param)*II, XYZ[0], XY

In [None]:
'''
>5b. Remove html formatting

Starting with stripping html formatting - using Beautiful Soup -  results look good!
'''
# Must first convert results from above back to text string

def convert_string(soup):
    """Serialize a parsed question body back into an HTML string.

    Parameters
    ----------
    soup : parsed document offering ``find_all`` (a BeautifulSoup tree here)

    Returns
    -------
    str
        The markup of the first tag found in the tree (presumably the <html> root
        that the lxml parser adds - TODO confirm), or "" when the tree holds no tag.
    """
    first_tags = soup.find_all(limit=1)
    if not first_tags:
        # Without this guard the function fell through and returned None, which the
        # following BeautifulSoup(text, 'lxml') call cannot parse
        return ""
    return str(first_tags[0])

# Serialize the code-free trees back to HTML strings
questions_df["NoCodeString"] = questions_df["NoCode"].apply(convert_string)

#Then get text: re-parse each string and keep only the text content, dropping all markup
questions_df["BodyText"] = [BeautifulSoup(text,'lxml').get_text() for text in questions_df["NoCodeString"]]

# QC with the same question that was used to check the code removal
codeinquestions = questions_df.loc[questions_df["Id"] == 55690]

codeinquestionslist = list(zip(codeinquestions["Id"],codeinquestions["Tags_SpaceDelimited"],codeinquestions["Title"],codeinquestions["Body"],codeinquestions["BodyText"]))

# `qid` instead of `id`: a cell-level loop variable is a notebook global and would replace the builtin id()
for qid, tags, title, body, body_text in codeinquestionslist:
    print("Examples of High Code Counts", "*" * 131, '\n')
    print("Question id:", qid)
    print("Question Tags\t:", tags, '\n')
    print("Question Title\t:", title, '\n')
    print("Question Body\t:" '\n')
    print(textwrap.fill(body, 160))
    print("*" * 160, '\n')
    print("Question Body WO Code\t:" '\n')
    print(textwrap.fill(body_text, 160))
    print("*" * 160, '\n')

Examples of High Code Counts *********************************************************************************************************************************** 

Question id: 55690
Question Tags	: python dataset bigdata numpy 

Question Title	: How to do numpy matmul broadcasting between two numpy tensors? 

Question Body	:

<p>I have the Pauli matrices which are (2x2) and complex</p>  <pre class="lang-py prettyprint-override"><code>II = np.identity(2, dtype=complex) X =
np.array([[0, 1], [1, 0]], dtype=complex) Y = np.array([[0, -1j], [1j, 0]], dtype=complex) Z = np.array([[1, 0], [0, -1]], dtype=complex) </code></pre>  <p>and
a <code>depolarizing_error</code> function which takes in a normally distributed random number <code>param</code>, generated by
<code>np.random.normal(noise_mean, noise_sd)</code></p>  <pre class="lang-py prettyprint-override"><code>def depolarizing_error(param):     XYZ =
np.sqrt(param/3)*np.array([X, Y, Z])     return np.array([np.sqrt(1-param)*II, XYZ[0], XY

In [None]:
'''
5c. Expand contractions with the contractions module: every body text is split on whitespace,
each word is passed through contractions.fix, and the words are joined again with single spaces
'''
questions_df['bodytext_expanded'] = questions_df['BodyText'].apply(
    lambda body_text: " ".join([contractions.fix(token) for token in body_text.split()])
)

# Show one question before and after the expansion
before_sample = questions_df['BodyText'][1]
after_sample = questions_df['bodytext_expanded'][1]

print("Before: ", textwrap.fill(before_sample, 160))
print('\n')
print("After: ", textwrap.fill(after_sample, 160))

Before:  As a researcher and instructor, I'm looking for open-source books (or similar materials) that provide a relatively thorough overview of data science from an
applied perspective. To be clear, I'm especially interested in a thorough overview that provides material suitable for a college-level course, not particular
pieces or papers.


After:  As a researcher and instructor, I am looking for open-source books (or similar materials) that provide a relatively thorough overview of data science from an
applied perspective. To be clear, I am especially interested in a thorough overview that provides material suitable for a college-level course, not particular
pieces or papers.


In [None]:
'''
5d. Check to see if all questions are in English, using Facebook's fasttext library and their prebuilt model

A few rows come back as a different language than English (fr,ja,kn); a review of these rows indicates that the
questions are reduced to almost nothing by the removal of the code snippets, which produces erroneous results;
I have found that the removal of the code snippets produces a more consistent result of English, whereas before code removal
other languages were detected where string examples within the question body text were in an alternate language.  See the examples below in
the first idlst
Good catch Facebook!
Bottom line is we do not need to exclude any rows due to a different language being used.
'''
pretrained_model = '/content/drive/My Drive/Capstone2/Data/lid.176.bin'
model = fasttext.load_model(pretrained_model)

# fastText labels look like '__label__en'
label_prefix = '__label__'

langs = []
for sent in questions_df['bodytext_expanded']:
    # predict() returns (labels, probabilities); only the labels are needed
    labels = model.predict(sent)[0]
    if labels:
        top_label = labels[0]
        # Cut off the prefix instead of slicing the tuple's repr at fixed positions:
        # the fixed slice [11:13] truncated three-letter language codes to two letters
        if top_label.startswith(label_prefix):
            top_label = top_label[len(label_prefix):]
        langs.append(top_label)
    else:
        # no prediction: keep the empty code that the fixed slice produced for an empty result
        langs.append('')
questions_df['lang'] = langs

lang_df = questions_df[['Id','bodytext_expanded','lang']].groupby(by='lang').count()
#print(lang_df.head(12))

# Keep the non-English rows in a name: as a bare expression in the middle of the cell
# the result was neither displayed nor stored
non_english_df = questions_df[questions_df['lang'] != 'en']

#idlst = [224,53950,55166,75109]
idlst = [32040,61424,66366]

#lang_sc_questions = questions_df.loc[questions_df["Id"].isin(idlst)]
#lang_sc_questions.head()
#langsc_quest_lst = list(zip(lang_sc_questions["Id"],lang_sc_questions["lang"],lang_sc_questions["Body"],lang_sc_questions["BodyText"],lang_sc_questions['bodytext_expanded']))
          
#for id,l,b,btext,bexp in langsc_quest_lst[0:]:
#    print("Examples of Non-English Language", "*" * 127, '\n')
#    print("Question id\t:",id)
#    print("Language\t:",l)
#    print("Question Body1\t:" '\n')
#    print(textwrap.fill(b, 160)) 
#    print("*" * 160,'\n')
#    print("Question Body2\t:" '\n')
#    print(textwrap.fill(btext, 160)) 
#    print("*" * 160,'\n')
#    print("Question Body3\t:" '\n')
#    print(textwrap.fill(bexp, 160)) 
#    print("*" * 160,'\n')



In [None]:
'''
5e. Remove special characters and punctuation; first let's take a look at what special characters we have in the body text;
then remove all that we need to; 
'''
# Characters that do NOT count as special: ASCII letters, digits and ASCII whitespace
nopunct = "".join((string.ascii_letters, string.digits, string.whitespace))

def stripalphanum(InputString):
    """Return the characters of InputString that are not in `nopunct`, joined by single spaces.

    Despite its name the function keeps the non-alphanumeric characters and drops
    everything else, e.g. "Hi, there!" becomes ", !".
    """
    special_chars = filter(lambda symbol: symbol not in nopunct, InputString)
    return " ".join(special_chars)
      
# Keep just the special characters of each question in their own column.
expanded_text = questions_df['bodytext_expanded']
questions_df['punct_only'] = expanded_text.apply(stripalphanum)

# Spot-check one question: the full text, then the special characters found in it.
sample_row = 2
print("Full Text: ", textwrap.fill(questions_df.loc[sample_row, 'bodytext_expanded'], 160), end='\n\n\n')
print("Special Characters: ", questions_df.loc[sample_row, 'punct_only'])

Full Text:  I am sure data science as will be discussed in this forum has several synonyms or at least related fields where large data is analyzed. My particular question
is in regards to Data Mining. I took a graduate class in Data Mining a few years back. What are the differences between Data Science and Data Mining and in
particular what more would I need to look at to become proficient in Data Mining?


Special Characters:  . . . ?


In [None]:
'''
5e. Task continued - analyzing and removing special characters
Let's get a list of all the Special Characters and Their Frequency Counts and save to a new dataframe

The most common special characters are typical and expected, but also due to the different language examples identified above in a 
handful of the questions, we see accented letters, Chinese and Arabic characters too.

Also note that there are Greek letters, theta, alpha, and beta that are pertinent to data science and that should be left in place also.
I am choosing to remove all of the common characters and leave the exceptions noted in these comments.
'''
# Number of rows shown in the frequency table; also used in the printed heading
# so that the message and the table cannot drift apart.
top_n_specchars = 50

# Flatten every question's special characters into one list.
# `punct_only` separates the characters with a single space purely for display;
# those separators are not special characters, so they are skipped here
# (otherwise the space dominates both the total and the frequency table).
all_specchar = [
    ch
    for punct_str in questions_df['punct_only'].values
    for ch in punct_str
    if ch != ' '
]
my_set = set(all_specchar)
unique_specchar = list(my_set)

specchar_freq = collections.Counter(all_specchar)

# Build the table straight from the (character, count) pairs so the counts stay
# integers and need no string round-trip followed by pd.to_numeric.
specchar_freq_df = pd.DataFrame(
    list(specchar_freq.items()),
    columns=['SpecChar', 'SpecChar_Freq'],
)

# Using this code to make the output clear
print("There are a total of {} special characters in this dataset. \n".format(len(all_specchar)))

print("There are {} unique special characters in this dataset. \n".format(len(unique_specchar)))

print("Here is a list of the top {} special characters sorted by frequency: \n".format(top_n_specchars))
specchar_freq_df.sort_values(by='SpecChar_Freq', ascending=False).head(top_n_specchars)

There are a total of 1263953 special characters this dataset. 

There are 329 unique special characters in this dataset. 

Here is a list of the top 50 special characters sorted by frequency: 



Unnamed: 0,SpecChar,SpecChar_Freq
1,,619827
5,.,149521
0,",",115965
7,),42474
6,(,40843
3,-,39317
4,?,35830
9,:,31775
31,$,27289
19,_,21342


In [None]:
'''
5e. Let's convert the accented characters prior to removing the other special characters

Note that the ASCII encoding step drops every character that has no ASCII equivalent, so the Chinese
characters (and any Greek letters) are removed through this process too; I guess that is ok.
'''
def remove_accented_chars(text, ascii_only=True):
    """Replace accented characters in `text` with their unaccented base characters.

    The text is first decomposed with Unicode NFKD normalization, which splits an
    accented letter into its base letter plus separate combining marks.

    Parameters
    ----------
    text : str
        The text to clean.
    ascii_only : bool, default True
        When True (the original behavior), the decomposed text is encoded to ASCII
        with errors ignored. This removes the combining marks but ALSO every other
        non-ASCII character: Greek letters, CJK characters, curly quotes, currency
        symbols, and so on.
        When False, only the combining marks are removed and all other characters
        (for example Greek theta/alpha/beta) are preserved.

    Returns
    -------
    str
        The cleaned text.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    if ascii_only:
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # Drop only the combining marks (accents), then recompose what is left so that
    # characters NFKD split for other reasons are stored in composed form again.
    without_marks = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    return unicodedata.normalize('NFC', without_marks)

# Store the accent-free version of every question body in its own column.
questions_df['bodytext_noaccents'] = questions_df['bodytext_expanded'].apply(remove_accented_chars)

# Re-inspect the questions previously flagged as non-English (Ids listed in `idlst`).
lang_sc_questions_new = questions_df.loc[questions_df["Id"].isin(idlst)]

langsc_quest_lst_new = list(zip(
    lang_sc_questions_new["Id"],
    lang_sc_questions_new["lang"],
    lang_sc_questions_new["bodytext_expanded"],
    lang_sc_questions_new["bodytext_noaccents"],
))

# The loop variable is deliberately not called `id`: binding that name at cell level
# would shadow the builtin id() for the rest of the kernel session.
for question_id, lang_code, body_text, body_noaccents in langsc_quest_lst_new:
    print("Examples of Non-English Language", "*" * 127, '\n')
    print("Question id:", question_id)
    print("Question Language\t:", lang_code, '\n')
    print("Question Body\t:\n")
    print(textwrap.fill(body_text, 160))
    print("Question Body With Accents Removed\t:\n")
    print(textwrap.fill(body_noaccents, 160))
    print("*" * 160, '\n')


Examples of Non-English Language ******************************************************************************************************************************* 

Question id: 32040
Question Language	: fr 

Question Body	:

R code : OUTPUT :
Question Body With Accents Removed	:

R code : OUTPUT :
**************************************************************************************************************************************************************** 

Examples of Non-English Language ******************************************************************************************************************************* 

Question id: 61424
Question Language	: ja 

Question Body	:

ERROR:
Question Body With Accents Removed	:

ERROR:
**************************************************************************************************************************************************************** 

Examples of Non-English Language *******************************************************************

In [None]:
'''
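5e. Choosing to remove only the most common (ASCII punctuation) special characters. Note that the accent-removal step above already dropped all non-ASCII characters, so no foreign language or Greek characters remain to be left in place here.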
'''
# The ASCII punctuation characters that get stripped from the question text.
punct = '!"#$%&\'()*+,./:;-<=>?@[\\]^_`{|}~'


def strip_spec_char(InputString):
    """Return `InputString` with every character listed in `punct` removed.

    Characters are deleted without inserting a replacement, so text on either
    side of a removed character is joined together (e.g. "non-relational"
    becomes "nonrelational"). Characters not in `punct` are kept unchanged.
    """
    survivors = filter(lambda symbol: symbol not in punct, InputString)
    return "".join(survivors)
    
# Strip the punctuation from the accent-free text into a new column.
accent_free_text = questions_df['bodytext_noaccents']
questions_df['bodytext_nospch'] = accent_free_text.apply(strip_spec_char)
#questions_df.head()

# Spot-check one question before and after punctuation removal.
sample_row = 3
print("Before: ", textwrap.fill(questions_df.loc[sample_row, 'bodytext_noaccents'], 160), end='\n\n\n')
print("After: ", textwrap.fill(questions_df.loc[sample_row, 'bodytext_nospch'], 160))

Before:  In which situations would one system be preferred over the other? What are the relative advantages and disadvantages of relational databases versus non-
relational databases?


After:  In which situations would one system be preferred over the other What are the relative advantages and disadvantages of relational databases versus nonrelational
databases


In [None]:
'''
5f Simple Lemmatizing 
'''
lm = WordNetLemmatizer()


def _lemmatize_as(text, pos):
    """Tokenize `text`, lemmatize every token as WordNet part of speech `pos`, re-join with spaces."""
    return ' '.join(lm.lemmatize(w, pos) for w in word_tokenize(text))


def lem_vrbs(text):
    """Lemmatize every token of `text` as a verb ('v')."""
    return _lemmatize_as(text, 'v')


def lem_nouns(text):
    """Lemmatize every token of `text` as a noun ('n')."""
    return _lemmatize_as(text, 'n')


def lem_adj(text):
    """Lemmatize every token of `text` as an adjective ('a')."""
    return _lemmatize_as(text, 'a')


def lem_adv(text):
    """Lemmatize every token of `text` as an adverb ('r')."""
    return _lemmatize_as(text, 'r')


# Apply the four passes in the same order as before (verb -> noun -> adjective -> adverb),
# chaining them directly instead of parking the intermediates in 'temp', 'temp2', 'temp3'
# columns. The earlier clean-up called drop() without keeping its result, so those
# scratch columns stayed in questions_df and were exported with it.
questions_df['BodyText_Lemma'] = (
    questions_df['bodytext_nospch']
    .apply(lem_vrbs)
    .apply(lem_nouns)
    .apply(lem_adj)
    .apply(lem_adv)
)

print("Before Special Character Removal: ")
print(textwrap.fill(questions_df['bodytext_noaccents'][8], 160))
print('\n')
print("Before Lemmatization: ")
print(textwrap.fill(questions_df['bodytext_nospch'][8], 160))
print('\n')
print("After: ")
print(textwrap.fill(questions_df['BodyText_Lemma'][8], 160))

questions_df.head()

Before Special Character Removal: 
I have a bunch of customer profiles stored in a elasticsearch cluster. These profiles are now used for creation of target groups for our email subscriptions.
Target groups are now formed manually using elasticsearch faceted search capabilities (like get all male customers of age 23 with one car and 3 children). How
could I search for interesting groups automatically - using data science, machine learning, clustering or something else? r programming language seems to be a
good tool for this task, but I can not form a methodology of such group search. One solution is to somehow find the largest clusters of customers and use them
as target groups, so the question is: How can I automatically choose largest clusters of similar customers (similar by parameters that I do not know at this
moment)? For example: my program will connect to elasticsearch, offload customer data to CSV and using R language script will find that large portion of
customers are male w

Unnamed: 0,Id,PostTypeId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,Tags,AnswerCount,CommentCount,FavoriteCount,ClosedDate,ContentLicense,Tags_SpaceDelimited,Tags_Clean,TagCount,Tag1,Tag2,Tag3,Tag4,Tag5,Tag1_Freq,Tag2_Freq,Tag3_Freq,Tag4_Freq,Tag5_Freq,Total_Tag_Freqency,Tag1_Renamed,Tag2_Renamed,Tag3_Renamed,Tag4_Renamed,Tag5_Renamed,TopTag,Elapsed_Time,Elapsed_Time_Int,rank,Tag1_Renamed2,TopTag_Revised,NumQuestions,BodyBoldCount,ParagraphCount,CodeCount,Soup,NoCode,NoCodeString,BodyText,bodytext_expanded,lang,punct_only,bodytext_noaccents,bodytext_nospch,BodyText_Lemma
0,5,1,2014-05-13 23:58:30.457,9,708,<p>I've always been interested in machine lear...,5,2014-05-14 00:36:31.077,How can I do simple machine learning without h...,<machine-learning>,1,1,1,2014-05-14 14:40:25.950,CC BY-SA 3.0,machine-learning,[machine-learning],1,machine-learning,,,,,7766,,,,,7766.0,machine-learning,,,,,1,0 days 00:38:00.620000,0,21642,machine-learning,1,2,0,3,0,[[[<p>I've always been interested in machine l...,[[[<p>I've always been interested in machine l...,<html><body><p>I've always been interested in ...,I've always been interested in machine learnin...,I have always been interested in machine learn...,en,", "" "" - - ? , "" "" , , , , , , . , , ?",I have always been interested in machine learn...,I have always been interested in machine learn...,I have always be interest in machine learn but...
1,7,1,2014-05-14 00:11:06.457,4,441,"<p>As a researcher and instructor, I'm looking...",36,2014-05-16 13:45:00.237,What open-source books (or other materials) pr...,<education><open-source>,3,4,1,2014-05-14 08:40:54.950,CC BY-SA 3.0,education open-source,"[education, open-source]",2,education,open-source,,,,33,16.0,,,,49.0,Other,Other,,,,0,2 days 13:33:53.780000,2,16792,Other,0,0,0,1,0,"[[[<p>As a researcher and instructor, I'm look...","[[[<p>As a researcher and instructor, I'm look...","<html><body><p>As a researcher and instructor,...","As a researcher and instructor, I'm looking fo...","As a researcher and instructor, I am looking f...",en,", - ( ) . , - , .","As a researcher and instructor, I am looking f...",As a researcher and instructor I am looking fo...,As a researcher and instructor I be look for o...
2,14,1,2014-05-14 01:25:59.677,22,1717,<p>I am sure data science as will be discussed...,66,2014-06-20 17:36:05.023,Is Data Science the Same as Data Mining?,<data-mining><definitions>,4,1,6,NaT,CC BY-SA 3.0,data-mining definitions,"[data-mining, definitions]",2,data-mining,definitions,,,,1005,31.0,,,,1036.0,data-mining,Other,,,,1,37 days 16:10:05.346000,37,15799,data-mining,1,1,0,2,0,[[[<p>I am sure data science as will be discus...,[[[<p>I am sure data science as will be discus...,<html><body><p>I am sure data science as will ...,I am sure data science as will be discussed in...,I am sure data science as will be discussed in...,en,. . . ?,I am sure data science as will be discussed in...,I am sure data science as will be discussed in...,I be sure data science a will be discus in thi...
3,15,1,2014-05-14 01:41:23.110,2,643,<p>In which situations would one system be pre...,64,2014-05-14 01:41:23.110,What are the advantages and disadvantages of S...,<databases>,0,1,,2014-05-14 07:41:49.437,CC BY-SA 3.0,databases,[databases],1,databases,,,,,89,,,,,89.0,Other,,,,,0,0 days 00:00:00,0,21681,Other,0,2,0,1,0,[[[<p>In which situations would one system be ...,[[[<p>In which situations would one system be ...,<html><body><p>In which situations would one s...,In which situations would one system be prefer...,In which situations would one system be prefer...,en,? - ?,In which situations would one system be prefer...,In which situations would one system be prefer...,In which situation would one system be prefer ...
4,16,1,2014-05-14 01:57:56.880,17,382,"<p>I use <a href=""http://www.csie.ntu.edu.tw/~...",63,2014-05-17 16:24:14.523,Use liblinear on big data for semantic analysis,<machine-learning><bigdata><libsvm>,2,0,,NaT,CC BY-SA 3.0,machine-learning bigdata libsvm,"[machine-learning, bigdata, libsvm]",3,machine-learning,bigdata,libsvm,,,7766,433.0,14.0,,,8213.0,machine-learning,Other,Other,,,1,3 days 14:26:17.643000,3,10411,machine-learning,1,2,6,2,0,"[[[<p>I use <a href=""http://www.csie.ntu.edu.t...","[[[<p>I use <a href=""http://www.csie.ntu.edu.t...","<html><body><p>I use <a href=""http://www.csie....",I use Libsvm to train data and predict classif...,I use Libsvm to train data and predict classif...,en,". - , - . , , . . ? ?",I use Libsvm to train data and predict classif...,I use Libsvm to train data and predict classif...,I use Libsvm to train data and predict classif...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24348,75142,1,2020-05-30 22:38:28.097,0,10,<p>I keep hearing that machine learning is jus...,96422,2020-05-30 22:38:28.097,Can all known ML algorithms be written as a se...,<linear-algebra><matrix>,0,0,,NaT,CC BY-SA 4.0,linear-algebra matrix,"[linear-algebra, matrix]",2,linear-algebra,matrix,,,,51,42.0,,,,93.0,Other,Other,,,,0,0 days 00:00:00,0,20908,Other,0,6,0,6,0,[[[<p>I keep hearing that machine learning is ...,[[[<p>I keep hearing that machine learning is ...,<html><body><p>I keep hearing that machine lea...,I keep hearing that machine learning is just l...,I keep hearing that machine learning is just l...,en,". ( ? ) , , - , , , , , , ? , . . , , , ? ? ? ?",I keep hearing that machine learning is just l...,I keep hearing that machine learning is just l...,I keep hear that machine learn be just linear ...
24349,75143,1,2020-05-30 22:41:01.187,0,5,<p>I am currently a few months into computer v...,97950,2020-05-30 22:41:01.187,What Laptop should I use?,<gpu><hardware>,0,0,,NaT,CC BY-SA 4.0,gpu hardware,"[gpu, hardware]",2,gpu,hardware,,,,116,22.0,,,,138.0,Other,Other,,,,0,0 days 00:00:00,0,21049,Other,0,2,0,12,0,[[[<p>I am currently a few months into compute...,[[[<p>I am currently a few months into compute...,<html><body><p>I am currently a few months int...,I am currently a few months into computer visi...,I am currently a few months into computer visi...,en,"/ . , . . . ? , ? . . : , € "" . . . . . . . . !",I am currently a few months into computer visi...,I am currently a few months into computer visi...,I be currently a few month into computer visio...
24350,75144,1,2020-05-30 23:50:57.707,0,3,<p>Here's the code in question. </p>\n\n<p><a ...,54721,2020-05-30 23:50:57.707,Why does the BERT NSP head linear layer have t...,<machine-learning><bert><transformer>,0,0,,NaT,CC BY-SA 4.0,machine-learning bert transformer,"[machine-learning, bert, transformer]",3,machine-learning,bert,transformer,,,7766,112.0,76.0,,,7954.0,machine-learning,Other,Other,,,1,0 days 00:00:00,0,15034,machine-learning,1,2,0,3,1,"[[[<p>Here's the code in question. </p>, \n, <...","[[[<p>Here's the code in question. </p>, \n, <...",<html><body><p>Here's the code in question. </...,Here's the code in question. \nhttps://github....,here is the code in question. https://github.c...,en,. : / / . / / / / / / / _ . # ? ?,here is the code in question. https://github.c...,here is the code in question httpsgithubcomhug...,here be the code in question httpsgithubcomhug...
24351,75145,1,2020-05-31 00:18:50.067,0,5,<p>Suppose there is a website and the decision...,39982,2020-05-31 00:35:16.450,Should I update action value functions when th...,<reinforcement-learning><markov-process>,0,0,,NaT,CC BY-SA 4.0,reinforcement-learning markov-process,"[reinforcement-learning, markov-process]",2,reinforcement-learning,markov-process,,,,473,56.0,,,,529.0,reinforcement-learning,Other,,,,1,0 days 00:16:26.383000,0,21044,reinforcement-learning,1,3,0,2,0,[[[<p>Suppose there is a website and the decis...,[[[<p>Suppose there is a website and the decis...,<html><body><p>Suppose there is a website and ...,Suppose there is a website and the decision-ma...,Suppose there is a website and the decision-ma...,en,"- . ( , , ) . , . $ $ ? , $ _ { + } = $ - ? , ...",Suppose there is a website and the decision-ma...,Suppose there is a website and the decisionmak...,Suppose there be a website and the decisionmak...


In [None]:
'''
5g. Perform Named Entity Recognition using Spacy - warning - this process takes a long time to run
not very happy with results of the NER labels for data science terms; will not include as a feature in the analysis
 '''   
# Loading in corpus

nlp = en_core_web_sm.load()


def spacy_label(c):
    """Return the NER label of every entity spaCy finds in text `c`.

    The result is a list with one single-item list per entity, e.g.
    [['GPE'], ['DATE']]; it is empty when no entity is found.
    """
    return [[ent.label_] for ent in nlp(c).ents]


# Run the spaCy pipeline ONCE over all questions, streamed through nlp.pipe, and
# collect three views of the entities from the same parsed document:
#   NER       -> [[entity text, label], ...]
#   NER_text  -> [[entity text], ...]
#   NER_label -> [[label], ...]   (same shape spacy_label() returns)
# The follow-up cell reads the 'NER' column, so it has to be created here for the
# notebook to run top-to-bottom on a fresh kernel.
ner_pairs, ner_texts, ner_labels = [], [], []
for doc in nlp.pipe(questions_df['BodyText_Lemma']):
    entities = doc.ents
    ner_pairs.append([[ent.text, ent.label_] for ent in entities])
    ner_texts.append([[ent.text] for ent in entities])
    ner_labels.append([[ent.label_] for ent in entities])

# Wrap in a Series aligned on the frame's index so each cell holds one list.
questions_df['NER'] = pd.Series(ner_pairs, index=questions_df.index)
questions_df['NER_text'] = pd.Series(ner_texts, index=questions_df.index)
questions_df['NER_label'] = pd.Series(ner_labels, index=questions_df.index)
#questions_df.head()

print("Before: ", textwrap.fill(questions_df['BodyText_Lemma'][4], 160))
print('\n')
print("After: ", questions_df['NER_label'][4])

Before:  I use Libsvm to train data and predict classification on semantic analysis problem But it have a performance issue on largescale data because semantic analysis
concern ndimension problem Last year Liblinear be release and it can solve performance bottleneck But it cost too much memory Is MapReduce the only way to solve
semantic analysis problem on big data Or be there any other method that can improve memory bottleneck on Liblinear


After:  [['GPE'], ['DATE'], ['EVENT'], ['PERSON']]


In [None]:
'''
5g. NER continued, further analysis
'''
# Show the (entity text, label) pairs for two example questions, looked up by
# question Id rather than by a hard-coded row label.
# The entities are computed directly with `nlp` for just these rows, so this cell
# does not depend on an 'NER' column having been created by an earlier run.
for example_id in (55166, 75109):
    lemma_text = questions_df.loc[questions_df['Id'] == example_id, 'BodyText_Lemma'].iloc[0]
    print("Before: ", textwrap.fill(lemma_text, 160))
    print('\n')
    print("After: ", [[ent.text, ent.label_] for ent in nlp(lemma_text).ents])

Before:  How Do I interpret this summary output in R Coefficients


After:  []
Before:  I have one hundred sentence that I would like to study for sentiment analysis The language be Italian Could you please provide a small example on how I could
approach this problem For example use sentence like Many thank


After:  [['one hundred', 'CARDINAL'], ['Italian', 'NORP']]


In [None]:
'''
5g. NER contd; build dataframe with the text, label pair lists and tag to review and analyze further
And filter out cardinal labels and others that are not needed -  future work?
'''

In [None]:
'''
5h. Perform POS tagging only as a way to pull out important nouns
'''

def pos_text(text):
    """Return the tokens of `text` that NLTK tags as nouns, in their original order.

    The text is tokenized with nltk.word_tokenize and tagged with nltk.pos_tag;
    a token is kept when its tag starts with 'NN'.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    nouns = []
    for token, tag in tagged_tokens:
        if tag.startswith('NN'):
            nouns.append(token)
    return nouns

#test = pos_text("My system keeps crashing! his crashed yesterday, ours crashes daily")
#print(test)
# Extract the nouns of every lemmatized question into a list-valued column.
lemma_text = questions_df['BodyText_Lemma']
questions_df['bodytext_nouns'] = lemma_text.apply(pos_text)

# Spot-check one question: lemmatized text, then the nouns pulled from it.
sample_row = 4
print("Before: ", textwrap.fill(questions_df.loc[sample_row, 'BodyText_Lemma'], 160), end='\n\n\n')
print("After: ", questions_df.loc[sample_row, 'bodytext_nouns'])
#for word,word_class in questions_df['bodytext_pos'][4]:
#   print(word + "," + word_class)

Before:  I use Libsvm to train data and predict classification on semantic analysis problem But it have a performance issue on largescale data because semantic analysis
concern ndimension problem Last year Liblinear be release and it can solve performance bottleneck But it cost too much memory Is MapReduce the only way to solve
semantic analysis problem on big data Or be there any other method that can improve memory bottleneck on Liblinear


After:  ['data', 'classification', 'analysis', 'problem', 'performance', 'issue', 'data', 'analysis', 'concern', 'ndimension', 'problem', 'year', 'release', 'performance', 'bottleneck', 'memory', 'MapReduce', 'way', 'analysis', 'problem', 'data', 'method', 'memory', 'bottleneck', 'Liblinear']


In [None]:
'''
5i. Convert to lowercase; do this for bodytext nouns and NER also(?)
'''
# Lower-case the lemmatized text into a new column; the mixed-case column is kept as is.
lemma_text = questions_df['BodyText_Lemma']
questions_df['bodytext_lc'] = lemma_text.str.lower()

# Spot-check one question before and after the conversion.
sample_row = 6
print("Before: ", textwrap.fill(questions_df.loc[sample_row, 'BodyText_Lemma'], 160), end='\n\n\n')
print("After: ", textwrap.fill(questions_df.loc[sample_row, 'bodytext_lc'], 160))

Before:  We create a social network application for eLearning purpose it be an experimental project that we be research on in our lab It have be use in some case study
for a while and the data in our relational DBMS SQL Server 2008 be get big it be a few gigabyte now and the table be highly connect to each other The
performance be still fine but when should we consider other option Is it the matter of performance


After:  we create a social network application for elearning purpose it be an experimental project that we be research on in our lab it have be use in some case study
for a while and the data in our relational dbms sql server 2008 be get big it be a few gigabyte now and the table be highly connect to each other the
performance be still fine but when should we consider other option is it the matter of performance


In [None]:
'''
5j. Remove stop words - using nltk corpus - let's take a look at those english stopwords first
I am removing re since there is a module re that is important; also "r" in case it is in the stopwords, since this is a language
'''
#print(stopwords.words('english'))
# Start from NLTK's English stop-word list, then take back a few tokens that must
# survive: 're' (the regex module) and 'r' (the language), per the note above.
# NOTE(review): 'q' is excluded as well, but the reason is not stated - confirm.
stop_words = set(stopwords.words('english'))
exclude_words = {'re', 'r', 'q'}
new_stop_words = stop_words.difference(exclude_words)


def _drop_stop_words(text):
    """Return `text` without the whitespace-separated tokens found in `new_stop_words`."""
    kept_tokens = []
    for token in text.split():
        if token in new_stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)


questions_df['BodyText_NoStopwords'] = questions_df['bodytext_lc'].apply(_drop_stop_words)

# Spot-check one question at each of the last three stages.
sample_row = 6
print("Before LC Conversion: ", textwrap.fill(questions_df.loc[sample_row, 'BodyText_Lemma'], 160), sep='\n', end='\n\n\n')
print("Before StopWord Removal: ", textwrap.fill(questions_df.loc[sample_row, 'bodytext_lc'], 160), sep='\n', end='\n\n\n')
print("After: ", textwrap.fill(questions_df.loc[sample_row, 'BodyText_NoStopwords'], 160))

Before LC Conversion: 
We create a social network application for eLearning purpose it be an experimental project that we be research on in our lab It have be use in some case study
for a while and the data in our relational DBMS SQL Server 2008 be get big it be a few gigabyte now and the table be highly connect to each other The
performance be still fine but when should we consider other option Is it the matter of performance


Before StopWord Removal: 
we create a social network application for elearning purpose it be an experimental project that we be research on in our lab it have be use in some case study
for a while and the data in our relational dbms sql server 2008 be get big it be a few gigabyte now and the table be highly connect to each other the
performance be still fine but when should we consider other option is it the matter of performance


After:  create social network application elearning purpose experimental project research lab use case study data relational dbms s

# Task 6 Export Results

In [None]:
# Final schema check before export: lists every column with its non-null count and dtype.
questions_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24353 entries, 0 to 24352
Data columns (total 63 columns):
 #   Column                Non-Null Count  Dtype          
---  ------                --------------  -----          
 0   Id                    24353 non-null  int64          
 1   PostTypeId            24353 non-null  int64          
 2   CreationDate          24353 non-null  datetime64[ns] 
 3   Score                 24353 non-null  int64          
 4   ViewCount             24353 non-null  int64          
 5   Body                  24353 non-null  object         
 6   OwnerUserId           24238 non-null  object         
 7   LastActivityDate      24353 non-null  datetime64[ns] 
 8   Title                 24353 non-null  object         
 9   Tags                  24353 non-null  object         
 10  AnswerCount           24353 non-null  int64          
 11  CommentCount          24353 non-null  int64          
 12  FavoriteCount         6708 non-null   object         
 13  C

In [None]:
'''
6. We have cleaned the body text and added features - export for further analysis.
Now write out for safekeeping
'''
import sys

# Raise the interpreter's recursion limit before pickling.
# NOTE(review): presumably needed because some columns hold deeply nested objects
# (e.g. the parsed-HTML 'Soup' column) that overflow the default limit - confirm.
#print(sys.getrecursionlimit())
sys.setrecursionlimit(2000)
print(sys.getrecursionlimit())

# Use a context manager so the file handle is closed even if pickle.dump raises.
with open("/content/drive/My Drive/Capstone2/Data/questions_df_ner_results_10282020.pickle","wb") as pickle_out:
    pickle.dump(questions_df, pickle_out, protocol=pickle.HIGHEST_PROTOCOL)

2000
