# This is a notebook for Stack Overflow preprocessing

> Steps: 
1. Explore the data (length of posts, words, ...)

# Initialize the connection

In [29]:
import os
from google.cloud import bigquery
from google.cloud import bigquery_storage
from google.oauth2 import service_account
from google.cloud.bigquery import magics
import pandas as pd
import matplotlib.pyplot as plt

In [30]:
# Point the Google client libraries at the service-account key file; the path
# has no directory part, so it is resolved against the working directory.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'key.json'
# Create one client with no explicit project argument. The original cell first
# built Client(project='bigquery-public-data') and rebound the name on the next
# line without using it, so that extra client is no longer created.
bigquery_client = bigquery.Client() #stackoverflow

# Query

In [31]:
# SQL sent to BigQuery: id, title, body and tags of every row in the public
# posts_answers table whose creation date is after 2021-01-20.
# NOTE(review): title and tags print as None for every row shown in this cell's
# output — presumably they are not populated for answers; confirm.
QUERY = """
Select id, title, body, tags
FROM `bigquery-public-data.stackoverflow.posts_answers`
WHERE CAST(creation_date as DATE) > '2021-01-20'
    """

# Submit the query, then load its result into a DataFrame.
query_job = bigquery_client.query(QUERY)
data = query_job.to_dataframe()
# Print the plain-text repr of the result.
print(data)

              id title                                               body  \
0       65819402  None  <p>It happened to me when I tried to load a fu...   
1       65819404  None  <p>If you're on Ubuntu 18.04, you can use <cod...   
2       65819406  None  <p>I think I managed to reject the subscriptio...   
3       65819420  None  <p>This task is honestly a horrible example of...   
4       65819422  None  <p>In Silverstripe 4 the Elemental module is n...   
...          ...   ...                                                ...   
241144  66043363  None  <p>If you do it this way I think it will be so...   
241145  66062888  None  <p>we can use this for activating oracle 10 g ...   
241146  66062996  None  <p>Like this?</p>\n<p><a href="https://i.stack...   
241147  66124731  None  <p>any one try this</p>\n<pre class="lang-js p...   
241148  66153004  None  <pre><code>&lt;meta property='og:title' conten...   

        tags  
0       None  
1       None  
2       None  
3       None  


# IMPORT THE LIBRARIES FOR PREPROCESSING

In [32]:
!pip install gensim
!pip install python-Levenshtein
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import gensim



# TOKENIZE

In [33]:
# The 'body' column of the query result; the tokenizer and the length
# statistics in this notebook both read from it.
body = data['body']

def tokenize_to_words(sentences):
    """Lazily turn each item of ``sentences`` into a list of word tokens.

    Each item is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``. One token list is
    yielded per input item, in input order.
    """
    for text in map(str, sentences):
        yield gensim.utils.simple_preprocess(text, deacc=True)
        
# Drain the tokenizer generator so the token lists can be indexed and reused.
tokenized_words = [*tokenize_to_words(body)]

# STEMMING

In [34]:
# Map every token to its Porter stem, keeping one list of stems per entry of
# tokenized_words.
stemmer = PorterStemmer()
stemmed = [list(map(stemmer.stem, tokens)) for tokens in tokenized_words]
# Sanity check: stems of the first ten entries.
print(stemmed[:10])

[['it', 'happen', 'to', 'me', 'when', 'tri', 'to', 'load', 'fullscreen', 'ad', 'with', 'imag', 'creativ', 'that', 'exceed', 'the', 'size', 'avail', 'to', 'show', 'them', 'in', 'the', 'app'], ['if', 'you', 're', 'on', 'ubuntu', 'you', 'can', 'use', 'code', 'sudo', 'apt', 'instal', 'nvidia', 'cuda', 'toolkit', 'code', 'the', 'version', 'of', 'cuda', 'in', 'that', 'packag', 'as', 'of', 'januari', 'is', 'onc', 'you', 've', 'run', 'that', 'you', 'can', 'confirm', 'that', 'it', 'is', 'inde', 'with', 'code', 'nvcc', 'version', 'code'], ['think', 'manag', 'to', 'reject', 'the', 'subscript', 'by', 'just', 'return', 'null', 'in', 'code', 'presend', 'code', 'it', 'will', 'ignor', 'the', 'subscript', 'messag', 'instead', 'of', 'throw', 'error', 'you', 'can', 'also', 'notifi', 'the', 'user', 'by', 'send', 'notif', 'through', 'socket', 'to', 'the', 'user', 'pre', 'code', 'public', 'chatinterceptor', 'resourcebundl', 'resourcebundl', 'channeluserrepo', 'lazi', 'socket', 'thi', 'resourcebundl', 'resou

# LENGTH OF BODY

In [35]:
# Length in characters of each raw body string, then the two extremes.
body_lengths = body.str.len()
print("minimum body length: ", body_lengths.min())
print("maximum body length: ", body_lengths.max())

minimum body length:  37
maximum body length:  42297


# REMOVE STOP WORDS

In [36]:
# Fetch NLTK's 'stopwords' package (the log below reports it as already
# up to date on this machine).
nltk.download('stopwords')
# English stop words; read by remove_stopwords when it filters tokens.
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/fenypatel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [37]:
def remove_stopwords(texts):
    """Drop the words listed in ``stop_words`` from every document in ``texts``.

    Args:
        texts: iterable of documents. A document is either an already
            tokenized sequence of words (list or tuple) or any other object,
            which is converted with ``str()`` and tokenized with
            ``gensim.utils.simple_preprocess``.

    Returns:
        A list holding one list of kept tokens per document, in input order.

    Token sequences are filtered as given. The earlier version turned each
    token list into its ``str()`` repr and tokenized that text again, which
    repeated the tokenizer's work for every document.
    """
    # Build the lookup set once per call so each membership test is a hash
    # lookup instead of a scan.
    stop_set = set(stop_words)
    cleaned = []
    for doc in texts:
        if isinstance(doc, (list, tuple)):
            tokens = doc
        else:
            tokens = gensim.utils.simple_preprocess(str(doc))
        cleaned.append([word for word in tokens if word not in stop_set])
    return cleaned

# Filter the stemmed token lists.
# NOTE(review): this runs after stemming, and the printed sample still contains
# stems such as 'thi' and 'onc' — presumably stemmed stop words that no longer
# match the stop-word list. Consider filtering before stemming; TODO confirm.
removed_stop = remove_stopwords(stemmed)
# Show the first three filtered documents.
print(removed_stop[0:3])

[['happen', 'tri', 'load', 'fullscreen', 'ad', 'imag', 'creativ', 'exceed', 'size', 'avail', 'show', 'app'], ['ubuntu', 'use', 'code', 'sudo', 'apt', 'instal', 'nvidia', 'cuda', 'toolkit', 'code', 'version', 'cuda', 'packag', 'januari', 'onc', 'run', 'confirm', 'inde', 'code', 'nvcc', 'version', 'code'], ['think', 'manag', 'reject', 'subscript', 'return', 'null', 'code', 'presend', 'code', 'ignor', 'subscript', 'messag', 'instead', 'throw', 'error', 'also', 'notifi', 'user', 'send', 'notif', 'socket', 'user', 'pre', 'code', 'public', 'chatinterceptor', 'resourcebundl', 'resourcebundl', 'channeluserrepo', 'lazi', 'socket', 'thi', 'resourcebundl', 'resourcebundl', 'thi', 'channeluserrepo', 'channeluserrepo', 'thi', 'socket', 'socket', 'overrid', 'public', 'messag', 'lt', 'gt', 'presend', 'nonnul', 'messag', 'lt', 'gt', 'messag', 'nonnul', 'messagechannel', 'channel', 'headeraccessor', 'wrap', 'messag', 'stompcommand', 'subscrib', 'equal', 'headeraccessor', 'getcommand', 'checkisban', 'he

# ADD PROCESSED WORDS TO DATAFRAME

In [38]:
# Wrap the filtered token lists in a one-column frame, one row per document,
# indexed 0..n-1 like the row-by-row version was.
# The frame is built in a single constructor call: assigning rows one at a time
# with .loc enlarges the frame on every iteration, which is very slow for the
# ~241k documents returned by the query. The explicit object-dtype Series keeps
# each token list as a single cell value.
tokenized_frame = pd.DataFrame(
    {'tokenized': pd.Series(removed_stop, dtype=object)},
    columns=['tokenized'],
)

In [39]:
# Put the token column next to the original query columns (axis=1 concatenates
# column-wise).
final_data = pd.concat([data, tokenized_frame], axis=1)
# Preview the first rows of the combined frame.
print(final_data.head())

         id title                                               body  tags  \
0  65819402  None  <p>It happened to me when I tried to load a fu...  None   
1  65819404  None  <p>If you're on Ubuntu 18.04, you can use <cod...  None   
2  65819406  None  <p>I think I managed to reject the subscriptio...  None   
3  65819420  None  <p>This task is honestly a horrible example of...  None   
4  65819422  None  <p>In Silverstripe 4 the Elemental module is n...  None   

                                           tokenized  
0  [happen, tri, load, fullscreen, ad, imag, crea...  
1  [ubuntu, use, code, sudo, apt, instal, nvidia,...  
2  [think, manag, reject, subscript, return, null...  
3  [thi, task, honestli, horribl, exampl, thread,...  
4  [silverstrip, element, modul, namespac, fixtur...  


In [40]:
# Write the combined frame to 'preprocessed.csv' in the working directory.
# NOTE(review): the 'tokenized' cells are Python lists, so the CSV presumably
# stores their string form, and with default arguments the row index is
# presumably written as an extra column — confirm downstream readers expect this.
final_data.to_csv('preprocessed.csv')