## Notebook Overview: Basic Vectorization

This notebook was used for some basic NLP/analysis. A form of this was provided to me by Dr. Polson. I didn't end up using any of the outputs for the rest of the project, but it was helpful to see the preprocessing steps and learn how to create word/sentence tokens and transform those to tfidf vectors and use them in analysis.

In [4]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import re

import pandas as pd
from glob import glob
import os
import pandas as pd
import sqlite3

In [5]:
stemmer = PorterStemmer()
nums = r'[0-9]'
stop_words = list(stopwords.words("english"))
def pre_process(text):

    text = text.lower()
    text = re.sub(nums, '',text) # remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  #remove punctuation
    text = [word for word in text.split() if word.lower() not in stop_words]  #remove stopwords
    words = ""
    for i in text:  #word stemming
            words += (stemmer.stem(i))+" "
    return words

In [6]:
# read in the data from sqlite DB
datadir = r"C:\Users\keatu\Regis_archive\practicum_data"
dbfile = os.path.join(datadir,"Facebook.db")
con = sqlite3.connect(dbfile)
posts = pd.read_sql("select * from posts",con)
comments = pd.read_sql("select * from comments", con)
replies = pd.read_sql("Select * from replies",con)
con.close()

In [7]:
print("Total posts: {}".format(len(posts)))
print("Total comments: {}".format(len(comments)))
print("Total comment replies: {}".format(len(replies)))

Total posts: 3815
Total comments: 69326
Total comment replies: 123641


In [8]:
# concatenate all dataframes using only text and user id fields
all_text = pd.concat([
                    posts[["user_id","post_id","text"]],
                    comments[["commenter_id","comment_id","comment_text"]].rename(columns={"commenter_id":"user_id","comment_text":"text"}),
                    replies[["commenter_id","comment_id","comment_text"]].rename(columns={"commenter_id":"user_id","comment_text":"text"})
                    ], sort = False)

In [9]:
print("Total unique users: {}".format(all_text["user_id"].nunique()))

Total unique users: 22586


In [10]:
text_features = all_text["text"].apply(pre_process)

In [11]:
#Find the most frequently occuring words that are most meaningful from out text corpus

def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=False)
    
    #get the feature names and tf-idf score of top n items
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []
    
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

In [12]:
#create TfidVectorizer to create the numerical values
vectorizer = TfidfVectorizer(stop_words="english",decode_error='ignore', lowercase = True, min_df=2)

#numericalize the textFeatures
features = vectorizer.fit_transform(text_features.values.astype('U'))
feature_names = vectorizer.get_feature_names()
#print(vectorizer.vocabulary_)

In [58]:
sorted_items=sort_coo(features.tocoo())

keywords=extract_topn_from_vector(feature_names,sorted_items,20)
print(keywords)

{'covid': 0.001, 'long': 0.002, 'work': 0.001, 'like': 0.001, 'good': 0.002, 'sinc': 0.002, 'post': 0.002, 'bodi': 0.002, 'week': 0.002, 'month': 0.002, 'come': 0.002, 'posit': 0.002, 'mani': 0.002}


In [15]:
search_terms = ["blood clot","heart","cardiovascular","stroke","deep vein thrombosis","embolism","out of breath","shortness of breath","heparin","warfarin","rapid heartbeat","heart rate","lightheaded","sweat","fever","leg pain","leg swelling", "leg swollen","clammy skin","discolor skin","cyanosis"]

In [16]:
relevant = all_text[all_text["text"].str.contains("|".join(search_terms))]

In [19]:
relevant.text.to_csv("../blank.csv")

  """Entry point for launching an IPython kernel.
