In [0]:
import os
from google.colab import drive

import seaborn as sns
import matplotlib.pyplot as plt
import re
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer 
from collections import Counter
from nltk import bigrams
from nltk.stem import PorterStemmer
from textblob import TextBlob
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from scipy.sparse.linalg import svds
from sklearn.preprocessing import normalize
 
#mount your Google drive into this notebook
drive.mount('/content/gdrive')
#find the path to your Google drive root
os.getcwd()+"/gdrive/My Drive"
path = os.chdir('/content/gdrive/My Drive/Colab Notebooks/Data Mining/Group Project')

In [0]:
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation

def clean_txt(df, size): #prefer this method
    tweets_text = df['text'].str.replace(r'[^\x00-\x7F]+', '', regex=True) # remove non-character word    
    clean_tweets = []
    for t in tweets_text: # loop through every tweets
        terms_stop = [re.sub(r"\d", "", term, flags=re.I) for term in WhitespaceTokenizer().tokenize(t) # remove term that contains number
                    if term.lower() 
                    not in stop
                    #and not term.isdigit()
                    and not term.startswith(('@','#','http','rt', 'via'))]     
        terms_stop = list(filter(None, terms_stop)) # remove empty/ non_character words        
        terms_stem = [PorterStemmer().stem(word) for word in terms_stop] 
        if len(terms_stop) > size: # only analyze long tweets          
          t=" ".join(terms_stop)
          #t=" ".join(terms_stop)        
          clean_tweets.append(t)
    return clean_tweets

def cand_data(df,cand_name):
    df['candidate'] = np.where(df['text'].str.contains(cand_name, flags=re.IGNORECASE,regex=True),'Filtered', '')
    df = df[df.candidate =='Filtered']
    return df['text']

In [0]:
def svd_func(df,word_in,top_sim):
  clean_tweets = clean_txt(df, 20) # only analyze length of more than 20 words 
  vectorizer= TfidfVectorizer()
  my_matrix = vectorizer.fit_transform([t for t in clean_tweets]).transpose() # words x documents(tweets)
  print(my_matrix.shape)
  words_compressed, _, docs_compressed = svds(my_matrix, k=40) # most data stays in the first 10 dimension, choose 40 to be safe
  docs_compressed = docs_compressed.transpose()
  print(words_compressed.shape)
  print(docs_compressed.shape) 

  word_to_index = vectorizer.vocabulary_
  index_to_word = {i:t for t,i in word_to_index.items()}

  words_compressed = normalize(words_compressed, axis = 1) # PCA

  k= top_sim # number of most closest words
  if word_in not in word_to_index: return "Not in vocab."
  sims = words_compressed.dot(words_compressed[word_to_index[word_in],:]) # U x vector of the search word to find similarity
  asort = np.argsort(-sims)[:k+1] # sorting similarity from biggest to smallest, that's why there is a negative sign
  return [(index_to_word[i],sims[i]/sims[asort[0]]) for i in asort[1:]] # return similar words, similarity

In [0]:
import requests
import bs4

#URL = "https://www.rev.com/blog/transcripts/january-iowa-democratic-debate-transcript"
URL = "https://www.rev.com/blog/transcripts/democratic-debate-transcript-las-vegas-nevada-debate"


requests.get(URL, {}).text
web_page = bs4.BeautifulSoup(requests.get(URL, {}).text, "lxml")
transcript = web_page.body.find_all(name="p")

# Remove links to video
for i in range(len(transcript)):
  transcript[i] = re.sub(r"\(<a .*</a>\)<br/>", '', transcript[i].text)
transcript = pd.DataFrame(transcript,columns=['text'])

In [5]:
svd_func(transcript, 'trump',10)

(2161, 171)
(2161, 40)
(171, 40)


[('donald', 0.9266385017198637),
 ('beat', 0.7840938355251859),
 ('board', 0.7205891109308151),
 ('eight', 0.7205891109308151),
 ('ahead', 0.7205891109308151),
 ('interesting', 0.7205891109308151),
 ('equipped', 0.7205891109308151),
 ('wonderful', 0.7205891109308151),
 ('toss', 0.7205891109308151),
 ('poll', 0.5700803400612271)]

In [6]:
twitter_1 = pd.read_csv('public_database.csv')
trump_twt_1 = svd_func(twitter_1, 'trump',10)
trump_twt_1

(11738, 2565)
(11738, 40)
(2565, 40)


[('hairs', 0.8268590302303673),
 ('unhealthy', 0.7673640050911901),
 ('bloated', 0.7673640050911901),
 ('incoherently', 0.7673640050911901),
 ('wrinkled', 0.7673640050911901),
 ('invest', 0.7393690666029499),
 ('caucused', 0.7337167805822422),
 ('alot', 0.7158926394673832),
 ('chuck', 0.6574975150363199),
 ('coping', 0.6556045285963331)]

In [7]:
twitter_2 = pd.read_csv('public_database-0220.txt')
trump_twt_2 = svd_func(twitter_2, 'trump',10)
trump_twt_2

(14429, 3672)
(14429, 40)
(3672, 40)


[('beats', 0.8002747196496772),
 ('tops', 0.7772178833263852),
 ('sworn', 0.7576827080866215),
 ('knowledgable', 0.7473786214563808),
 ('evasive', 0.7473786214563808),
 ('uttered', 0.7405985267472185),
 ('lashes', 0.7405985267472185),
 ('hispanic', 0.7405985267472185),
 ('longest', 0.7061147822722741),
 ('sheds', 0.7000258355117607)]