# <center> HW4 - LSA based semantics analysis </center>

&copy; 2023 Kaiwen Zhou

# Import Packages

In [1]:
import re
import pandas as pd
import pickle
import os
import time
import sys


import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.neighbors import NearestNeighbors

import matplotlib.pyplot as plt
%matplotlib inline

# (a) Download Dataset, Remove URLs then Save to CSV

In [2]:
df = pd.read_csv("https://huggingface.co/datasets/zeroshot/twitter-financial-news-sentiment/raw/main/sent_train.csv")
print(df['text'][:10])
for i, text, in enumerate(df['text']):
    df['text'][i] = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', text, flags=re.MULTILINE)
df.to_csv("sent_train.csv")

0    $BYND - JPMorgan reels in expectations on Beyo...
1    $CCL $RCL - Nomura points to bookings weakness...
2    $CX - Cemex cut at Credit Suisse, J.P. Morgan ...
3    $ESS: BTIG Research cuts to Neutral https://t....
4    $FNKO - Funko slides after Piper Jaffray PT cu...
5    $FTI - TechnipFMC downgraded at Berenberg but ...
6        $GM - GM loses a bull https://t.co/tdUfG5HbXy
7    $GM: Deutsche Bank cuts to Hold https://t.co/7...
8                   $GTT: Cowen cuts to Market Perform
9    $HNHAF $HNHPD $AAPL - Trendforce cuts iPhone e...
Name: text, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'][i] = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', text, flags=re.MULTILINE)


check if they look nice

In [3]:
df['text'][:10]

0    $BYND - JPMorgan reels in expectations on Beyo...
1    $CCL $RCL - Nomura points to bookings weakness...
2    $CX - Cemex cut at Credit Suisse, J.P. Morgan ...
3                 $ESS: BTIG Research cuts to Neutral 
4     $FNKO - Funko slides after Piper Jaffray PT cut 
5    $FTI - TechnipFMC downgraded at Berenberg but ...
6                               $GM - GM loses a bull 
7                     $GM: Deutsche Bank cuts to Hold 
8                   $GTT: Cowen cuts to Market Perform
9    $HNHAF $HNHPD $AAPL - Trendforce cuts iPhone e...
Name: text, dtype: object

**Looks good.**

# (b) Create a doc2vec(doc, tfidf_vectorizer) function

In [4]:
# Setting up the TfidfVectorizer to fit the doc and generate 
# vectorized outcome and the feature vector
Tfidf_vectorizer = TfidfVectorizer(
    max_df=0.5, # ignore terms which occur in more than half of the documents
    max_features=10000,
    min_df=2, # ignore terms which occur in less than 2 documents
    stop_words='english',
    norm='l2',
    use_idf=True, 
    analyzer='word',
    token_pattern = '(?u)\\b[a-zA-Z]\\w+\\b'
)

# Setting up the CountVectorizer to count the # of each feature for each doc
count_vectorizer = CountVectorizer(max_df=0.5, # ignore terms which occur in more than half of the documents
                                   max_features=10000,
                                   min_df=2, # ignore terms which occur in less than 2 documents
                                   stop_words='english',
                                   analyzer='word',
                                   token_pattern = '(?u)\\b[a-zA-Z]\\w+\\b'
            )

# vectorize the given doc
def docs2vec(docs, tfidf_vectorizer):
    vec = tfidf_vectorizer.fit_transform(docs).toarray()
    doc_features = tfidf_vectorizer.get_feature_names()
    doc_counts = count_vectorizer.fit_transform(docs).toarray()
    return vec, doc_features, doc_counts

In [5]:
# tidy up the corpus
full_corpus = list(df['text'])

# Train tfidf_vectorizer on the corpus (full dataset).
full_corpus_tfidf, full_corpus_feature_names, full_corpus_feature_count = docs2vec(full_corpus, Tfidf_vectorizer)



### check the shape of the vectorized corpus

In [6]:
full_corpus_tfidf.shape

(9543, 6762)

### check the vectorized corpus

In [7]:
full_corpus_tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

**Surely, it's a sparse matrix.**

### check the fitted features

In [8]:
full_corpus_feature_names[:10]

['a320neo',
 'a350',
 'aal',
 'aaoi',
 'aapl',
 'ab',
 'abb',
 'abbv',
 'abbvie',
 'abc']

### Count Vectorized matrix containing number of occurance of each feature

In [9]:
full_corpus_feature_count.shape

(9543, 6762)

In [10]:
full_corpus_feature_count

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# (c) doc to vectors examples

In [11]:
doc1 = 'Jabberwocky'
doc2 = 'buy $MSFT sell $AAPL hold Brent'
doc3 = 'bullish #stocks'

print("the corresponding vector for doc1 is ", Tfidf_vectorizer.transform([doc1]).toarray())
print("the corresponding vector for doc2 is ", Tfidf_vectorizer.transform([doc2]).toarray())
print("the corresponding vector for doc3 is ", Tfidf_vectorizer.transform([doc3]).toarray())

the corresponding vector for doc1 is  [[0. 0. 0. ... 0. 0. 0.]]
the corresponding vector for doc2 is  [[0. 0. 0. ... 0. 0. 0.]]
the corresponding vector for doc3 is  [[0. 0. 0. ... 0. 0. 0.]]


### If you insist to know, uncomment the code below 

In [12]:
# np.set_printoptions(threshold=sys.maxsize)
# print("the corresponding vector for doc1 is ", Tfidf_vectorizer.transform([doc1]).toarray())
# print("the corresponding vector for doc2 is ", Tfidf_vectorizer.transform([doc2]).toarray())
# print("the corresponding vector for doc3 is ", Tfidf_vectorizer.transform([doc3]).toarray())
# np.set_printoptions(threshold=False)

# (d) LSA Recommender Pipeline

**Here the question requests us to output the matrix containing lists of strings corresponding to the given str; however, the results are very ugly and hard to compare since every string has different length, so I decide to output the indices and print out the corresponding strings at the end.**

In [27]:
t0 = time.time()
# Rewrite the NearestNeighbors class to make it compatible with Pipepline and 
# introduce a predict function to the class
class predict_KNeighbors(NearestNeighbors):
    def transform(self, X=None, y=None):
        return self
    
    def predict(self, X=None, n_neighbors=5, return_distance=False):
        return super(predict_KNeighbors, self).kneighbors(X, n_neighbors, return_distance)


for n_components in [50, 200, 500]:
    
    # Construct Pipepline: vectorizer --> SVD --> KNN
    lsarec = make_pipeline(
        # tfidf_vectorizer
        TfidfVectorizer(
            max_df=0.5, # ignore terms which occur in more than half of the documents
            max_features=10000,
            min_df=2, # ignore terms which occur in less than 2 documents
            stop_words='english',
            norm='l2',
            use_idf=True, 
            analyzer='word',
            token_pattern = '(?u)\\b[a-zA-Z]\\w+\\b'
        ),
        # Project the tfidf vectors onto the first N principal components.
        TruncatedSVD(
            n_components=n_components,
            random_state=42
        #     algorithm='arpack'
        ),
        # Build a k-NN classifier. Use k = 5 (majority wins)
        predict_KNeighbors(
            n_neighbors=5, 
            algorithm='brute',
            metric='l2'
        )
    )

    # pipeline the whole corpus
    full_corpus_lsa = lsarec.fit_transform(full_corpus)
    
    print(f'For n_components={n_components}, model={full_corpus_lsa}')
    
    # predict the 5 nearest neighbors of the given strings and output their indices
    predicted_str_idx = lsarec.predict(['Fed', '$MSFT', 'tech rally', 'disappoint earnings'])
    # print their indices
    print("Indices Grid: \n", predicted_str_idx)
    # print the 5 nearest stings for each input string
    for i, text in enumerate(['Fed', '$MSFT', 'tech rally', 'disappoint earnings']):
        print(f'\nTOP 5 CLOEST strings for "{text}" are: ')
        for j in range(5):
            print(f'\t{j+1}',full_corpus[predicted_str_idx[i][j]])
            
    print("  done in %.3fsec\n" % (time.time() - t0))

For n_components=50, model=predict_KNeighbors(algorithm='brute', metric='l2')
Indices Grid: 
 [[ 687  622  682  511  650]
 [5034 4869 7334 8189 8085]
 [7874 9364 6446 2099 6524]
 [2604 2568 2675 2487 2674]]

TOP 5 CLOEST strings for "Fed" are: 
	1 The Fed is on hold for now, but it might not take much to change that
	2 LIVE: Fed Chair Jerome Powell speaks on today's decision to hold rates ▶️


	3 The Fed Faces a Housing Conundrum
	4 Chairman Jerome Powell speaks after Fed leaves interest rates unchanged 
	5 Powell's 'half-full' U.S. glass sturdy but still at risk for spills as Fed meets
 

TOP 5 CLOEST strings for "$MSFT" are: 
	1 Riassunto: Priyanka Chopra Jonas e Crocs doneranno 50.000 paia di zoccoli classici all’UNICEF
	2 More later. 
	3 Ruth Bader Ginsburg Hospitalized Again 
	4 $EXPE continuation 
	5 Signet Jewelers Finally Regains Some Sparkle

TOP 5 CLOEST strings for "tech rally" are: 
	1 $OPK EOD rally !!!
	2 Lyft and Peloton Rally, Defying the Mantle of 2019’s Least-Loved De

# (f) Lemmatization

In latent semantic analysis (LSA), lemmatization refers to the process of reducing words to their base form, or lemma. This is done to group together words that have the same root but different inflections, such as "walk," "walking," and "walked."

Lemmatization is important in LSA because it helps to reduce the dimensionality of the text data by collapsing all inflected forms of a word into a single token. This can improve the accuracy of the analysis by reducing the noise caused by redundant or similar words.

For example, without lemmatization, the words "walk," "walked," "walking," and "walks" would be treated as separate words, even though they all have the same root. By lemmatizing these words to their base form "walk," LSA can more accurately represent the underlying meaning of the text.


In [14]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize    
from nltk.corpus import stopwords
import nltk

#nltk.download() # Used for downloading different stuff

# Download stopwords list
nltk.download('punkt')
#nltk.download('stopwords')
stop_words = set(stopwords.words('english')) 

# Interface lemma tokenizer from nltk with sklearn
class LemmaTokenizer:
    ignore_tokens = [',', '.', ';', ':', '"', '``', "''", '`']
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc) if t not in self.ignore_tokens]
    
# Lemmatize the stop words
tokenizer=LemmaTokenizer()
print(stop_words)
print(' '.join(stop_words))
token_stop = tokenizer(' '.join(stop_words))


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/zhoukaiwen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


{"haven't", 'yours', 'few', 'have', 'o', 'yourselves', 'himself', 'only', 'in', 'ourselves', 'needn', "you'll", 'into', 'ain', "needn't", 'what', 'which', "doesn't", 'shouldn', 'the', 'we', 'again', 'aren', 'hasn', 'if', 'some', 'after', 'between', 'how', 'very', 'more', 'wasn', 'during', 'hadn', 'yourself', "shan't", 'theirs', 'will', "wasn't", 'wouldn', 'itself', 'so', 'all', 'any', 'didn', 'below', 'them', 'he', 'nor', 'his', 'now', 'has', 'whom', 'it', 'hers', 'my', 'at', 'by', 'mustn', "you'd", 'do', 'no', 'this', "it's", 'off', 'am', 'doing', 'for', 'too', 'should', 'themselves', 's', 'and', "you've", 'under', 'me', 'because', 'where', "you're", 'y', "won't", 'doesn', 'about', 'your', 'as', 'were', 'does', 'just', "aren't", 'through', 'she', 'her', 'herself', 'from', 'there', 'won', 'that', 'our', 'its', 'further', 'with', 'having', 'haven', "hadn't", 'these', 'been', 'being', "should've", 'those', 'up', 'or', 'above', 't', "weren't", 'same', 'was', 'couldn', 'each', 'an', 'are',

### Incorporate the lemmatization into the Pipeline

In [18]:
t0 = time.time()
for n_components in [50, 200, 500]:
    # Construct Pipepline: vectorizer --> SVD --> KNN
    lemm_lsarec = make_pipeline(
        # tfidf_vectorizer incorporated lemmatized tokenizer
        TfidfVectorizer(
            max_df=0.5, # ignore terms which occur in more than half of the documents
            max_features=10000,
            min_df=2, # ignore terms which occur in less than 2 documents
            stop_words=token_stop,
            norm='l2',
            use_idf=True, 
            analyzer='word',
            token_pattern = '(?u)\\b[a-zA-Z]\\w+\\b',
            tokenizer=tokenizer
        ),
        # Project the tfidf vectors onto the first N principal components.
        TruncatedSVD(
            n_components=n_components,
            random_state=42
        #     algorithm='arpack'
        ),
        # Build a k-NN classifier. Use k = 5 (majority wins)
        predict_KNeighbors(
            n_neighbors=5, 
            algorithm='brute', 
            metric='l2'
        )
    )

    # pipeline the whole corpus
    full_corpus_lemm_lsa = lemm_lsarec.fit_transform(full_corpus)
    
    print(f'For n_components={n_components}, model={full_corpus_lemm_lsa}')
    # predict the 5 nearest neighbors of the given strings and output their indices
    predicted_str_idx = lemm_lsarec.predict(['Fed', '$MSFT', 'tech rally', 'disappoint earnings'])
    # print their indices
    print("Indices Grid: \n", predicted_str_idx)
    # print the 5 nearest stings for each input string
    for i, text in enumerate(['Fed', '$MSFT', 'tech rally', 'disappoint earnings']):
        print(f'\nTOP 5 CLOEST strings for "{text}" are: ')
        for j in range(5):
            print(f'\t{j+1}',full_corpus[predicted_str_idx[i][j]])
            
    print("  done in %.3fsec\n" % (time.time() - t0))

For n_components=50, model=predict_KNeighbors(algorithm='brute', metric='l2')
Indices Grid: 
 [[ 622  511  481  621  562]
 [8160 8189 8153 8268 7954]
 [3677 2099 4233 6447 1536]
 [2674 2872 2846 2609 2817]]

TOP 5 CLOEST strings for "Fed" are: 
	1 LIVE: Fed Chair Jerome Powell speaks on today's decision to hold rates ▶️


	2 Chairman Jerome Powell speaks after Fed leaves interest rates unchanged 
	3 An emerging priority for Powell Fed: The plight of the poor  
	4 LIVE: Fed Chair Jerome Powell delivers remarks at the decade's last FOMC meeting 
	5 Fed: Powell stressed policy dependent on incoming information

TOP 5 CLOEST strings for "$MSFT" are: 
	1 $BIIB 😀 
	2 $EXPE continuation 
	3 $APOP on scans.
	4 $SOXL 
	5 All these names are working. Buying $BZH $HOV

TOP 5 CLOEST strings for "tech rally" are: 
	1 Sterling rallies as Tories unite behind Brexit plan
	2 Buenos Aires Surrender Sends Argentine Bonds on a Wild Rally
	3 Brexit Bulletin: The Cliff Edge is Back
	4 Boris Johnson Set to B

**We can see the results have changed.**