In [1]:
import os
import json
import time
import glob
import numpy as np
import pandas as pd
import datetime
from json.decoder import JSONDecodeError
from pandas import json_normalize
from matplotlib.backends.backend_pdf import PdfPages
import re
from nltk.stem import WordNetLemmatizer

from pandas_datareader import data as pdr

import matplotlib.pyplot as plt

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)


In [2]:
def preprocess_results(day, top_p, parent_dir):
    sub_industry = 'mid_large_cap_stocks'

    tmp_path = os.path.join(os.path.join(parent_dir, sub_industry),  day)

    try:
        os.chdir(tmp_path)
    except:
        print('No such directory found!')

    print(tmp_path)

    '''
        get stock prices and large change stock list
    '''
    large_change_stocks_all_original = {}
    current_prices_original = {}

    for p in top_p:
        current_path = os.path.join(tmp_path, 'threshold_{}'.format(p))

        with open(current_path + '/1_large_change_stocks.json') as f:
            large_change_stocks_all_original[p] = json.load(f)

        with open(current_path + '/2_current_prices.json') as f:
            current_prices_original[p] = json.load(f)

    print('iterations: {}'.format(len(current_prices_original[top_p[0]].keys())))


    open_time = datetime.datetime(2020, int(day.split('-')[1]), \
                                  int(day.split('-')[2]), 21, 31).isoformat()
    print('Open time: {}'.format(open_time))

    if len([k for k, v in large_change_stocks_all_original[top_p[0]].items() if v != []]) == 0:
        print('no large change stocks!')
    elif len([k for k, v in large_change_stocks_all_original[top_p[0]].items() if v != []]) == 1:
        iter_1st = [k for k, v in large_change_stocks_all_original[top_p[0]].items() if v != []][0]
        print(iter_1st)

        print('Time of 1st iteration: {}'.format(large_change_stocks_all_original[top_p[0]][iter_1st][0][3]))
        large_change_stocks_all = large_change_stocks_all_original.copy()
        current_prices = current_prices_original.copy()

        if large_change_stocks_all_original[top_p[0]][iter_1st][0][3] < open_time:
            # remove 1st iteration before 09:30
            print('Time of 1st iteration before 09:30:00...')
            print('Remove 1st iteration in large change stocks dataset')

            large_change_stocks_all[top_p[0]][iter_1st] = []

    elif len([k for k, v in large_change_stocks_all_original[top_p[0]].items() if v != []]) >= 2:
        iter_1st = [k for k, v in large_change_stocks_all_original[top_p[0]].items() if v != []][0]
        iter_2nd = [k for k, v in large_change_stocks_all_original[top_p[0]].items() if v != []][1]
        print(iter_1st, iter_2nd)
        print('Time of 1st iteration: {}'.format(large_change_stocks_all_original[top_p[0]][iter_1st][0][3]))
        large_change_stocks_all = large_change_stocks_all_original.copy()
        current_prices = current_prices_original.copy()

        if large_change_stocks_all_original[top_p[0]][iter_1st][0][3] < open_time:
            # remove 1st iteration before 09:30
            print('Time of 1st iteration before 09:30:00...')
            print('Remove 1st iteration in large change stocks dataset')

            large_change_stocks_all[top_p[0]][iter_1st] = []

        if current_prices_original[top_p[0]][iter_1st][0][1] < open_time and current_prices_original[top_p[0]][iter_2nd][0][1] < open_time:
            print('Time of 1st and 2nd iterations before 09:30:00...')
            print('Remove 1st iteration in current prices dataset')

            current_prices[top_p[0]][iter_1st] = []   
    
    return current_prices, large_change_stocks_all

In [3]:
def get_first_large_change_times(current_stock, large_times):
    first_large_change = {}
    for p in top_p:
        first_large_change[p] = {}

        n_len = len(large_times[p])
        for i in range(n_len):
            if current_stock == large_times[p][i][0]:
                first_large_change[p][current_stock] = (large_times[p][i][3], large_times[p][i][1], large_times[p][i][2])
                break
            else:
                continue
    return first_large_change


In [4]:
'''
   
   Get opening price as the first data point
        
'''
from pandas_datareader import data as pdr

def get_stock_openprice(unique_stocks, day):
    
    large_stock_openprice = {}
    large_stock_hist = {}

    for s in sorted(list(unique_stocks)):
        tmp_hist = pdr.get_data_yahoo(s, start = day, end = day)
        large_stock_hist[s] = tmp_hist
        tmp_hist = tmp_hist[~tmp_hist.index.duplicated(keep='first')]

        large_stock_openprice[s] = tmp_hist.loc[day]

    return large_stock_openprice

In [5]:
def plot_stock_prices(day, top_p, df_thre_at_q,
                     current_prices, large_change_stocks_all, 
                     pic_path):

    if len([k for k, v in large_change_stocks_all[top_p[0]].items() if v != []]) == 0:
        print('No large change stock found!')
    else:

        '''
           stocks_list: 
           n_stocks: number of stocks
        '''

        stocks_list = [x[0] for x in current_prices[top_p[0]]['2']]
        n_stocks = len(stocks_list)

        print('Number of stocks in the list:', n_stocks)


        '''
            Get individual stock prices
                - individual_stock_price: 
                    - key: stock symbol
                    - value: key - timestamp, value - stock price
        '''
        individual_stock_price = {}
        for s in stocks_list:
        #     print(s)
            individual_stock_price[s] = {}

            for p in top_p:
                n_cnts = len(current_prices[p].keys())

                for k, v in current_prices[p].items():
                    for i in v:
                        if i[0] == s:
                            individual_stock_price[s][datetime.datetime.strptime(i[1], "%Y-%m-%dT%H:%M:%S.%f").isoformat()] = \
                                float(str(i[2]).replace(',',''))


        '''
            large_times:
                key: 95th
                value: (symbol, price, %change, time)
            large_stocks:
                key: 95th
                value: set of large change stocks
        '''
        large_times = {}
        large_stocks = {}

        for p in top_p:

            large_times[p] = []
            large_stocks[p] = []

            for k, v in large_change_stocks_all[p].items():
                if len(v) > 0:
                    large_stocks[p].extend([x[0] for x in v])
                    large_times[p].extend([(x[0], float(x[1].replace(',', '')), x[2], datetime.datetime.strptime(x[3], "%Y-%m-%dT%H:%M:%S.%f").isoformat()) for x in v])

            large_stocks[p] = set(large_stocks[p])

        print(large_stocks[top_p[0]], large_times[top_p[0]][0])


        '''
            first_large_change: 
                key: symbol
                value: dict of 
                    key: 95th
                    value: k=symbol, v=(time, price, %change)
        '''

        first_large_change = {}

        unique_stocks = set()
        for k,v in large_stocks.items():
            unique_stocks = unique_stocks.union(v)

        for s in sorted(list(unique_stocks)):
            first_large_change[s] = get_first_large_change_times(s, large_times)

        print(first_large_change)


        '''
            large_stock_price:
                key: symbol
                value: df-stock price for each iteration, index=time, value=price
        '''
        large_stock_price = {}
        for k, v in individual_stock_price.items():
            if k in unique_stocks:
                large_stock_price[k] = v


        for k, v in large_stock_price.items():
            large_stock_price[k] = pd.DataFrame(v.values(), index=v.keys(), columns=[k])
            large_stock_price[k].sort_index(inplace=True)

        print(large_stock_price.keys())


        '''
            Stock price change plot:
                - red dot: first detect large price change
        '''

        large_stock_price_copy = large_stock_price.copy()

        ####
        large_stock_openprice = get_stock_openprice(unique_stocks, day)
        open_time = datetime.datetime(2020, int(day.split('-')[1]), int(day.split('-')[2]), 21, 30).isoformat()

        n_col = ['ro', 'bo', 'yo']

        if not os.path.exists(pic_path):
            os.mkdir(pic_path)

        with PdfPages('{}/all_stocks_price_plot_{}.pdf'.format(pic_path, day)) as pdf:
            for s in sorted(list(unique_stocks)):
                p_time = []
                for p in top_p:
                    if first_large_change[s][p]:
                        p_time.append((p, first_large_change[s][p][s]))
                print(s, p_time)

                if large_stock_price[s].index[0] > open_time:
                    # adding open price in the stock price list 
                    large_stock_price_copy[s] = pd.concat([pd.DataFrame({s: large_stock_openprice[s].loc['Open']}, index=[open_time]), \
                              large_stock_price_copy[s]])
                
                large_stock_price_copy[s] = large_stock_price_copy[s].sort_index()

                # get length of stock prices
                p_len = len(large_stock_price_copy[s].index)

                insert_index = [(p_time[i][0], sum(p_time[i][1][0] > large_stock_price_copy[s].index)) for i in range(len(p_time))]
                print(insert_index)

                if insert_index[0][1] > 1:

                    fig = plt.figure(figsize=(7, 5))
                    plt.plot(range(p_len), large_stock_price_copy[s].values, 'o-')

                    dot = {}
                    for i in range(len(p_time)):
                        dot[i], = plt.plot(insert_index[i][1]-1, p_time[i][1][1], n_col[i])

                    plt.legend(list(dot.values()), ['Percentage Change: {}%'.format(p_time[i][1][2]) for p in p_time], loc = 4)
                    plt.xlabel('Iteration')
                    plt.ylabel('Price')
                    plt.title('{}: {}'.format(day, s))
                    txt = "Stock {}: detect {}% change (threshold {}%) \n at time {} with price ${}".format(s, 
                                         p_time[0][1][2], round(df_thre_at_q[df_thre_at_q['symbols']==s]['threshold_at_{}'.format(top_p[0])].values[0], 2), 
                                         p_time[0][1][0], p_time[0][1][1])
                    plt.text(0.05, 0.95, txt, transform=fig.transFigure, size = 8)
                    pdf.savefig(fig)
                    plt.close()


    
        return unique_stocks, first_large_change

In [6]:
'''
    dates_filename_list: 
    top_p: 
        95th percentile as the threshold to detect a stock with large change
    sub_industry: 
        focus on mid-cap and large-cap companies

'''
parent_dir = '/Users/lin/Desktop/Stock-Market-Analysis/'

top_p = [95]

dates_filename_list = sorted([filename for filename in os.listdir('{}mid_large_cap_stocks/'.format(parent_dir)) 
                              if filename.startswith('2020')])
print(dates_filename_list)


df_thre_at_q = pd.read_csv('{}df_thre_at_q{}.csv'.format(parent_dir,\
                                                               top_p[0]), index_col=0)


['2020-10-02', '2020-10-05', '2020-10-06', '2020-10-07', '2020-10-13', '2020-10-14', '2020-10-15', '2020-10-16', '2020-10-19', '2020-10-20', '2020-10-21', '2020-10-22', '2020-10-23', '2020-10-26', '2020-10-27']


In [7]:
'''
    Stock price movement visualization

'''
current_prices_d = {}
large_change_stocks_all_d ={}
unique_stocks_d = {}
first_large_change_d = {}

for day in dates_filename_list:
    print(day)
    current_prices_d[day], \
    large_change_stocks_all_d[day] = preprocess_results(day, top_p,
                                                       parent_dir)
    if len([k for k, v in large_change_stocks_all_d[day][top_p[0]].items() if v != []]) == 0:
        print('No large change stock found!')
        unique_stocks_d[day] = set()
        first_large_change_d[day] = dict()
    else:
        unique_stocks_d[day], \
        first_large_change_d[day] = plot_stock_prices(day, top_p, df_thre_at_q, 
                                                     current_prices_d[day], large_change_stocks_all_d[day],
                                                     pic_path=os.path.join(parent_dir, 'stock_price_figures'))
    
    print('\n')

2020-10-02
/Users/lin/Desktop/Stock-Market-Analysis/mid_large_cap_stocks/2020-10-02
iterations: 23
Open time: 2020-10-02T21:31:00
1 10
Time of 1st iteration: 2020-10-02T21:25:13.391098
Time of 1st iteration before 09:30:00...
Remove 1st iteration in large change stocks dataset
Number of stocks in the list: 167
{'HSBC', 'LSCC'} ('HSBC', 19.91, 2.05, '2020-10-02T23:41:38.530767')
{'HSBC': {95: {'HSBC': ('2020-10-02T23:41:38.530767', 19.91, 2.05)}}, 'LSCC': {95: {'LSCC': ('2020-10-03T00:10:57.922478', 28.09, 5.58)}}}
dict_keys(['HSBC', 'LSCC'])
HSBC [(95, ('2020-10-02T23:41:38.530767', 19.91, 2.05))]
[(95, 10)]
LSCC [(95, ('2020-10-03T00:10:57.922478', 28.09, 5.58))]
[(95, 12)]


2020-10-05
/Users/lin/Desktop/Stock-Market-Analysis/mid_large_cap_stocks/2020-10-05
iterations: 26
Open time: 2020-10-05T21:31:00
1 2
Time of 1st iteration: 2020-10-05T21:24:19.792687
Time of 1st iteration before 09:30:00...
Remove 1st iteration in large change stocks dataset
Number of stocks in the list: 167
{'E

HEI [(95, ('2020-10-16T01:41:26.464096', 115.28, 5.01))]
[(95, 18)]
TTD [(95, ('2020-10-16T01:12:00.734399', 598.33, 8.51))]
[(95, 16)]
TWST [(95, ('2020-10-16T02:11:11.537468', 98.15, 8.81))]
[(95, 20)]


2020-10-16
/Users/lin/Desktop/Stock-Market-Analysis/mid_large_cap_stocks/2020-10-16
iterations: 26
Open time: 2020-10-16T21:31:00
1 2
Time of 1st iteration: 2020-10-16T21:26:35.446327
Time of 1st iteration before 09:30:00...
Remove 1st iteration in large change stocks dataset
Number of stocks in the list: 167
{'JBHT', 'TDC'} ('TDC', 22.78, 6.9, '2020-10-16T21:41:35.795572')
{'JBHT': {95: {'JBHT': ('2020-10-16T22:27:26.548124', 134.17, 5.41)}}, 'TDC': {95: {'TDC': ('2020-10-16T21:41:35.795572', 22.78, 6.9)}}}
dict_keys(['JBHT', 'TDC'])
JBHT [(95, ('2020-10-16T22:27:26.548124', 134.17, 5.41))]
[(95, 5)]
TDC [(95, ('2020-10-16T21:41:35.795572', 22.78, 6.9))]
[(95, 2)]


2020-10-19
/Users/lin/Desktop/Stock-Market-Analysis/mid_large_cap_stocks/2020-10-19
iterations: 26
Open time: 2020-10-

CAJ [(95, ('2020-10-27T22:28:39.361397', 18.33, 4.0))]
[(95, 5)]
CR [(95, ('2020-10-27T21:41:39.399631', 56.46, 4.65))]
[(95, 2)]
HSBC [(95, ('2020-10-27T21:41:39.399631', 21.81, 5.21))]
[(95, 2)]
LLY [(95, ('2020-10-27T21:41:39.399631', 134.69, 4.95))]
[(95, 2)]
MT [(95, ('2020-10-28T00:54:55.340711', 13.87, 4.67))]
[(95, 15)]
RARE [(95, ('2020-10-28T00:54:55.340711', 98.98, 8.47))]
[(95, 15)]
UMPQ [(95, ('2020-10-28T03:07:34.403026', 12.59, 5.7))]
[(95, 24)]




## News and UGC data

In [8]:

'''
   
   Get opening price as the first data point
        
'''


def get_large_stock_df(day, unique_stocks_d, first_large_change_d):
    

    large_stock_openprice = {}
    large_stock_hist = {}

    for s in sorted(list(unique_stocks_d[day])):
        tmp_hist = pdr.get_data_yahoo(s, start = day, end = day)
        large_stock_hist[s] = tmp_hist
        tmp_hist = tmp_hist[~tmp_hist.index.duplicated(keep='first')]

        large_stock_openprice[s] = tmp_hist.loc[day]


    large_stock_df = pd.DataFrame(0, index=sorted(list(unique_stocks_d[day])), 
                                  columns=large_stock_hist[s].columns)
    large_stock_df['Thre_at_{}'.format(top_p[0])] = 0
    large_stock_df['First_Percent_Change'] = 0
    large_stock_df['First_Stock_Price'] = 0

    for s in sorted(list(unique_stocks_d[day])):
        large_stock_df.loc[s] = large_stock_hist[s].loc[day]
        large_stock_df.loc[s, 'Thre_at_{}'.format(top_p[0])] = df_thre_at_q[df_thre_at_q['symbols']==s]['threshold_at_{}'.format(top_p[0])].values
        large_stock_df.loc[s, 'First_Percent_Change'] = first_large_change_d[day][s][top_p[0]][s][2]
        large_stock_df.loc[s, 'First_Stock_Price'] = first_large_change_d[day][s][top_p[0]][s][1]

    large_stock_df = large_stock_df.assign(Total_Percent_Change = lambda x: 
                                                           round((x['Close'] - x['Open'])/x['Open']*100 ,2))

    print(large_stock_df.shape)

    large_stock_df.reset_index(inplace=True)
    large_stock_df.rename(columns={'index': 'Symbol'}, inplace=True)

    large_stock_df = pd.merge(large_stock_df, stocks_midlargeCap_webscrape_df,
                        how = 'left', on = 'Symbol')

    return large_stock_df

In [9]:
'''
    Get News and UGC dataframe for each day
'''


def get_news_ugc_data(day, tmp_path, unique_stocks_d):

    res_news_df = {}
    res_ugc_df = {}

    for s in sorted(list(unique_stocks_d[day])):

        news_filename_list = glob.glob(os.path.join(tmp_path, '{}/News_*.json'.format(s)))
        ugc_filename_list = glob.glob(os.path.join(tmp_path, '{}/UGC_*.json'.format(s)))

        res_news_dict = {}
        res_ugc_dict = {}

        for news_file in news_filename_list:

            with open(news_file) as f:
                try:
                    res_news_dict[news_file.split('{}/threshold_{}/{}/'.format(day, top_p[0], s))[1]] = json.load(f)
                except JSONDecodeError:
                    pass
            f.close()

        res_news_df[s] = pd.DataFrame({'published': [item['published'] for k, v in res_news_dict.items() for item in v],
                                    'title':[item['title'] for k, v in res_news_dict.items() for item in v]})
        # drop duplicates by title:
        res_news_df[s] = res_news_df[s].drop_duplicates('title').reset_index(drop = True)

        # drop news that do not contain stock key-words in title:
        res_news_df[s] = res_news_df[s][res_news_df[s]['title'].map(lambda x: \
                 any(word in x.lower() for word in stocks_key_words_dict[s]))].reset_index(drop=True)

        print('{}: News dataset shape: {}'.format(s, res_news_df[s].shape))

        for ugc_file in ugc_filename_list:

            with open(ugc_file) as f:
                try:
                    res_ugc_dict[ugc_file.split('{}/threshold_{}/{}/'.format(day, top_p[0], s))[1]] = json.load(f)
                except JSONDecodeError:
                    pass
            f.close()

        res_ugc_df[s] = []
        for k, v in res_ugc_dict.items():
            tmp_df = pd.DataFrame.from_dict(json_normalize(v['messages']), orient='columns')
            res_ugc_df[s].append(tmp_df)

        res_ugc_df[s] = pd.concat(res_ugc_df[s])
        # drop duplicates by message id:
        res_ugc_df[s] = res_ugc_df[s].drop_duplicates('id').reset_index(drop = True)

        print('{}: UGC dataset shape: {}'.format(s, res_ugc_df[s].shape))
    
    return res_news_df, res_ugc_df

In [10]:
stocks_midlargeCap_webscrape_df = pd.read_csv('{}/df_stocks_midlargeCap_webscrape.csv'.format(parent_dir), 
                                              index_col=0)

print(stocks_midlargeCap_webscrape_df.shape)

(167, 11)


In [11]:

'''

    Loading News and UGC data

        stocks_key_words_dict
        large_stock_df_dict
    
    Results: News and UGC
        res_news_df_dict
        res_ugc_df_dict

    
'''
# stocks_key_words_dict
stocks_key_words_dict = {}
del_words = ['Ltd', 'Inc', 'Corp', 'Holdings', 'LLC', 'Corporation', 'PLC',
             'HOLDERs', 'Co', 'Limited', 'Company',
             'Providers', 'group', 'com', 'Brands']
del_words = [w.lower() for w in del_words]

w_lemmatizer = WordNetLemmatizer()

# large_stock_df_dict
large_stock_df_dict = {}

# results: News and UGC
res_news_df_dict = {}
res_ugc_df_dict = {}

for day in dates_filename_list:
    print(day)
    
    if unique_stocks_d[day] == set():
        print('No large change stock found in day {}'.format(day))
    else:
        print('Found large change stocks:', unique_stocks_d[day])
        
        large_stock_df = get_large_stock_df(day, unique_stocks_d, first_large_change_d)
        large_stock_df_dict[day] = large_stock_df

        for s in sorted(list(unique_stocks_d[day])):
            if s not in stocks_key_words_dict:
                tmp_words = []
                tmp_words.append(s.lower())

                tmp_names = [w_lemmatizer.lemmatize(w.lower()) for w in re.findall(r'\w+', large_stock_df[large_stock_df['Symbol'] == s]['Name'].values[0]) \
                             if w.lower() not in del_words]
                tmp_words.append(' '.join(tmp_names))

                tmp_words = list(set(tmp_words))
                stocks_key_words_dict[s] = tmp_words

        tmp_path = os.path.join(os.path.join(os.path.join(parent_dir, 'mid_large_cap_stocks'), 
                 day), 'threshold_{}/'.format(top_p[0]))

        res_news_df_dict[day], \
        res_ugc_df_dict[day] = get_news_ugc_data(day, tmp_path, unique_stocks_d)

        print('\n')

2020-10-02
Found large change stocks: {'HSBC', 'LSCC'}
(2, 10)
HSBC: News dataset shape: (2, 2)
HSBC: UGC dataset shape: (30, 42)
LSCC: News dataset shape: (0, 2)
LSCC: UGC dataset shape: (30, 34)


2020-10-05
Found large change stocks: {'EPR', 'LSCC', 'MYOK', 'REGN', 'AGIO', 'AQN', 'ZION', 'ROK', 'PBR', 'WIT', 'MRVL', 'STLD', 'UMC'}
(13, 10)
AGIO: News dataset shape: (3, 2)
AGIO: UGC dataset shape: (30, 40)
AQN: News dataset shape: (0, 2)
AQN: UGC dataset shape: (30, 40)
EPR: News dataset shape: (9, 2)
EPR: UGC dataset shape: (61, 41)
LSCC: News dataset shape: (0, 2)
LSCC: UGC dataset shape: (30, 34)
MRVL: News dataset shape: (5, 2)
MRVL: UGC dataset shape: (36, 41)
MYOK: News dataset shape: (36, 2)
MYOK: UGC dataset shape: (114, 42)
PBR: News dataset shape: (0, 0)
PBR: UGC dataset shape: (30, 43)
REGN: News dataset shape: (14, 2)
REGN: UGC dataset shape: (428, 43)
ROK: News dataset shape: (12, 2)
ROK: UGC dataset shape: (32, 34)
STLD: News dataset shape: (0, 2)
STLD: UGC dataset shap

#### Word frequency

In [12]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from collections import Counter
from nltk.stem import WordNetLemmatizer

nlp = spacy.load('en_core_web_sm')

del_words.extend(['$', '\n', '\n\n', '|'])

In [13]:
day = dates_filename_list[0]
s = sorted(list(unique_stocks_d[day]))[0]
print(day, s)

stocks_key_words_dict[s]

2020-10-02 HSBC


['hsbc']

In [14]:
'''
    Word frequency:
        News
    
        UGC
    
'''
# news
tmp_news_title = res_news_df_dict[day][s]['title'].apply(lambda x: nlp(x))

news_words = [token.text for sent in tmp_news_title for token in sent if not token.is_stop and not token.is_punct and not token.is_space]
news_words = [w_lemmatizer.lemmatize(w.lower()) for w in news_words if w.lower() not in del_words]

news_word_freq = Counter(news_words)

# UGC
tmp_ugc_data = res_ugc_df_dict[day][s]['body'].apply(lambda x: nlp(x))

ugc_words = [token.text for sent in tmp_ugc_data for token in sent if not token.is_stop and not token.is_punct and not token.is_space]
ugc_words = [w_lemmatizer.lemmatize(w.lower()) for w in ugc_words if w.lower() not in del_words]

ugc_word_freq = Counter(ugc_words)

In [15]:
news_word_freq.most_common(20)

[('hsbc', 3),
 ('bank', 3),
 ('market', 2),
 ('nyse', 1),
 ('currently', 1),
 ('99.45', 1),
 ('52-week', 1),
 ('high', 1),
 ('upside', 1),
 ('potential', 1),
 ('surprise', 1),
 ('marketing', 1),
 ('sentinel', 1),
 ('oman', 1),
 ('card', 1),
 ('payment', 1),
 ('report-', 1),
 ('growth', 1),
 ('opportunity', 1),
 ('company', 1)]

In [16]:
ugc_word_freq.most_common(20)

[('hsbc', 37),
 ('pm', 11),
 ('adr', 7),
 ('short', 6),
 ('stock', 6),
 ('form', 4),
 ('6-k', 4),
 ('filed', 4),
 ('sentiment', 4),
 ('close', 3),
 ('li', 3),
 ('15', 3),
 ('jpm', 3),
 ('negative', 3),
 ('social', 3),
 ('medium', 3),
 ('https://socialsentiment.io/stocks/symbol/hsbc/', 3),
 ('week', 3),
 ('+', 3),
 ('soon', 2)]

### PoS tagging

In [17]:
news_words_noun_verb_adj = [(token.text, token.pos_) for sent in tmp_news_title for token in sent \
 if not token.is_stop and not token.is_punct and not token.is_space \
 and token.pos_ in ('VERB', 'NOUN', 'ADJ')]
news_words_noun_verb_adj = [(w_lemmatizer.lemmatize(w.lower()), t) for w, t in news_words_noun_verb_adj \
                           if w.lower() not in del_words]


In [18]:
Counter(news_words_noun_verb_adj).most_common(20)

[(('high', 'ADJ'), 1),
 (('surprise', 'VERB'), 1),
 (('marketing', 'NOUN'), 1),
 (('sentinel', 'NOUN'), 1),
 (('market', 'NOUN'), 1)]