In [None]:
import pandas as pd
import numpy as np
import os
import json

from tqdm import tqdm

In [None]:
# Colab-only helper used to attach Google Drive to this runtime.
from google.colab import drive

# Mount Google Drive at /content/drive; the project paths defined below live under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Root of the project on the mounted Google Drive.
# NOTE: absolute, user-specific path - change this single constant to run the notebook elsewhere.
PATH_PROJECT = '''/content/drive/MyDrive/UCSD/02_Courses_UCSD/04_Fall_Term_2023/01_Advanced_Data_Mining/DSC_250_PROJECT'''
PATH_DATA = os.path.join(PATH_PROJECT, 'datasets')

# Tweet folders; each one is expected to hold one sub-folder per ticker.
PATH_DATA_TW_RAW = os.path.join(PATH_DATA, 'tweet/raw')
PATH_DATA_TW_PREP = os.path.join(PATH_DATA, 'tweet/preprocessed')

# Directory that stores the dataframes generated by this notebook.
# makedirs(exist_ok=True) also creates missing parent folders and is a no-op
# when the directory already exists, so no separate existence check is needed.
PATH_TWEET_DF = os.path.join(PATH_DATA, 'tweet_dfs')
os.makedirs(PATH_TWEET_DF, exist_ok=True)

# 1) Unify Tweet Documents into one File

In [None]:
def get_unified_data(PATH_TW_DATA, ticker):
    '''
    Builds a single DataFrame with the records of every file stored for a ticker.

    Parameters
    ----------
    PATH_TW_DATA : str
        Directory that contains one sub-folder per ticker.
    ticker : str
        Name of the ticker sub-folder to read.

    Returns
    -------
    pd.DataFrame
        One row per record returned by `read_data`, concatenated over all the
        files found in the ticker folder.
    '''
    ticker_dir = os.path.join(PATH_TW_DATA, ticker)
    # os.listdir returns entries in arbitrary order; sort them so the row
    # order of the resulting frame is reproducible between runs.
    list_files = sorted(os.listdir(ticker_dir))

    full_data = []
    for file in tqdm(list_files, position=0):
        # Read from the same directory that was listed. Using a fixed global
        # folder here would load the wrong files for any other PATH_TW_DATA.
        file_path = os.path.join(ticker_dir, file)
        full_data.extend(read_data(file_path))
    return pd.DataFrame(full_data)


def read_data(file_path):
    '''
    Reads a file that stores one JSON object per line and returns its records
    as a list of dictionaries.

    Every record gets two extra keys derived from the path:
    'ticker' (name of the folder that contains the file) and
    'date_file' (name of the file itself).

    Parameters
    ----------
    file_path : str
        Path of the file to read, laid out as <...>/<ticker>/<date_file>.

    Returns
    -------
    list of dict
        One dictionary per non-blank line of the file.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    '''
    # Take the last two path components with os.path instead of splitting on '/'.
    parent_dir, date_file = os.path.split(file_path)
    ticker = os.path.basename(parent_dir)

    data_file = []
    # Binary read + explicit UTF-8 decode keeps the result independent of the
    # platform's default text encoding.
    with open(file_path, 'rb') as file:
        for line in file:
            decoded_line = line.decode('utf-8').strip()
            if not decoded_line:
                # Blank lines carry no record and would make json.loads fail.
                continue
            dictionary = json.loads(decoded_line)
            dictionary['ticker'], dictionary['date_file'] = ticker, date_file
            data_file.append(dictionary)

    return data_file

In [None]:
# Every sub-folder of the raw tweet directory is treated as one ticker.
# Keep directories only (a stray file would break the per-ticker listing later)
# and sort, because os.listdir returns entries in arbitrary order.
tickers = sorted(
    entry for entry in os.listdir(PATH_DATA_TW_RAW)
    if os.path.isdir(os.path.join(PATH_DATA_TW_RAW, entry))
)

In [None]:
# For each ticker: unify the preprocessed and the raw tweet files into one
# frame each, then persist both as CSV in PATH_TWEET_DF.
for ticker in tqdm(tickers):
    print(f'Processing {ticker}')

    # Both frames are built before anything is written.
    preprocessed_data = get_unified_data(PATH_DATA_TW_PREP, ticker)
    raw_data = get_unified_data(PATH_DATA_TW_RAW, ticker)

    # Files are named <ticker>_prep.csv and <ticker>_raw.csv.
    for frame, suffix in ((preprocessed_data, 'prep'), (raw_data, 'raw')):
        csv_path = os.path.join(PATH_TWEET_DF, f'{ticker}_{suffix}.csv')
        frame.to_csv(csv_path, index=False)

  0%|          | 0/87 [00:00<?, ?it/s]

Processing AAPL


100%|██████████| 696/696 [02:25<00:00,  4.78it/s]
100%|██████████| 784/784 [00:05<00:00, 136.79it/s]
  1%|          | 1/87 [02:41<3:51:20, 161.40s/it]

Processing AGFS


100%|██████████| 2/2 [00:00<00:00,  4.29it/s]
100%|██████████| 7/7 [00:01<00:00,  6.39it/s]
  2%|▏         | 2/87 [02:43<1:35:29, 67.41s/it] 

Processing AEP


100%|██████████| 142/142 [00:31<00:00,  4.56it/s]
100%|██████████| 165/165 [00:01<00:00, 154.09it/s]
  3%|▎         | 3/87 [03:15<1:11:57, 51.40s/it]

Processing ABBV


100%|██████████| 382/382 [01:27<00:00,  4.39it/s]
100%|██████████| 428/428 [00:01<00:00, 250.39it/s]
  5%|▍         | 4/87 [04:44<1:31:47, 66.36s/it]

Processing ABB


100%|██████████| 71/71 [00:16<00:00,  4.40it/s]
100%|██████████| 89/89 [00:03<00:00, 24.02it/s]
  6%|▌         | 5/87 [05:04<1:07:48, 49.62s/it]

Processing AMGN


100%|██████████| 398/398 [01:23<00:00,  4.76it/s]
100%|██████████| 450/450 [00:01<00:00, 260.89it/s]
  7%|▋         | 6/87 [06:30<1:23:30, 61.86s/it]

Processing AMZN


100%|██████████| 671/671 [02:19<00:00,  4.79it/s]
100%|██████████| 759/759 [00:04<00:00, 183.58it/s]
  8%|▊         | 7/87 [08:58<2:00:03, 90.04s/it]

Processing BABA


100%|██████████| 446/446 [01:33<00:00,  4.78it/s]
100%|██████████| 521/521 [00:03<00:00, 153.24it/s]
  9%|▉         | 8/87 [10:36<2:01:58, 92.64s/it]

Processing BA


100%|██████████| 441/441 [01:36<00:00,  4.57it/s]
100%|██████████| 505/505 [00:01<00:00, 255.70it/s]
 10%|█         | 9/87 [12:15<2:03:07, 94.71s/it]

Processing BRK-A


100%|██████████| 13/13 [00:03<00:00,  4.29it/s]
100%|██████████| 13/13 [00:00<00:00, 320.23it/s]
 11%|█▏        | 10/87 [12:18<1:25:15, 66.44s/it]

Processing BP


100%|██████████| 336/336 [01:08<00:00,  4.87it/s]
100%|██████████| 388/388 [00:01<00:00, 214.84it/s]
 13%|█▎        | 11/87 [13:30<1:25:58, 67.88s/it]

Processing C


100%|██████████| 618/618 [02:07<00:00,  4.84it/s]
100%|██████████| 691/691 [00:03<00:00, 221.39it/s]
 14%|█▍        | 12/87 [15:41<1:49:06, 87.29s/it]

Processing BUD


100%|██████████| 127/127 [00:25<00:00,  4.98it/s]
100%|██████████| 156/156 [00:01<00:00, 155.93it/s]
 15%|█▍        | 13/87 [16:08<1:25:02, 68.96s/it]

Processing BCH


100%|██████████| 9/9 [00:01<00:00,  4.96it/s]
100%|██████████| 11/11 [00:00<00:00, 26.97it/s]
 16%|█▌        | 14/87 [16:10<59:24, 48.82s/it]  

Processing BBL


100%|██████████| 75/75 [00:15<00:00,  4.70it/s]
100%|██████████| 82/82 [00:01<00:00, 52.75it/s]
 17%|█▋        | 15/87 [16:28<47:17, 39.40s/it]

Processing BSAC


100%|██████████| 14/14 [00:02<00:00,  5.07it/s]
100%|██████████| 18/18 [00:00<00:00, 23.05it/s]
 18%|█▊        | 16/87 [16:31<33:52, 28.63s/it]

Processing BAC


100%|██████████| 588/588 [02:04<00:00,  4.73it/s]
100%|██████████| 665/665 [00:02<00:00, 263.98it/s]
 20%|█▉        | 17/87 [18:39<1:08:09, 58.43s/it]

Processing BHP


100%|██████████| 230/230 [00:49<00:00,  4.65it/s]
100%|██████████| 270/270 [00:01<00:00, 238.74it/s]
 21%|██        | 18/87 [19:30<1:04:36, 56.18s/it]

Processing DHR


100%|██████████| 120/120 [00:24<00:00,  4.93it/s]
100%|██████████| 140/140 [00:00<00:00, 260.05it/s]
 22%|██▏       | 19/87 [19:55<53:02, 46.80s/it]  

Processing DUK


100%|██████████| 160/160 [00:33<00:00,  4.75it/s]
100%|██████████| 194/194 [00:00<00:00, 230.37it/s]
 23%|██▎       | 20/87 [20:30<48:12, 43.17s/it]

Processing CMCSA


100%|██████████| 347/347 [01:11<00:00,  4.88it/s]
100%|██████████| 388/388 [00:01<00:00, 278.01it/s]
 24%|██▍       | 21/87 [21:43<57:20, 52.13s/it]

Processing CHL


100%|██████████| 127/127 [00:26<00:00,  4.85it/s]
100%|██████████| 140/140 [00:00<00:00, 215.08it/s]
 25%|██▌       | 22/87 [22:10<48:19, 44.61s/it]

Processing CVX


100%|██████████| 462/462 [01:35<00:00,  4.84it/s]
100%|██████████| 530/530 [00:02<00:00, 253.84it/s]
 26%|██▋       | 23/87 [23:48<1:04:38, 60.60s/it]

Processing CHTR


100%|██████████| 185/185 [00:38<00:00,  4.80it/s]
100%|██████████| 205/205 [00:01<00:00, 196.44it/s]
 28%|██▊       | 24/87 [24:28<57:04, 54.35s/it]  

Processing CODI


100%|██████████| 32/32 [00:06<00:00,  5.06it/s]
100%|██████████| 33/33 [00:00<00:00, 145.12it/s]
 29%|██▊       | 25/87 [24:34<41:21, 40.03s/it]

Processing DIS


100%|██████████| 512/512 [01:44<00:00,  4.90it/s]
100%|██████████| 587/587 [00:02<00:00, 220.82it/s]
 30%|██▉       | 26/87 [26:23<1:01:32, 60.52s/it]

Processing CAT


100%|██████████| 428/428 [01:29<00:00,  4.80it/s]
100%|██████████| 489/489 [00:01<00:00, 305.46it/s]
 31%|███       | 27/87 [27:54<1:09:47, 69.80s/it]

Processing EXC


100%|██████████| 172/172 [00:35<00:00,  4.87it/s]
100%|██████████| 204/204 [00:00<00:00, 207.93it/s]
 32%|███▏      | 28/87 [28:30<58:47, 59.78s/it]  

Processing CSCO


100%|██████████| 514/514 [01:48<00:00,  4.76it/s]
100%|██████████| 582/582 [00:02<00:00, 199.53it/s]
 33%|███▎      | 29/87 [30:22<1:12:53, 75.41s/it]

Processing D


100%|██████████| 638/638 [02:12<00:00,  4.80it/s]
100%|██████████| 726/726 [00:04<00:00, 179.14it/s]
 34%|███▍      | 30/87 [32:41<1:29:48, 94.54s/it]

Processing HD


100%|██████████| 402/402 [01:25<00:00,  4.69it/s]
100%|██████████| 459/459 [00:01<00:00, 265.08it/s]
 36%|███▌      | 31/87 [34:09<1:26:22, 92.55s/it]

Processing HRG


100%|██████████| 33/33 [00:07<00:00,  4.67it/s]
100%|██████████| 34/34 [00:00<00:00, 78.41it/s]
 37%|███▋      | 32/87 [34:17<1:01:28, 67.06s/it]

Processing LMT


100%|██████████| 267/267 [00:53<00:00,  4.99it/s]
100%|██████████| 299/299 [00:01<00:00, 286.22it/s]
 38%|███▊      | 33/87 [35:12<57:01, 63.36s/it]  

Processing HSBC


100%|██████████| 82/82 [00:17<00:00,  4.75it/s]
100%|██████████| 105/105 [00:00<00:00, 168.92it/s]
 39%|███▉      | 34/87 [35:30<43:57, 49.76s/it]

Processing INTC


100%|██████████| 566/566 [01:59<00:00,  4.75it/s]
100%|██████████| 637/637 [00:03<00:00, 173.79it/s]
 40%|████      | 35/87 [37:33<1:02:18, 71.90s/it]

Processing FB


100%|██████████| 692/692 [02:27<00:00,  4.71it/s]
100%|██████████| 780/780 [00:04<00:00, 160.28it/s]
 41%|████▏     | 36/87 [40:11<1:23:06, 97.77s/it]

Processing JNJ


100%|██████████| 498/498 [01:39<00:00,  4.99it/s]
100%|██████████| 561/561 [00:04<00:00, 116.30it/s]
 43%|████▎     | 37/87 [41:57<1:23:20, 100.01s/it]

Processing GD


100%|██████████| 207/207 [00:42<00:00,  4.91it/s]
100%|██████████| 230/230 [00:00<00:00, 282.48it/s]
 44%|████▎     | 38/87 [42:40<1:07:44, 82.94s/it] 

Processing GOOG


100%|██████████| 680/680 [02:23<00:00,  4.75it/s]
100%|██████████| 767/767 [00:04<00:00, 189.31it/s]
 45%|████▍     | 39/87 [45:09<1:22:22, 102.97s/it]

Processing JPM


100%|██████████| 548/548 [01:51<00:00,  4.91it/s]
100%|██████████| 624/624 [00:03<00:00, 157.18it/s]
 46%|████▌     | 40/87 [47:06<1:23:50, 107.03s/it]

Processing HON


100%|██████████| 181/181 [00:37<00:00,  4.86it/s]
100%|██████████| 208/208 [00:01<00:00, 208.00it/s]
 47%|████▋     | 41/87 [47:44<1:06:17, 86.47s/it] 

Processing GE


100%|██████████| 540/540 [01:51<00:00,  4.84it/s]
100%|██████████| 611/611 [00:02<00:00, 257.96it/s]
 48%|████▊     | 42/87 [49:39<1:11:11, 94.93s/it]

Processing MA


100%|██████████| 369/369 [01:16<00:00,  4.82it/s]
100%|██████████| 413/413 [00:01<00:00, 258.45it/s]
 49%|████▉     | 43/87 [50:58<1:06:01, 90.04s/it]

Processing KO


100%|██████████| 480/480 [01:38<00:00,  4.87it/s]
100%|██████████| 549/549 [00:02<00:00, 237.15it/s]
 51%|█████     | 44/87 [52:39<1:07:03, 93.56s/it]

Processing IEP


100%|██████████| 68/68 [00:13<00:00,  5.02it/s]
100%|██████████| 79/79 [00:02<00:00, 34.17it/s]
 52%|█████▏    | 45/87 [52:55<49:12, 70.29s/it]  

Processing ORCL


100%|██████████| 360/360 [01:14<00:00,  4.81it/s]
100%|██████████| 395/395 [00:01<00:00, 249.13it/s]
 53%|█████▎    | 46/87 [54:12<49:22, 72.24s/it]

Processing PCG


100%|██████████| 140/140 [00:30<00:00,  4.57it/s]
100%|██████████| 156/156 [00:00<00:00, 246.02it/s]
 54%|█████▍    | 47/87 [54:44<40:00, 60.01s/it]

Processing PFE


100%|██████████| 465/465 [01:35<00:00,  4.86it/s]
100%|██████████| 528/528 [00:02<00:00, 243.76it/s]
 55%|█████▌    | 48/87 [56:23<46:35, 71.68s/it]

Processing PPL


100%|██████████| 128/128 [00:27<00:00,  4.68it/s]
100%|██████████| 150/150 [00:00<00:00, 194.58it/s]
 56%|█████▋    | 49/87 [56:51<37:09, 58.67s/it]

Processing MO


100%|██████████| 309/309 [01:06<00:00,  4.66it/s]
100%|██████████| 357/357 [00:01<00:00, 187.74it/s]
 57%|█████▋    | 50/87 [58:00<38:01, 61.65s/it]

Processing PEP


100%|██████████| 300/300 [01:00<00:00,  4.97it/s]
100%|██████████| 337/337 [00:01<00:00, 258.20it/s]
 59%|█████▊    | 51/87 [59:02<37:03, 61.75s/it]

Processing MDT


100%|██████████| 227/227 [00:47<00:00,  4.79it/s]
100%|██████████| 258/258 [00:00<00:00, 282.38it/s]
 60%|█████▉    | 52/87 [59:50<33:43, 57.80s/it]

Processing PTR


100%|██████████| 47/47 [00:09<00:00,  4.87it/s]
100%|██████████| 54/54 [00:01<00:00, 33.80it/s]
 61%|██████    | 53/87 [1:00:01<24:51, 43.86s/it]

Processing SRE


100%|██████████| 92/92 [00:19<00:00,  4.71it/s]
100%|██████████| 104/104 [00:00<00:00, 222.00it/s]
 62%|██████▏   | 54/87 [1:00:22<20:12, 36.74s/it]

Processing MSFT


100%|██████████| 640/640 [02:11<00:00,  4.87it/s]
100%|██████████| 724/724 [00:03<00:00, 221.67it/s]
 63%|██████▎   | 55/87 [1:02:38<35:28, 66.51s/it]

Processing MCD


100%|██████████| 491/491 [01:43<00:00,  4.73it/s]
100%|██████████| 560/560 [00:02<00:00, 241.29it/s]
 64%|██████▍   | 56/87 [1:04:24<40:37, 78.62s/it]

Processing SNY


100%|██████████| 171/171 [00:36<00:00,  4.65it/s]
100%|██████████| 196/196 [00:00<00:00, 232.22it/s]
 66%|██████▌   | 57/87 [1:05:02<33:12, 66.42s/it]

Processing PM


100%|██████████| 187/187 [00:37<00:00,  4.92it/s]
100%|██████████| 223/223 [00:00<00:00, 270.90it/s]
 67%|██████▋   | 58/87 [1:05:41<28:07, 58.19s/it]

Processing REX


100%|██████████| 73/73 [00:14<00:00,  4.94it/s]
100%|██████████| 80/80 [00:01<00:00, 47.04it/s]
 68%|██████▊   | 59/87 [1:05:58<21:19, 45.71s/it]

Processing SO


100%|██████████| 202/202 [00:41<00:00,  4.88it/s]
100%|██████████| 233/233 [00:00<00:00, 260.49it/s]
 69%|██████▉   | 60/87 [1:06:40<20:08, 44.75s/it]

Processing MRK


100%|██████████| 421/421 [01:28<00:00,  4.78it/s]
100%|██████████| 478/478 [00:01<00:00, 268.67it/s]
 70%|███████   | 61/87 [1:08:11<25:18, 58.41s/it]

Processing MMM


100%|██████████| 312/312 [01:04<00:00,  4.87it/s]
100%|██████████| 353/353 [00:01<00:00, 253.19it/s]
 71%|███████▏  | 62/87 [1:09:16<25:15, 60.60s/it]

Processing SNP


100%|██████████| 30/30 [00:06<00:00,  4.45it/s]
100%|██████████| 31/31 [00:00<00:00, 140.43it/s]
 72%|███████▏  | 63/87 [1:09:24<17:49, 44.55s/it]

Processing NVS


100%|██████████| 204/204 [00:40<00:00,  5.01it/s]
100%|██████████| 235/235 [00:01<00:00, 203.77it/s]
 74%|███████▎  | 64/87 [1:10:06<16:47, 43.80s/it]

Processing PICO


100%|██████████| 16/16 [00:03<00:00,  4.65it/s]
100%|██████████| 24/24 [00:01<00:00, 15.47it/s]
 75%|███████▍  | 65/87 [1:10:11<11:47, 32.18s/it]

Processing NGG


100%|██████████| 34/34 [00:06<00:00,  5.08it/s]
100%|██████████| 39/39 [00:00<00:00, 41.81it/s]
 76%|███████▌  | 66/87 [1:10:18<08:41, 24.84s/it]

Processing SLB


100%|██████████| 275/275 [00:56<00:00,  4.90it/s]
100%|██████████| 316/316 [00:01<00:00, 249.07it/s]
 77%|███████▋  | 67/87 [1:11:16<11:33, 34.68s/it]

Processing PG


100%|██████████| 379/379 [01:20<00:00,  4.70it/s]
100%|██████████| 431/431 [00:01<00:00, 249.16it/s]
 78%|███████▊  | 68/87 [1:12:39<15:33, 49.12s/it]

Processing SPLP


100%|██████████| 8/8 [00:01<00:00,  4.74it/s]
100%|██████████| 9/9 [00:00<00:00, 42.88it/s]
 79%|███████▉  | 69/87 [1:12:41<10:29, 34.97s/it]

Processing PCLN


100%|██████████| 572/572 [01:56<00:00,  4.91it/s]
100%|██████████| 632/632 [00:02<00:00, 260.65it/s]
 80%|████████  | 70/87 [1:14:41<17:07, 60.42s/it]

Processing RDS-B


100%|██████████| 3/3 [00:00<00:00,  5.27it/s]
100%|██████████| 3/3 [00:00<00:00, 247.13it/s]
 82%|████████▏ | 71/87 [1:14:41<11:19, 42.49s/it]

Processing NEE


100%|██████████| 158/158 [00:31<00:00,  4.98it/s]
100%|██████████| 183/183 [00:01<00:00, 138.37it/s]
 83%|████████▎ | 72/87 [1:15:14<09:55, 39.69s/it]

Processing WMT


100%|██████████| 485/485 [01:40<00:00,  4.83it/s]
100%|██████████| 560/560 [00:02<00:00, 211.62it/s]
 84%|████████▍ | 73/87 [1:16:58<13:44, 58.92s/it]

Processing VZ


100%|██████████| 461/461 [01:35<00:00,  4.81it/s]
100%|██████████| 521/521 [00:02<00:00, 217.96it/s]
 85%|████████▌ | 74/87 [1:18:37<15:21, 70.87s/it]

Processing V


100%|██████████| 484/484 [01:37<00:00,  4.94it/s]
100%|██████████| 558/558 [00:02<00:00, 224.10it/s]
 86%|████████▌ | 75/87 [1:20:18<15:59, 79.94s/it]

Processing UL


100%|██████████| 67/67 [00:12<00:00,  5.28it/s]
100%|██████████| 74/74 [00:01<00:00, 40.45it/s]
 87%|████████▋ | 76/87 [1:20:33<11:03, 60.35s/it]

Processing WFC


100%|██████████| 439/439 [01:27<00:00,  4.99it/s]
100%|██████████| 498/498 [00:01<00:00, 284.24it/s]
 89%|████████▊ | 77/87 [1:22:03<11:33, 69.31s/it]

Processing TOT


100%|██████████| 122/122 [00:24<00:00,  5.03it/s]
100%|██████████| 141/141 [00:00<00:00, 201.66it/s]
 90%|████████▉ | 78/87 [1:22:28<08:24, 56.05s/it]

Processing UNH


100%|██████████| 336/336 [01:06<00:00,  5.04it/s]
100%|██████████| 373/373 [00:01<00:00, 245.20it/s]
 91%|█████████ | 79/87 [1:23:37<07:58, 59.82s/it]

Processing TSM


100%|██████████| 106/106 [00:22<00:00,  4.82it/s]
100%|██████████| 125/125 [00:00<00:00, 211.77it/s]
 92%|█████████▏| 80/87 [1:23:59<05:40, 48.69s/it]

Processing UN


100%|██████████| 74/74 [00:15<00:00,  4.92it/s]
100%|██████████| 92/92 [00:03<00:00, 23.74it/s]
 93%|█████████▎| 81/87 [1:24:18<03:58, 39.80s/it]

Processing UTX


100%|██████████| 267/267 [00:53<00:00,  5.01it/s]
100%|██████████| 307/307 [00:01<00:00, 213.35it/s]
 94%|█████████▍| 82/87 [1:25:13<03:41, 44.38s/it]

Processing UPS


100%|██████████| 237/237 [00:47<00:00,  4.95it/s]
100%|██████████| 263/263 [00:01<00:00, 223.16it/s]
 95%|█████████▌| 83/87 [1:26:03<03:03, 45.87s/it]

Processing T


100%|██████████| 684/684 [02:20<00:00,  4.88it/s]
100%|██████████| 771/771 [00:04<00:00, 191.51it/s]
 97%|█████████▋| 84/87 [1:28:30<03:48, 76.15s/it]

Processing TM


100%|██████████| 140/140 [00:30<00:00,  4.65it/s]
100%|██████████| 161/161 [00:00<00:00, 227.87it/s]
 98%|█████████▊| 85/87 [1:29:01<02:05, 62.62s/it]

Processing XOM


100%|██████████| 520/520 [01:52<00:00,  4.64it/s]
100%|██████████| 600/600 [00:02<00:00, 235.07it/s]
 99%|█████████▉| 86/87 [1:30:56<01:18, 78.42s/it]

Processing CELG


100%|██████████| 476/476 [01:43<00:00,  4.60it/s]
100%|██████████| 541/541 [00:02<00:00, 221.85it/s]
100%|██████████| 87/87 [1:32:43<00:00, 63.94s/it]


In [None]:
# Frame left in the namespace by the ticker loop above (it holds whichever ticker was processed last).
preprocessed_data

Unnamed: 0,created_at,id,id_str,text,source,truncated,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,...,lang,ticker,date_file,retweeted_status,is_quote_status,timestamp_ms,extended_entities,quoted_status_id,quoted_status_id_str,quoted_status
0,Sat May 31 07:48:05 +0000 2014,472645678166855680,472645678166855680,$AAPL Apple's Longtime Public Relations VP Ret...,"<a href=""http://dlvr.it"" rel=""nofollow"">dlvr.i...",False,,,,,...,en,AAPL,2014-05-31,,,,,,,
1,Sat May 31 19:36:43 +0000 2014,472824011584704512,472824011584704512,RT @firstadopter: 60% of Top 25 Tech Companies...,"<a href=""http://twitter.com/download/android"" ...",False,,,,,...,en,AAPL,2014-05-31,{'created_at': 'Sat May 31 19:04:55 +0000 2014...,,,,,,
2,Sat May 31 08:24:25 +0000 2014,472654821749567488,472654821749567488,"$AAPL stock content, charts, analysis, &amp; m...","<a href=""http://tweetadder.com"" rel=""nofollow""...",False,,,,,...,en,AAPL,2014-05-31,,,,,,,
3,Sat May 31 12:25:35 +0000 2014,472715513311682560,472715513311682560,$AAPL Wolff: The unlikely marriage of Apple an...,"<a href=""http://dlvr.it"" rel=""nofollow"">dlvr.i...",False,,,,,...,en,AAPL,2014-05-31,,,,,,,
4,Sat May 31 01:35:29 +0000 2014,472551910298226688,472551910298226688,Will #Samsung's S5 Success Hurt #Apple Inc? ($...,"<a href=""https://dev.twitter.com/docs/tfw"" rel...",False,,,,,...,en,AAPL,2014-05-31,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18360,Mon May 18 15:22:58 +0000 2015,600320682006695936,600320682006695936,RT @paulwoll: $AAPL heading to the 130.50-130....,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",False,,,,,...,en,AAPL,2015-05-18,{'created_at': 'Mon May 18 15:22:07 +0000 2015...,,1431962578666,,,,
18361,Mon May 18 15:26:46 +0000 2015,600321638282858496,600321638282858496,RT @TheStreetTech: New Steve Jobs Movie Traile...,"<a href=""http://twitter.com/download/iphone"" r...",False,,,,,...,en,AAPL,2015-05-18,{'created_at': 'Mon May 18 13:40:15 +0000 2015...,,1431962806660,"{'media': [{'id': 600294830342742017, 'id_str'...",,,
18362,Mon May 18 16:22:33 +0000 2015,600335676643508224,600335676643508224,RT @CNNMoney: Mark your calendars for the new ...,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",False,,,,,...,en,AAPL,2015-05-18,{'created_at': 'Mon May 18 15:03:06 +0000 2015...,,1431966153666,"{'media': [{'id': 600308377370124288, 'id_str'...",,,
18363,Mon May 18 11:51:51 +0000 2015,600267552728752128,600267552728752128,David Einhorn Still Focused On Tech While Esca...,"<a href=""http://www.apple.com"" rel=""nofollow"">...",False,,,,,...,en,AAPL,2015-05-18,,,1431949911659,,,,


In [None]:
# First ten tweet texts of the preprocessed frame, as a plain Python list.
preprocessed_data['text'].head(10).tolist()

["$AAPL Apple's Longtime Public Relations VP Retires http://t.co/muTwAhL8OL",
 'RT @firstadopter: 60% of Top 25 Tech Companies Founded by 1st &amp; 2nd generation Americans (Immigrants) -&gt; $AAPL $GOOG $ORCL $AMZN $EBAY http…',
 '$AAPL stock content, charts, analysis, &amp; more  -  Free  -  http://t.co/Z1XoDlzgGq',
 '$AAPL Wolff: The unlikely marriage of Apple and Beats http://t.co/a8md7deBnU',
 "Will #Samsung's S5 Success Hurt #Apple Inc? ($SSNLF): http://t.co/d03td3n7x0 via @themotleyfool $AAPL",
 "Apple Inc.'s World Wide Developer Conference: 3 Must-Watch Storylines $AAPL http://t.co/4E0vSOaDCL",
 'Looking for the next $A $WLP $GRPN $AAPL #money http://t.co/JdU4Hop897',
 'RT @philstockworld: Long-Term Portfolio Review: 29 Trades in 6 Months - Up 19% - Details $AAPL $ABX $BTU $CLF $EBAY $LULU $RIG $TWTR  @ htt…',
 '@antonwahlman Your post on http://t.co/4tuWFfgV0y about the new #Chromebase was great! $MSFT &amp; $AAPL better watch out!',
 'RT @philstockworld: 11 Trade Ideas Today:

In [None]:
# First ten tweet texts of the raw frame, as a plain Python list.
raw_data['text'].head(10).tolist()

["RT @bespokeinvest: Apple ($AAPL) has now lost $58 bil in market cap since Tuesday's open. 419 stocks in the S&amp;P 500 have market caps smalle…",
 '$AAPL - Why Harman International Industries Inc. Stock Hit a Sour Note Today http://t.co/XMY1nzT9Gl',
 'What will the $DOW do tomorrow? http://t.co/RGFWb0kA5h $GOOG $AAPL $FB $EBAY #options #stocks #daytrading http://t.co/VleAV95Omg',
 'Think the value in $GOOG $GOOGL  $AAPL and $AMZN  for there cloud value. If $CRM is at $74',
 'APPLE Technicals - Daily Levels $AAPL \u200b\u200b\u200b#aapl http://t.co/BqTxpjCzDc',
 '$AAPL trading like the street just found out Andy Zaky was starting another hedge fund...',
 'I’d love to think the WSJ story edit happened this AM when $AAPL hit $126.65 &amp; started moving back up, but . .  https://t.co/P3QOrzhjbK',
 "$AAPL is still bleeding due to the buyback. It's highly possible we'll see $94 before $188. #Apple #Investing",
 'RT @Quantanamo: Apple Watch fans, it may be time for that tattoo to go. Remo

# 2) Preprocess Text data

In [None]:
# Folder that holds the unified per-ticker CSV files.
PATH_TWEET_DF

'/content/drive/MyDrive/UCSD/02_Courses_UCSD/04_Fall_Term_2023/01_Advanced_Data_Mining/DSC_250_PROJECT/datasets/tweet_dfs'

In [None]:
# Ticker symbols taken from the file names in price/raw (presumably one file per ticker).
# splitext removes only the final extension, so a symbol that itself contains a
# dot is kept whole instead of being cut at its first dot.
# NOTE: this rebinds `tickers`, replacing the list built from the tweet folders earlier.
tickers = [
    os.path.splitext(x)[0]
    for x in os.listdir(os.path.join(PATH_DATA, 'price', 'raw'))
]

In [None]:
# Sanity check: number of tickers found in the price folder.
len(tickers)

88

NameError: ignored