## Setup

### Import Libraries

In [1]:
# Import necessary libraries
import re
import pandas as pd
from urllib.parse import urlparse
import nltk
import chardet

# Import NLTK components
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Download NLTK resources
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

# Configure pandas to display the full width of text columns
pd.set_option('display.max_colwidth', None)

### Utility Functions

In [2]:
def get_base_url(url):
    """
    Extract the base domain from a given URL, removing 'www.' if it starts with it.
    """
    # Check if the input is a string to ensure it's a valid URL
    if isinstance(url, str):
        parsed_url = urlparse(url)  # Parse the URL to separate components
        base_url = parsed_url.netloc  # Extract the network location part (base domain)

        # Remove 'www.' if the base_url starts with it
        if base_url.startswith('www.'):
            base_url = base_url[4:]

        return base_url
    else:
        # Return None if input is not a string (invalid URL)
        return None

In [8]:
# Define patterns for cleaning text
username_pattern = r'@[A-Za-z0-9_]+'
url_pattern = r'https?://[^\s]+'
special_chars_and_numbers_pattern = r'[^a-zA-Z\s]'  # Keep only letters and spaces
combined_cleaning_pattern = r'|'.join((username_pattern, url_pattern, special_chars_and_numbers_pattern))

# Extended and financial-context-specific stopwords
custom_stopwords = {'rt', 'ep', 'gt', 'lt', 'amp'}
extended_stopwords = set(stopwords.words('english')) | custom_stopwords

# Initialize Porter Stemmer for word stemming
stemmer = PorterStemmer()

def clean_text(text_column):
    """
    Function to clean and preprocess a list of text entries, tailored for financial sentiment analysis.
    """
    cleaned_texts = []
    
    for text in text_column:
        # Remove usernames, URLs, and non-alphabetical characters
        text = re.sub(combined_cleaning_pattern, ' ', text)
        # Convert to lowercase
        text = text.lower()
        # Tokenize text
        words = word_tokenize(text)
        # Stem and remove stopwords
        cleaned_words = [stemmer.stem(word) for word in words if word not in extended_stopwords]
        # Rejoin words into the cleaned text
        cleaned_text = ' '.join(cleaned_words)
        cleaned_texts.append(cleaned_text)
    
    return cleaned_texts

## Data Loading

### Variable definitions

- **DS1**: Financial Tweets
    - https://www.kaggle.com/datasets/davidwallach/financial-tweets
- **DS2**: Financial Phrasebank
    - https://www.kaggle.com/datasets/ankurzing/sentiment-analysis-for-financial-news
- **DS3**: Financial Sentiment Analysis
    - https://www.kaggle.com/datasets/sbhatti/financial-sentiment-analysis
    - FiQA and Financial Phrasebank combined
- **DS4**: Stock Tweets
    - Surge AI
- **DS5**: Cryptocurrency Reddit Commetns
    - Surge AI

In [9]:
# Load data and labels for dataset DS1
data_ds1 = pd.read_csv('../Datasets/DS1/Data.csv', encoding='utf-8')
labels_ds1 = pd.read_csv('../Datasets/DS1/Labels.csv', encoding='utf-8')

# Combine data and labels into a single DataFrame for DS1
data_ds1 = pd.concat([data_ds1, labels_ds1['sentiment']], axis=1)

# Detect the encoding of the dataset to handle text data properly.
with open('../Datasets/DS2/Data.csv', 'rb') as file:
    encoding_detection = chardet.detect(file.read(10000))

# Load datasets from DS2, DS3, DS4, and DS5 with detected and default encodings
data_ds2 = pd.read_csv('../Datasets/DS2/Data.csv', encoding=encoding_detection['encoding'])
data_ds3 = pd.read_csv('../Datasets/DS3/Data.csv', encoding='utf-8')
data_ds4 = pd.read_csv('../Datasets/DS4/Data.csv', encoding='utf-8')
data_ds5 = pd.read_csv('../Datasets/DS5/Data.csv', encoding='utf-8')

In [10]:
print("===================== DS1 Info =====================")
print(data_ds1.info())

print("\n===================== DS2 Info =====================")
print(data_ds2.info())

print("\n===================== DS3 Info =====================")
print(data_ds3.info())

print("\n===================== DS4 Info =====================")
print(data_ds4.info())

print("\n===================== DS5 Info =====================")
print(data_ds5.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28440 entries, 0 to 28439
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             28440 non-null  object
 1   text           28440 non-null  object
 2   timestamp      28438 non-null  object
 3   source         28437 non-null  object
 4   symbols        28437 non-null  object
 5   company_names  28435 non-null  object
 6   url            22049 non-null  object
 7   verified       28436 non-null  object
 8   sentiment      28440 non-null  int64 
dtypes: int64(1), object(8)
memory usage: 2.0+ MB
None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4845 entries, 0 to 4844
Data columns (total 2 columns):
 #   Column                                                                                                                           Non-Null Count  Dtype 
---  ------                                                                                         

## Data Formatting

In [11]:
# Standardize column names for DS3, DS2, and set initial mappings for DS3 and DS2
standard_column_names = ['text', 'sentiment']
data_ds3.columns = standard_column_names
data_ds2.columns = ['sentiment', 'text']  # Original order in DS2
data_ds2 = data_ds2.reindex(columns=standard_column_names)
data_ds4.rename(columns={'Sentiment': 'sentiment'}, inplace=True)
data_ds5.rename(columns={'Sentiment': 'sentiment'}, inplace=True)
data_ds4.rename(columns={'Tweet Text': 'text'}, inplace=True)
data_ds4.rename(columns={'Tweet URL': 'url'}, inplace=True)
data_ds5.rename(columns={'Comment Text': 'text'}, inplace=True)
data_ds5.rename(columns={'URL': 'url'}, inplace=True)

# Map sentiment labels to numerical values for DS2, DS3, DS4, DS5
sentiment_mapping = {'positive': 1, 'negative': -1, 'neutral': 0, 'Positive': 1, 'Negative': -1, 'Neutral': 0}
data_ds2['sentiment'] = data_ds2['sentiment'].map(sentiment_mapping).astype('int64')
data_ds3['sentiment'] = data_ds3['sentiment'].map(sentiment_mapping).astype('int64')
data_ds4['sentiment'] = data_ds4['sentiment'].map(sentiment_mapping).astype('int64')
data_ds5['sentiment'] = data_ds5['sentiment'].map(sentiment_mapping).astype('int64')

# Remove rows from data_ds3 that are present in data_ds2
updated_ds3 = pd.merge(data_ds3, data_ds2, on=['text', 'sentiment'], how='outer', indicator=True)
updated_ds3 = updated_ds3[updated_ds3['_merge'] == 'left_only']
updated_ds3.drop(columns=['_merge'], inplace=True)
data_ds3 = updated_ds3

# Drop rows with missing values in all datasets
data_ds1.dropna(how='any', inplace=True)
data_ds2.dropna(how='any', inplace=True)
data_ds3.dropna(how='any', inplace=True)
data_ds4.dropna(how='any', inplace=True)
data_ds5.dropna(how='any', inplace=True)

In [12]:
print("===================== DS1 Info =====================")
print(data_ds1.info())

print("\n===================== DS2 Info =====================")
print(data_ds2.info())

print("\n===================== DS3 Info =====================")
print(data_ds3.info())

print("\n===================== DS4 Info =====================")
print(data_ds4.info())

print("\n===================== DS5 Info =====================")
print(data_ds5.info())

<class 'pandas.core.frame.DataFrame'>
Index: 22048 entries, 0 to 28439
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             22048 non-null  object
 1   text           22048 non-null  object
 2   timestamp      22048 non-null  object
 3   source         22048 non-null  object
 4   symbols        22048 non-null  object
 5   company_names  22048 non-null  object
 6   url            22048 non-null  object
 7   verified       22048 non-null  object
 8   sentiment      22048 non-null  int64 
dtypes: int64(1), object(8)
memory usage: 1.7+ MB
None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4845 entries, 0 to 4844
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       4845 non-null   object
 1   sentiment  4845 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 75.8+ KB
None

<class 'pandas.core.frame.DataFrame'>
Index: 1728

# Dataset Formation and Cleaning

## Full Data Cleaning and Stemming

### Dataset Full

In [13]:
# Define the columns to keep
columns_to_keep = ['text', 'sentiment']

# Correct the approach to drop columns without using inplace=True
data_ds1_full = data_ds1.drop(data_ds1.columns.difference(columns_to_keep), axis=1)
data_ds4_full = data_ds4.drop(data_ds4.columns.difference(columns_to_keep), axis=1)
data_ds5_full = data_ds5.drop(data_ds5.columns.difference(columns_to_keep), axis=1)

# Combine all datasets into a single DataFrame
combined_data_full = pd.concat([data_ds1_full, data_ds2, data_ds3, data_ds4_full, data_ds5_full], ignore_index=True)

print("Number of duplicates (before cleaning): ", str(combined_data_full.duplicated('text').sum()))
combined_data_full.drop_duplicates(subset=['text'], inplace=True)

combined_data_full['text'] = clean_text(combined_data_full['text'])

print("\nNumber of NAs: \n", str(combined_data_full.isna().sum()))
combined_data_full.dropna(inplace=True)

combined_data_full.reset_index(drop=True, inplace=True)

Number of duplicates (before cleaning):  1215

Number of NAs: 
 text         0
sentiment    0
dtype: int64


In [14]:
print(combined_data_full.info())

display(combined_data_full.sample(5))

print("\nNumber of NAs: \n", str(combined_data_full.isna().sum()))

print("\nNumber of duplicates (after cleaning): ", str(combined_data_full.duplicated('text').sum()))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28468 entries, 0 to 28467
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       28468 non-null  object
 1   sentiment  28468 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 444.9+ KB
None


Unnamed: 0,text,sentiment
10459,kimberli clark kmb rate reiter jefferi financi group,0
9259,trq turquois hill resourc million gain trq made notabl gain,1
219,critic review gentex gntx versu delphi technolog dlph,-1
14037,zack brokerag anticip kellogg k post quarterli sale billion,0
3021,analyst expect jacob engin group inc jec announc quarterli sale billion,0



Number of NAs: 
 text         0
sentiment    0
dtype: int64

Number of duplicates (after cleaning):  4349


In [15]:
# combined_data_full.to_feather('../Data/Full_Data.feather')

### Dataset 145

In [16]:
# Define the columns to keep
columns_to_keep = ['text', 'sentiment', 'url']

# Correct the approach to drop columns without using inplace=True
data_ds1_145 = data_ds1.drop(data_ds1.columns.difference(columns_to_keep), axis=1)
data_ds4_145 = data_ds4.drop(data_ds4.columns.difference(columns_to_keep), axis=1)
data_ds5_145 = data_ds5.drop(data_ds5.columns.difference(columns_to_keep), axis=1)

# Combine all datasets into a single DataFrame
combined_data_145 = pd.concat([data_ds1_145, data_ds4_145, data_ds5_145], ignore_index=True)

print("\nNumber of 'text' duplicates (before cleaning): ", str(combined_data_145.duplicated('text').sum()))
combined_data_145.drop_duplicates(subset=['text'], inplace=True)

combined_data_145['text'] = clean_text(combined_data_145['text'])

combined_data_145['base_url'] = combined_data_145['url'].apply(get_base_url)
combined_data_145 = combined_data_145.dropna(subset=['base_url'])
combined_data_145.drop(inplace=True, columns=['url'], axis=1)

print("Number of NAs: \n", str(combined_data_145.isna().sum()))
combined_data_145.dropna(how='any', inplace=True)

combined_data_145.reset_index(drop=True, inplace=True)


Number of 'text' duplicates (before cleaning):  693
Number of NAs: 
 text         0
sentiment    0
base_url     0
dtype: int64


In [23]:
print(combined_data_145.info())

display(combined_data_145.sample(5))

print("\nNumber of NAs: \n", str(combined_data_145.isna().sum()))

print("\nNumber of 'text' duplicates (after cleaning): " + str(combined_data_145.duplicated('text').sum()))

display(combined_data_145['sentiment'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22417 entries, 0 to 22416
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       22417 non-null  object
 1   sentiment  22417 non-null  int64 
 2   base_url   22417 non-null  object
dtypes: int64(1), object(2)
memory usage: 525.5+ KB
None


Unnamed: 0,text,sentiment,base_url
11172,citigroup increas tractor suppli tsco price target,0,thelincolnianonline.com
12108,bullish big mac bullish occur pm jul semiconductor corp want ea,0,twitter.com
17333,bullish chart format exxon mobil corpor xom form bullish wedg chart pattern,0,whatsonthorold.com
8467,signal sign market maker send move stock articl bdr,0,falconstocks.com
2148,billion sale expect zimmer biomet hold inc zbh quarter,0,zpr.io



Number of NAs: 
 text         0
sentiment    0
base_url     0
dtype: int64

Number of 'text' duplicates (after cleaning): 4157


sentiment
 0    13025
 1     6930
-1     2462
Name: count, dtype: int64

In [18]:
# combined_data_145.to_feather('../Data/Data145.feather')

### Dataset 1

In [19]:
columns_to_keep = ['text', 'sentiment', 'company_names', 'url']

# Drop all columns except 'text' and 'sentiment'
data_ds1_1 = data_ds1.drop(data_ds1.columns.difference(columns_to_keep), axis=1)

print("Number of 'text' duplicates: ", str(data_ds1_1.duplicated('text').sum()))
data_ds1_1.drop_duplicates(subset=['text'], inplace=True)

data_ds1_1['text'] = clean_text(data_ds1_1['text'])
data_ds1_1['company_names'] = clean_text(data_ds1_1['company_names'])

data_ds1_1['base_url'] = data_ds1_1['url'].apply(get_base_url)

data_ds1_1 = data_ds1_1.dropna(subset=['base_url'])
data_ds1_1 = data_ds1_1.drop(columns=['url'], axis=1)

combined_data_1 = data_ds1_1

print("\nNumber of NAs: \n", str(combined_data_1.isna().sum()))
combined_data_1.dropna(how='any', inplace=True)

combined_data_1.reset_index(drop=True, inplace=True)

Number of 'text' duplicates:  683

Number of NAs: 
 text             0
company_names    0
sentiment        0
base_url         0
dtype: int64


In [22]:
print(combined_data_1.info())

display(combined_data_1.sample(5))

print("\nNumber of NAs: \n", str(combined_data_1.isna().sum()))

print("\nNumber of duplicates (after cleaning): " + str(combined_data_1.duplicated('text').sum()))

display(combined_data_1['sentiment'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21365 entries, 0 to 21364
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   text           21365 non-null  object
 1   company_names  21365 non-null  object
 2   sentiment      21365 non-null  int64 
 3   base_url       21365 non-null  object
dtypes: int64(1), object(3)
memory usage: 667.8+ KB
None


Unnamed: 0,text,company_names,sentiment,base_url
3706,best day year today trade thx option trade ggal tsla mnkd psmt,marathon petroleum corpor,1,optionsbypros.com
3499,black sheep china loopr walton bytom bytom sinc decemb l via,loew corpor,0,youtu.be
11713,bullish unusu option activ detect wynn,wynn resort,0,optionsonar.com
20406,yet compar behemoth yore aapl peak market share mere market domin,gener motor compani,1,twitter.com
8934,aep american electr power compani inc nyse aep price target rais morgan stanley stock,american electr power compani,0,marketexclusive.com



Number of NAs: 
 text             0
company_names    0
sentiment        0
base_url         0
dtype: int64

Number of duplicates (after cleaning): 4149


sentiment
 0    13025
 1     6307
-1     2033
Name: count, dtype: int64

In [24]:
# combined_data_1.to_feather('../Data/Data1.feather')

## No Data Cleaning and Stemming

### Dataset Full

In [22]:
# Define the columns to keep
columns_to_keep = ['text', 'sentiment']

# Correct the approach to drop columns without using inplace=True
data_ds1_full = data_ds1.drop(data_ds1.columns.difference(columns_to_keep), axis=1)
data_ds4_full = data_ds4.drop(data_ds4.columns.difference(columns_to_keep), axis=1)
data_ds5_full = data_ds5.drop(data_ds5.columns.difference(columns_to_keep), axis=1)

# Combine all datasets into a single DataFrame
combined_data_full = pd.concat([data_ds1_full, data_ds2, data_ds3, data_ds4_full, data_ds5_full], ignore_index=True)

print("Number of duplicates: ", str(combined_data_full.duplicated('text').sum()))
combined_data_full.drop_duplicates(subset=['text'], inplace=True)

print("\nNumber of NAs: \n", str(combined_data_full.isna().sum()))
combined_data_full.dropna(inplace=True)

combined_data_full.reset_index(drop=True, inplace=True)

Number of duplicates:  1215

Number of NAs: 
 text         0
sentiment    0
dtype: int64


In [23]:
print(combined_data_full.info())

display(combined_data_full.sample(5))

print("\nNumber of NAs: \n", str(combined_data_full.isna().sum()))

print("\nNumber of duplicates: ", str(combined_data_full.duplicated('text').sum()))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28468 entries, 0 to 28467
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       28468 non-null  object
 1   sentiment  28468 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 444.9+ KB
None


Unnamed: 0,text,sentiment
16409,REPORT: MGM Resorts sues 1000 victims of Las Vegas shooting in effort to avoid #liability $MGM https://t.co/ROItGf3gi3,-1
9231,Credit Suisse Reaffirms O'Reilly Automotive $ORLY As a Neutral; They Now Have a Price Target Of $274 - https://t.co/0cZJaP5m8B,1
15735,RT @evanalsop4864: Huge coin selection low fees and very reliable! Join Binance! 🙌 Register for BONUS ➡️ https://t.co/sYjWfR1e8D $ARD…,1
16127,Gold prices drop on Jerome Powell testimony. https://t.co/7sMrL9j8Wy via @YouTube $cl $es $nq $SVXY $HD $bac $AAPL… https://t.co/7Z47oj60Dk,-1
1968,It Seems Robert Half International Inc $RHI Will Go Up. Just Reported Less Shorted Shares https://t.co/MqZi2Nu5kS,1



Number of NAs: 
 text         0
sentiment    0
dtype: int64

Number of duplicates:  0


In [18]:
# combined_data_full.to_feather('../Data/Full_Data_NoClean.feather')

### Dataset 145

In [25]:
# Define the columns to keep
columns_to_keep = ['text', 'sentiment', 'url']

# Correct the approach to drop columns without using inplace=True
data_ds1_145 = data_ds1.drop(data_ds1.columns.difference(columns_to_keep), axis=1)
data_ds4_145 = data_ds4.drop(data_ds4.columns.difference(columns_to_keep), axis=1)
data_ds5_145 = data_ds5.drop(data_ds5.columns.difference(columns_to_keep), axis=1)

# Combine all datasets into a single DataFrame
combined_data_145 = pd.concat([data_ds1_145, data_ds4_145, data_ds5_145], ignore_index=True)

print("\nNumber of 'text' duplicates: ", str(combined_data_145.duplicated('text').sum()))
combined_data_145.drop_duplicates(subset=['text'], inplace=True)

combined_data_145['base_url'] = combined_data_145['url'].apply(get_base_url)
combined_data_145 = combined_data_145.dropna(subset=['base_url'])
combined_data_145.drop(inplace=True, columns=['url'], axis=1)

print("\nNumber of NAs: \n", str(combined_data_145.isna().sum()))
combined_data_145.dropna(how='any', inplace=True)

combined_data_145.reset_index(drop=True, inplace=True)


Number of 'text' duplicates:  693

Number of NAs: 
 text         0
sentiment    0
base_url     0
dtype: int64


In [26]:
print(combined_data_145.info())

display(combined_data_145.sample(5))

print("\nNumber of NAs: \n", str(combined_data_145.isna().sum()))

print("\nNumber of duplicates: " + str(combined_data_145.duplicated('text').sum()))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22417 entries, 0 to 22416
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       22417 non-null  object
 1   sentiment  22417 non-null  int64 
 2   base_url   22417 non-null  object
dtypes: int64(1), object(2)
memory usage: 525.5+ KB
None


Unnamed: 0,text,sentiment,base_url
11018,NTT Docomo $DCM &amp; Motorola Solutions $MSI Head-To-Head Comparison https://t.co/iN7PfrLUpB,0,weekherald.com
5355,Assurant Inc. Declares Quarterly Dividend of $0.56 $AIZ https://t.co/O8DoMU1cdi,0,zpr.io
2874,$MMM $AFL $ABBV $ABT $APD $AOS $ADM $T $ADP $BDX $CAH $CVX $CINF $CTAS $CLX $KO $CL $ED $DOV $ECL $EMR $XOM $FRT https://t.co/QC3M6nFHeW,0,seekingalpha.com
17988,"Pfizer responded appropriately to the White House call."" https://t.co/Frn2lezk7n $PFE",0,zerohedge.com
19601,Free chrome extension to hide tweets that over-shill crypto cash tags &amp; promote groups https://t.co/D6aOXBWQQj No… https://t.co/GDB5dJ4aXe,1,shillkill.com



Number of NAs: 
 text         0
sentiment    0
base_url     0
dtype: int64

Number of duplicates: 0


In [12]:
# combined_data_145.to_feather('../Data/Data145_NoClean.feather')

### Dataset 1

In [31]:
columns_to_keep = ['text', 'sentiment', 'company_names', 'url']

# Drop all columns except 'text' and 'sentiment'
data_ds1_1 = data_ds1.drop(data_ds1.columns.difference(columns_to_keep), axis=1)

print("Number of 'text' duplicates: ", str(data_ds1_1.duplicated('text').sum()))
data_ds1_1.drop_duplicates(subset=['text'], inplace=True)

data_ds1_1['base_url'] = data_ds1_1['url'].apply(get_base_url)

data_ds1_1 = data_ds1_1.dropna(subset=['base_url'])
data_ds1_1 = data_ds1_1.drop(columns=['url'], axis=1)

combined_data_1 = data_ds1_1

print("\nNumber of NAs: \n", str(combined_data_1.isna().sum()))
combined_data_1.dropna(how='any', inplace=True)

combined_data_1.reset_index(drop=True, inplace=True)

Number of 'text' duplicates:  683

Number of NAs: 
 text             0
company_names    0
sentiment        0
base_url         0
dtype: int64


In [33]:
print(combined_data_1.info())

display(combined_data_1.sample(5))

print("\nNumber of NAs: \n", str(combined_data_1.isna().sum()))

print("\nNumber of duplicates: " + str(combined_data_1.duplicated('text').sum()))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21365 entries, 0 to 21364
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   text           21365 non-null  object
 1   company_names  21365 non-null  object
 2   sentiment      21365 non-null  int64 
 3   base_url       21365 non-null  object
dtypes: int64(1), object(3)
memory usage: 667.8+ KB
None


Unnamed: 0,text,company_names,sentiment,base_url
11482,Mattel Sees Unusually Large Options Volume $MAT https://t.co/pUm1fO5CIM,Mattel,0,dailypolitical.com
13904,You can trade for as low as 0.05% fee on Binance! 🤑 Register for BONUS ➡️ https://t.co/WbsCd1Jgim $PBR $XIOS… https://t.co/KNUUkRcn9F,Transocean Ltd.,-1,binance.com
741,Varian Medical Systems Inc. $VAR Expected to Post Earnings of $1.00 Per Share https://t.co/c9IXOflm2G,Varian Medical Systems,1,mareainformativa.com
20448,$1.01 Earnings Per Share Expected for Waste Management Inc. $WM This Quarter https://t.co/7IdDfmdqp0,Waste Management,1,ledgergazette.com
3381,Insider Selling: AutoNation Inc. $AN Director Sells 42000 Shares of Stock https://t.co/GvSxegY7Pj,AutoNation,1,zpr.io



Number of NAs: 
 text             0
company_names    0
sentiment        0
base_url         0
dtype: int64

Number of duplicates: 0


In [None]:
# combined_data_1.to_feather('../Data/Data1_NoClean.feather')