In [27]:
import nltk
import pandas as pd
import re
from collections import Counter
from nltk.draw import dispersion_plot
import matplotlib.pyplot as plt
import csv

In [2]:
articles = pd.read_csv('../data/us_equities_news_dataset.csv')
articles = articles[articles['content'].notna()]
articles

Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id
0,221515,NIO,Why Shares of Chinese Electric Car Maker NIO A...,news,What s happening\nShares of Chinese electric c...,2020-01-15,The Motley Fool,https://invst.ly/pigqi,2060327
1,221516,NIO,NIO only consumer gainer Workhorse Group amon...,news,Gainers NIO NYSE NIO 7 \nLosers MGP Ingr...,2020-01-18,Seeking Alpha,https://invst.ly/pje9c,2062196
2,221517,NIO,NIO leads consumer gainers Beyond Meat and Ma...,news,Gainers NIO NYSE NIO 14 Village Farms In...,2020-01-15,Seeking Alpha,https://invst.ly/pifmv,2060249
3,221518,NIO,NIO NVAX among premarket gainers,news,Cemtrex NASDAQ CETX 85 after FY results \n...,2020-01-15,Seeking Alpha,https://invst.ly/picu8,2060039
4,221519,NIO,PLUG NIO among premarket gainers,news,aTyr Pharma NASDAQ LIFE 63 on Kyorin Pharm...,2020-01-06,Seeking Alpha,https://seekingalpha.com/news/3529772-plug-nio...,2053096
...,...,...,...,...,...,...,...,...,...
221508,443024,T,Crude And Steel Still In Sync,opinion,We have been reporting on the trade off betwee...,2012-10-04,Ivan Kitov,https://www.investing.com/analysis/crude-and-s...,138733
221509,443025,T,Forget AT T This Is The Telecom Stock You Sho...,opinion,It s the largest cell phone provider in the wo...,2012-05-30,StreetAuthority,https://www.investing.com/analysis/forget-at-t...,124829
221510,443026,T,Wall Street Exposed Part 3 How Dividends C...,opinion,Before we dicuss how the mechanism of dividend...,2012-07-16,Portfolio Cafe,https://www.investing.com/analysis/wall-street...,129651
221511,443027,T,Weighing The Week Ahead It s All About Jobs,opinion,From start to finish the coming week will hav...,2012-09-02,Jeff Miller,https://www.investing.com/analysis/weighing-th...,134926


# Descriptive statistics
- How many documents
- How many words
- How much lexical variation in the texts
- Average sentence length
- Average document length
- 100 most common words
- 20 most indicative words per class
- Dispersion plot of 'Macbook', iPhone and Chromecast

## Section 1
- Documents
- Words
- Lexical variation
- Sentence Length
- Document length

In [3]:
#number of documents
nr_of_docu = articles['id'].nunique()

In [4]:
# Count words in each row
articles['word_count'] = articles['content'].apply(lambda x: len(x.split()))

#Sum the number of words of each row
total_word_count = articles['word_count'].sum()

In [5]:
#Create a set of the words, these are the unique words (these are split since it gives a memory error)

unique_words = set()

# Tokenize and count unique words in smaller chunks
chunk_size = 1000  # Adjust this value based on your available memory and DataFrame size

for i in range(0, len(articles), chunk_size):
    chunk = articles.loc[i:i+chunk_size-1]  # Get a chunk of the DataFrame
    combined_text = ' '.join(chunk['content'])  # Combine text from the chunk
    chunk_words = combined_text.split()  # Tokenize the chunk
    unique_words.update(chunk_words)  # Add unique words to the set

# Count the total number of unique words
unique_word_count = len(unique_words)

In [6]:
lexical_variation = unique_word_count/total_word_count

In [7]:
#Average document length
avg_doc_length = total_word_count/nr_of_docu

In [8]:
def count_sentences(text):
    sentences = re.split(r'[\n]+|[ ]{2,}(?=[A-Z][a-z])', text)
    return len(sentences)

# Apply the count_sentences function to the 'content' column and save the counts in a new column
articles['sentence_count'] = articles['content'].apply(count_sentences)

In [9]:
nr_of_sentences = articles['sentence_count'].sum()
avg_sentence_length = total_word_count/nr_of_sentences

In [10]:
avg_doc_length_sentence = nr_of_sentences/nr_of_docu

## Section 2: preprocessed data
- Common words
- Dispersion plot

In [11]:
# Common words filtering must be done on the preprocessed dataset
articles_prep = pd.read_csv('../data/preprocessed_article_data.csv')
articles_prep = articles_prep[articles_prep['content'].notna()]

In [12]:
# Combine all text from the 'content' column into a single string
combined_text_prep = ' '.join(articles_prep['content'])

# Tokenize the combined text into words
words_prep = combined_text_prep.split()

In [13]:
# Use Counter to count word frequencies
word_counts_prep = Counter(words_prep)

In [14]:
list_common_words = word_counts_prep.most_common(100)

In [15]:
# # Tokenize the content in each row and store it in a list
# tokenized_texts = [nltk.word_tokenize(text) for text in articles_prep['content']]
#
# # Create an NLTK Text object
# text = nltk.Text(word for tokens in tokenized_texts for word in tokens)
#
# # Create a dispersion plot for specific words
# target_words = ["Macbook", "iPhone", "Chromecast"]
# dispersion_plot(text, target_words, ignore_case=True)
#
# plt.show()

In [16]:
plt.savefig('dispersion_plot.png')

<Figure size 432x288 with 0 Axes>

In [22]:
data_variables = {
    "Number of documents:" : nr_of_docu,
    "Total number of words:": total_word_count,
    "Number unique words:": unique_word_count,
    "Lexical variation: ": round(lexical_variation, 5),
    "Average sentence length is ": [round(avg_sentence_length,2), "Words"],
    "Average document length ": [round(avg_doc_length,2), "Words"],
    "Average document length ": [round(avg_doc_length_sentence,2), "Sentences"],
}

dataf_variables = pd.DataFrame(data_variables)
dataf_variables.to_csv('Analysis statistics.csv', index = False)

In [28]:
output_file = 'common_words.csv'  # The name of the output CSV file

# Open the CSV file for writing
with open(output_file, 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)

    # Write the header row (if needed)
    # csv_writer.writerow(['Header1', 'Header2', ...])

    # Write the data from the list of tuples
    for item in list_common_words:
        csv_writer.writerow(item)

print(f'CSV data written to {output_file}')

CSV data written to common_words.csv


## Section 3: prediction data
- 20 most indicative words for each class (20 most common words for stock increase 1 or 0)

In [4]:
#Load the prediction dataset
articles_pred = pd.read_csv('../data/prediction_data.csv')

In [5]:
#Separate dataframes
df_stock_increase_1 = articles_pred.loc[articles_pred['stock_increase'] == 1, ['content']]
df_stock_increase_0 = articles_pred.loc[articles_pred['stock_increase'] == 0, ['content']]

In [6]:
#First for increase = 1

# Combine all text from the 'content' column into a single string
combined_text_pred_1 = ' '.join(df_stock_increase_1['content'])

# Tokenize the combined text into words
words_pred_1 = combined_text_pred_1.split()
# Use Counter to count word frequencies
word_counts_pred_1 = Counter(words_pred_1)
list_common_words_pred_1 = word_counts_pred_1.most_common(20)

In [7]:
#First for increase = 0

# Combine all text from the 'content' column into a single string
combined_text_pred_0 = ' '.join(df_stock_increase_0['content'])

# Tokenize the combined text into words
words_pred_0 = combined_text_pred_0.split()
# Use Counter to count word frequencies
word_counts_pred_0 = Counter(words_pred_0)
list_common_words_pred_0 = word_counts_pred_0.most_common(20)

# Convert all variables to a CSV file

In [9]:
data_variables_2 = {
    "20 most indicative words 1" : list_common_words_pred_1,
    "20 most indicative words 0" : list_common_words_pred_0,
}

In [10]:
dataf_variables_2 = pd.DataFrame(data_variables_2)
dataf_variables_2.to_csv('Analysis statistics_2.csv', index = False)