In [7]:
import requests
import pandas as pd
import numpy as np
import re
import nltk 
import matplotlib.pyplot as plt
%matplotlib inline
test = input("What would you like to know about today?  ")
stonks = ['AMD','AAPL','INTC']

In [8]:
url = 'https://newsapi.org/v2/everything?'

# Specify the query and number of returns
parameters = {
    'qInTitle': test, # query phrase
    'sortBy': 'popularity', # articles from popular sources and publishers come first
    'pageSize': 100,  # maximum is 100 for developer version
    'apiKey': '32dc1d81f85d44cd959b4428b0308bdd', # your own API key
}

# Make the request
response = requests.get(url, params=parameters)

# Convert the response to JSON format and store it in dataframe
data = pd.DataFrame(response.json())
news_df = pd.concat([data['articles'].apply(pd.Series)], axis=1)

# Select data
final_news = news_df.loc[:,['publishedAt','title']]

# Filter to within one week
final_news['publishedAt'] = pd.to_datetime(final_news['publishedAt'])
final_news['publishedAt'] = final_news['publishedAt'].apply(lambda x: x.replace(tzinfo=None)) #removes timezone
final_news = final_news[pd.to_datetime('now')-final_news['publishedAt']<=pd.to_timedelta(30, unit='d')]
final_news.sort_values(by='publishedAt',inplace=True)
final_news.head()


Unnamed: 0,publishedAt,title
32,2021-04-23 12:00:25,The Rumor of a Potential Partnership Between L...
24,2021-04-23 17:46:14,3 reasons to be bearish on Intel despite a new...
46,2021-04-26 10:00:00,When Do Third-Party Cookies End? Apple (AAPL) ...
52,2021-04-26 12:36:56,Apple (AAPL) Boosts U.S. Investment 20% Over F...
17,2021-04-26 16:42:07,Tim Cook reportedly told Mark Zuckerberg that ...


In [9]:
data = pd.read_csv("all-data.csv",delimiter=',',encoding='latin-1',header=None)
data.columns = ["label", "text"]
data.head()

Unnamed: 0,label,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [10]:
features = data.text.values
labels = data.label.values
processed_features = [] 
def process2(): 
    # alternatively could use list version
    #just different implementations u can consider, rly up to u
    features = data.text.values
    for text in features:
        processed_features.append(process1(text))
def process1(features):
    # Remove all the special characters
    processed_feature = re.sub(r'\W', ' ', str(features))
    # remove all single characters
    processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)
    # Remove single characters from the start
    processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature) 
    # Substituting multiple spaces with single space
    processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)
    # Removing prefixed 'b'
    processed_feature = re.sub(r'^b\s+', '', processed_feature)
    # Converting to Lowercase
    return processed_feature.lower()
data.text = data.text.apply(process1).tolist()

In [11]:
#this is for installing nltk stopwords
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

#nltk.download()

In [12]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

#Use TF-IDF to vectorize our words
#TF = freq of word in doc / total words in doc
#IDF = log(total # of docs/# of docs containing word)
vectorizer = TfidfVectorizer (max_features=2500, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))
#max_features specifies 2500 most frequent words
#max_df specifies words that occur in a max of 80% of docs
#min_df specifies words that occur in at least 7 docs
#stopwords are excluded, e.g. "it" or "am"
processed_features = vectorizer.fit_transform(data.text).toarray()

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(processed_features, labels, test_size=0.2, random_state=0)

In [14]:
from sklearn.ensemble import RandomForestClassifier

text_classifier = RandomForestClassifier(n_estimators=200, random_state=0)
text_classifier.fit(X_train, y_train)

RandomForestClassifier(n_estimators=200, random_state=0)

In [15]:
predictions = text_classifier.predict(X_test)

In [16]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print(accuracy_score(y_test, predictions))

[[ 45  59  24]
 [  6 540  29]
 [  8 128 131]]
              precision    recall  f1-score   support

    negative       0.76      0.35      0.48       128
     neutral       0.74      0.94      0.83       575
    positive       0.71      0.49      0.58       267

    accuracy                           0.74       970
   macro avg       0.74      0.59      0.63       970
weighted avg       0.74      0.74      0.72       970

0.7381443298969073


In [18]:
#instead, we can create new processed_features by concatenating the two datasets
#together and vectorizing
processed_features = vectorizer.fit_transform(pd.concat([data.text, final_news.title])).toarray()

In [19]:
#now this stuff becomes our modelling dataset
processed_features[:data.shape[0]]
#and this is our predicting dataset
processed_features[data.shape[0]:]

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.23456712, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [20]:
#X_train, X_test, y_train, y_test = train_test_split(processed_features[:data.shape[0]], labels, test_size=0.2, random_state=0)
X_train = processed_features[:data.shape[0]]
y_train = labels

In [21]:
text_classifier = RandomForestClassifier(n_estimators=200, random_state=0)
text_classifier.fit(X_train, y_train)

RandomForestClassifier(n_estimators=200, random_state=0)

In [22]:
predictions = text_classifier.predict(processed_features[data.shape[0]:])
predictions

array(['neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'positive',
       'neutral', 'positive', 'neutral', 'positive', 'neutral',
       'positive', 'neutral', 'neutral', 'positive', 'neutral', 'neutral',
       'neutral', 'neutral', 'neutral', 'positive', 'neutral', 'positive',
       'positive', 'neutral', 'positive', 'neutral', 'positive',
       'neutral', 'neutral', 'positive', 'neutral', 'neutral', 'neutral',
       'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral',
       'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral',
       'neutral', 'positive', 'neutral', 'neutral', 'neutral', 'neutral',
       'neutral'], dtype=object)