In [47]:
import numpy as np
import pandas as pd
import nltk

### Importing the data

In [48]:
df = pd.read_csv("stock_data.csv")
df.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


### Text Preprocessing

In [49]:
import string
from nltk.corpus import stopwords

def text_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """
    # Check characters to see if they are in punctuation
    nopunc = [char for char in mess if char not in string.punctuation]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    
    # Now just remove any stopwords
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

### Tokenize the messages

using the function created above

In [50]:
df['Text'].head(5).apply(text_process)

0    [Kickers, watchlist, XIDE, TIT, SOQ, PNK, CPW,...
1    [user, AAP, MOVIE, 55, return, FEAGEED, indica...
2    [user, Id, afraid, short, AMZN, looking, like,...
3                                         [MNTA, 1200]
4                                           [OI, 2137]
Name: Text, dtype: object

### Vectorization

In [51]:
from sklearn.feature_extraction.text import CountVectorizer

In [52]:
transformer = CountVectorizer(analyzer=text_process).fit(df['Text'])

# Print total number of vocab words
print(len(transformer.vocabulary_))

13456


In [53]:
# testing on one message to see what is the output
temp = df['Text'][2]
print(temp)

temp_bow = transformer.transform([temp])
print(f'Vector Representation: \n{temp_bow}')
print(f'Shape: {temp_bow.shape}')

user I'd be afraid to short AMZN - they are looking like a near-monopoly in eBooks and infrastructure-as-a-service
Vector Representation: 
  (0, 2106)	1
  (0, 4240)	1
  (0, 6832)	1
  (0, 8292)	1
  (0, 10100)	1
  (0, 10437)	1
  (0, 10506)	1
  (0, 10853)	1
  (0, 12153)	1
  (0, 13068)	1
Shape: (1, 13456)


In [54]:
# transform the whole DataFrame of messages
df_bow = transformer.transform(df['Text'])

In [55]:
print('Shape of Sparse Matrix: ', df_bow.shape)
print('Amount of Non-Zero occurences: ', df_bow.nnz)

sparsity = (100.0 * df_bow.nnz / (df_bow.shape[0] * df_bow.shape[1]))
print('sparsity: {}'.format(round(sparsity)))

Shape of Sparse Matrix:  (5791, 13456)
Amount of Non-Zero occurences:  53890
sparsity: 0


### Weighting and Normalization with TF-IDF

In [56]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer().fit(df_bow)

In [57]:
df_tfidf = tfidf_transformer.transform(df_bow)
print(df_tfidf.shape)

(5791, 13456)


### Training the model (Using Naive Bayes Classifier)

In [58]:
from sklearn.model_selection import train_test_split
text_train, text_test, sentiment_train, sentiment_test = \
train_test_split(df['Text'], df['Sentiment'], test_size=0.2)

In [59]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report


pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [60]:
pipeline.fit(text_train, sentiment_train)

Pipeline(steps=[('bow',
                 CountVectorizer(analyzer=<function text_process at 0x000001FBD698C3A0>)),
                ('tfidf', TfidfTransformer()),
                ('classifier', MultinomialNB())])

In [61]:
predictions = pipeline.predict(text_test)
print(classification_report(predictions,sentiment_test))

              precision    recall  f1-score   support

          -1       0.31      0.82      0.45       157
           1       0.96      0.71      0.82      1002

    accuracy                           0.73      1159
   macro avg       0.64      0.77      0.63      1159
weighted avg       0.87      0.73      0.77      1159

