In [1]:

#for text pre-processing
import re, string
%pip install nltk
# If running in VS Code, keep the % prefix on the line above; if running in Jupyter, delete the line above.


import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

import csv
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


# SVM ML
from sklearn import svm
from sklearn.model_selection import GridSearchCV

[33mDEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
pip list


Package                            Version
---------------------------------- --------------------
absl-py                            1.1.0
acnportal                          0.3.3
aiohttp                            3.8.4
aiosignal                          1.3.1
alabaster                          0.7.12
alpha-vantage                      2.3.1
anaconda-client                    1.9.0
anaconda-navigator                 2.1.1
anaconda-project                   0.10.1
anyio                              2.2.0
appdirs                            1.4.4
applaunchservices                  0.2.1
appnope                            0.1.2
appscript                          1.1.2
argh                               0.26.2
argon2-cffi                        20.1.0
arrow                              0.13.1
asn1crypto                         1.4.0
astroid                            2.6.6
astropy                            4.3.1
astunparse                         1.6.3
async-generator                    

In [None]:
# Load the dataset from CSV; the first CSV column is used as the DataFrame index.
stock_raw = pd.read_csv('sentiment_stock_data.csv',index_col=[0])

In [None]:
# Class distribution: number of rows for each distinct 'Sentiment' label
sentiment_labels = stock_raw['Sentiment']
value_counts = sentiment_labels.value_counts()
value_counts

In [None]:
# Number of words in a news
def count_unique_words(sentence):
    """Return how many distinct whitespace-separated tokens ``sentence`` contains.

    Tokens are compared exactly (case-sensitive, punctuation kept).
    Any value that is not a ``str`` (for example NaN from a missing cell)
    counts as 0.
    """
    # Guard clause: non-text values contribute no words
    if not isinstance(sentence, str):
        return 0

    distinct_tokens = set(sentence.split())
    return len(distinct_tokens)

# Count the unique words in each row of the 'Sentence' column and store the
# counts in a new 'Vocabulary_Count' column on stock_raw.
stock_raw['Vocabulary_Count'] = stock_raw['Sentence'].apply(count_unique_words)

# Preview the new column with the notebook's rich table display (last expression
# of the cell) instead of print(), which only yields a plain-text dump.
stock_raw[['Sentence', 'Vocabulary_Count']].head()

In [None]:

# Summary statistics (mean, median, max, min) of the per-sentence unique-word count
vocabulary_counts = stock_raw['Vocabulary_Count']
mean_count, median_count = vocabulary_counts.mean(), vocabulary_counts.median()
max_count, min_count = vocabulary_counts.max(), vocabulary_counts.min()

# Report each statistic on its own line
print(
    f"Mean word count: {mean_count}",
    f"Median word count: {median_count}",
    f"Maximum word count: {max_count}",
    f"Minimum word count: {min_count}",
    sep="\n",
)


In [None]:
## Text pre-processing

## Simple text cleaning processes
# 1. Clean missing values

# Count the missing (NaN) entries in each column of the raw data
stock_raw.isna().sum()


In [None]:
# Drop every row that contains a missing value, producing a new DataFrame and
# leaving stock_raw untouched (so this cell can be re-run safely).
# The trailing .copy() makes stock_cleaned an independent frame, so adding
# columns to it later does not trigger pandas' SettingWithCopyWarning.
stock_cleaned = stock_raw.dropna().copy()

# To check for missing values again (every count should now be 0):
missing_values_after_cleanup = stock_cleaned.isna().sum()
missing_values_after_cleanup

In [None]:
# 2. convert to lowercase, strip and remove punctuations


testing_text="   This is a message to be cleaned. It may involve some things like: <br>, ?, :, ''  adjacent spaces and tabs."

def preprocess(text):
    """Normalise raw text to lowercase words separated by single spaces.

    Steps, in order: lowercase, remove HTML-like tags, replace ASCII
    punctuation with spaces, drop any remaining non-word symbols, replace
    digits with spaces, collapse whitespace runs, and trim both ends.

    Args:
        text: Value to clean. Anything that is not a ``str`` yields ``""``.

    Returns:
        The cleaned string, with no leading or trailing whitespace.
    """
    if not isinstance(text, str):
        return ""  # Return empty string if text is not a string (e.g. NaN)

    text = text.lower()
    text = re.sub(r'<.*?>', '', text)  # strip tags such as <br>
    # ASCII punctuation becomes a space so that "a,b" stays two words
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    # Symbols outside string.punctuation (e.g. typographic quotes) are deleted
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d', ' ', text)
    # Collapse and trim last: the digit replacement above can leave spaces at
    # either end (e.g. "Sales up 10%"), which an earlier strip would miss.
    return re.sub(r'\s+', ' ', text).strip()

preprocess(testing_text)

In [None]:
## Remove stopwords
nltk.download('stopwords')

def stopword(string):
    """Remove English stopwords from a whitespace-separated string.

    Tokens are compared exactly against NLTK's English stopword list, so the
    comparison is case-sensitive; pass already-lowercased text if capitalised
    stopwords should be removed too.

    Args:
        string: Text whose tokens are separated by whitespace.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Load the list once per call and use a set for O(1) membership tests,
    # instead of re-reading the corpus list for every single token.
    english_stopwords = set(stopwords.words('english'))
    return ' '.join(token for token in string.split() if token not in english_stopwords)

stopword(testing_text)


In [None]:
## Lemmatization 


# Downloading necessary NLTK data.
# word_tokenize needs the Punkt tokenizer, nltk.pos_tag needs the perceptron
# tagger, and WordNetLemmatizer needs the WordNet corpus; without them a fresh
# environment raises LookupError the first time the functions below are called.
# The '*_tab' / '*_eng' names are the identifiers used by newer NLTK releases;
# NLTK versions that do not know an identifier just report it and continue.
for nltk_resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
    'omw-1.4',
):
    nltk.download(nltk_resource)

# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to convert NLTK's POS tags to WordNet's format
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    Only the first letter of the tag is inspected: J -> adjective, V -> verb,
    N -> noun, R -> adverb. Every other tag (including an empty one) falls
    back to noun.
    """
    first_letter_to_constant = {
        'J': 'ADJ',
        'V': 'VERB',
        'N': 'NOUN',
        'R': 'ADV',
    }
    constant_name = first_letter_to_constant.get(treebank_tag[:1], 'NOUN')
    return getattr(wordnet, constant_name)

# Function to lemmatize a sentence with POS tagging
def lemmatize_sentence_with_pos(sentence):
    """Lemmatize every token of ``sentence`` using its part-of-speech tag.

    The sentence is tokenized with ``word_tokenize``, tagged with
    ``nltk.pos_tag``, and each token is lemmatized with the WordNet POS
    derived from its tag. The lemmas are returned joined by single spaces.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(sentence))

    lemmas = []
    for token, treebank_tag in tagged_tokens:
        wordnet_pos = get_wordnet_pos(treebank_tag)
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))

    return ' '.join(lemmas)

# Lemmatizing the test text with POS tagging
lemmatized_text = lemmatize_sentence_with_pos(testing_text)
print(lemmatized_text)


In [None]:
## Final pre-processing



#FINAL PREPROCESSING
# def finalpreprocess(string):
#     return lemmatizer(stopword(preprocess(string)))


def finalpreprocess(string):
    """Run the full cleaning pipeline on one piece of text.

    Applies, innermost first: ``preprocess`` (basic text cleaning),
    ``stopword`` (stopword removal) and ``lemmatize_sentence_with_pos``
    (POS-aware lemmatization), and returns the lemmatized string.
    """
    return lemmatize_sentence_with_pos(stopword(preprocess(string)))


In [None]:
# Run the full cleaning pipeline on every sentence and keep the result in a new column
stock_cleaned['clean_Sentence'] = stock_cleaned['Sentence'].apply(finalpreprocess)
stock_cleaned.head()

## Word Embedding/Vectorization


In [None]:
"""Here use World2Vec because it better in capturing semantic info compared with 
BoW and TF-IDF, also this dataset is large enough for effective training."""

# I should've put in it in the very beginning
!pip install -U gensim
from gensim.models import Word2Vec


# Step 1: Tokenize the sentences (assuming sentences are already cleaned and are separated by spaces)
tokenized_sentences = [sentence.split() for sentence in stock_cleaned['clean_Sentence']]

# Step 2: Train the Word2Vec model
model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)

# Step 3: Function to vectorize a sentence based on the Word2Vec model
def vectorize_sentence(sentence, model):
    """Represent a tokenized sentence as the mean of its word vectors.

    Args:
        sentence: Iterable of tokens.
        model: Object exposing ``wv`` (supports ``token in wv`` and
            ``wv[token]``) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv``, or a zero vector of length ``model.vector_size`` when
        none of the tokens is known.
    """
    known_vectors = []
    for token in sentence:
        if token in model.wv:
            known_vectors.append(model.wv[token])

    if known_vectors:
        return np.mean(known_vectors, axis=0)
    # No token is in the vocabulary (or the sentence is empty)
    return np.zeros(model.vector_size)

# Step 4: Build a copy of the cleaned data with one averaged Word2Vec vector per sentence
stock_vector = stock_cleaned.assign(
    sentence_vector=lambda frame: frame['clean_Sentence'].apply(
        lambda cleaned: vectorize_sentence(cleaned.split(), model)
    )
)



In [None]:
# Check the first few rows of the dataframe to confirm the 'sentence_vector' column exists

stock_vector.head()
# Also I have upgraded in another notebook

In [None]:
## Also here I would like to try TF-IDF vectorization 

# stock_cleaned_tfidf = stock_cleaned.copy()

# # Initialize vectorization
# tfidf_vectorizer = TfidfVectorizer()

# # Create TF-IDF features
# tfidf_matrix = tfidf_vectorizer.fit_transform(stock_cleaned_tfidf['clean_Sentence'])

# # Get feature names for the columns
# try:
#     # Try using the newer attribute available from version 0.24
#     feature_names = tfidf_vectorizer.get_feature_names_out()
# except AttributeError:
#     # Fallback for older versions
#     feature_names = tfidf_vectorizer.get_feature_names()

# # Create a DataFrame with the TF-IDF features
# tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# # Concatenate the original DataFrame with the new TF-IDF features
# stock_cleaned_tfidf.reset_index(drop=True, inplace=True)
# stock_cleaned_tfidf = pd.concat([stock_cleaned_tfidf, tfidf_df], axis=1)



"""
My kernel always dies when I run the TF-IDF code above,
and I have not yet switched to a more powerful desktop machine,
so I will skip TF-IDF at this stage.
"""

## Split training data

In [None]:
# Split the sentence vectors into training data and validation data.
# Each entry of 'sentence_vector' is a 1-D array, so the column is an object
# Series; scikit-learn estimators need a 2-D (n_samples, vector_size) numeric
# matrix, so stack the per-sentence arrays row by row before splitting.
sentence_matrix = np.vstack(stock_vector["sentence_vector"].to_list())

# random_state makes the shuffle (and therefore the split) reproducible.
X_train, X_val, y_train, y_val = train_test_split(sentence_matrix,
                                                  stock_vector["Sentiment"],
                                                  test_size=0.2,
                                                  shuffle=True,
                                                  random_state=42)

In [None]:
X_train

# Seem it has been successfully transformed to vector

## ML Modelling 

In [None]:
# SVM

# Base support-vector classifier that the grid search will tune.
# probability=True asks SVC to provide probability estimates;
# NOTE(review): per the scikit-learn docs this slows down fitting — confirm the
# probabilities are actually needed downstream.
model_svm = svm.SVC(probability=True)


In [None]:
# Setup grid search parameters.
# 'gamma' only affects the rbf/poly/sigmoid kernels; the linear kernel ignores it.
# The linear kernel therefore gets its own sub-grid, so identical linear models
# are not refitted once per gamma value (80 candidates instead of 100).
# Consequence: when the linear kernel wins, best_params_ contains no 'gamma' key.
param_grid = [
    {
        'kernel': ['rbf', 'poly', 'sigmoid'],
        'C': [0.1, 1, 10, 100, 1000],
        'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    },
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10, 100, 1000],
    },
]
grid_search_svm = GridSearchCV(model_svm, param_grid, refit=True, verbose=2, cv=5, n_jobs=-1)

In [None]:
# Training and tuning hyperparameters: run the grid search on the training split
# (X_train features, y_train sentiment labels).
grid_search_svm.fit(X_train, y_train)