In [1]:

#for text pre-processing
import re, string
%pip install nltk
# if run in VS, use %; if run in Jupter, delete the line above


import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

import csv
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


# SVM ML
from sklearn import svm
from sklearn.model_selection import GridSearchCV

[33mDEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
stock_raw = pd.read_csv('sentiment_stock_data.csv',index_col=[0])

In [3]:
# Class distribution 
value_counts = stock_raw['Sentiment'].value_counts()
value_counts

1    55725
0    53026
Name: Sentiment, dtype: int64

In [4]:
# Number of words in a news
def count_unique_words(sentence):
    if isinstance(sentence, str):  # Check if the value is a string
        # Split the sentence into words, convert to a set to remove duplicates, and count
        return len(set(sentence.split()))
    else:
        # If the value is not a string, return 0
        return 0

# Count the unique words in each row of the 'Sentence' column and create a new column with the counts
stock_raw['Vocabulary_Count'] = stock_raw['Sentence'].apply(count_unique_words)

# Display the DataFrame with the new 'Vocabulary_Count' column
print(stock_raw[['Sentence', 'Vocabulary_Count']])

                                                 Sentence  Vocabulary_Count
0       According to Gran , the company has no plans t...                18
1       For the last quarter of 2010 , Componenta 's n...                30
2       In the third quarter of 2010 , net sales incre...                22
3       Operating profit rose to EUR 13.1 mn from EUR ...                20
4       Operating profit totalled EUR 21.1 mn , up fro...                18
...                                                   ...               ...
111290  Philippines president Rodrigo Duterte urges pe...                29
111291  Spain arrests three Pakistanis accused of prom...                 8
111292  Venezuela, where anger over food shortages is ...                39
111293  A Hindu temple worker has been killed by three...                42
111294  Ozone layer hole seems to be healing - US &amp...                49

[108751 rows x 2 columns]


In [5]:

# Calculate the mean, median, and max & min of the word count
mean_count = stock_raw['Vocabulary_Count'].mean()
median_count = stock_raw['Vocabulary_Count'].median()
max_count = stock_raw['Vocabulary_Count'].max()
min_count = stock_raw['Vocabulary_Count'].min()

# Display the calculated statistics
print(f"Mean word count: {mean_count}")
print(f"Median word count: {median_count}")
print(f"Maximum word count: {max_count}")
print(f"Minimum word count: {min_count}")


Mean word count: 11.909462901490562
Median word count: 9.0
Maximum word count: 53
Minimum word count: 0


## Text pre-processing

In [6]:
##  Text cleaning
# 1. Clean missing values 

stock_raw.isna().sum()


Sentiment           0
Sentence            1
Vocabulary_Count    0
dtype: int64

In [7]:
# To drop rows with N/A values and update the DataFrame in place:
stock_cleaned = stock_raw.copy()
stock_cleaned.dropna(inplace=True)

# To assign the result to the same or a new DataFrame without using inplace:
stock_cleaned = stock_cleaned.dropna()

# To check for missing values again:
missing_values_after_cleanup = stock_cleaned.isna().sum()
missing_values_after_cleanup

Sentiment           0
Sentence            0
Vocabulary_Count    0
dtype: int64

In [8]:
# 2. convert to lowercase, strip and remove punctuations


testing_text="   This is a message to be cleaned. It may involve some things like: <br>, ?, :, ''  adjacent spaces and tabs."

def preprocess(text):
    if not isinstance(text, str):
        return ""  # Return empty string if text is not a string

    text = text.lower() 
    text=text.strip()  
    text=re.compile('<.*?>').sub('', text) 
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)  
    text = re.sub('\s+', ' ', text)  
    text = re.sub(r'\[[0-9]*\]',' ',text) 
    text=re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text) 
    text = re.sub(r'\s+',' ',text) 
    return text

preprocess(testing_text)

'this is a message to be cleaned it may involve some things like adjacent spaces and tabs'

In [9]:
## Remove stopwords
nltk.download('stopwords')

def stopword(string):
    a= [i for i in string.split() if i not in stopwords.words('english')]
    return ' '.join(a)

stopword(testing_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yifanwang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


"This message cleaned. It may involve things like: <br>, ?, :, '' adjacent spaces tabs."

In [12]:
## 3. Lemmatization 


# Downloading necessary NLTK data
nltk.download('averaged_perceptron_tagger')

# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to convert NLTK's POS tags to WordNet's format
# Covert adj, adv, noun and verb 
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# Function to lemmatize a sentence with POS tagging
def lemmatize_sentence_with_pos(sentence):
    # Tokenize the sentence into words
    tokens = word_tokenize(sentence)
    # Get POS tags for each token
    pos_tags = nltk.pos_tag(tokens)
    # Lemmatize each word with its POS tag
    lemmatized_output = ' '.join([lemmatizer.lemmatize(w, get_wordnet_pos(pos)) for w, pos in pos_tags])
    return lemmatized_output

# Lemmatizing the test text with POS tagging
lemmatized_text = lemmatize_sentence_with_pos(testing_text)
print(lemmatized_text)


This be a message to be clean . It may involve some thing like : < br > , ? , : , `` adjacent space and tab .


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/yifanwang/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [13]:
## Final pre-processing



#FINAL PREPROCESSING
# def finalpreprocess(string):
#     return lemmatizer(stopword(preprocess(string)))


def finalpreprocess(string):
    # Step 1: Preprocess the text
    preprocessed_text = preprocess(string)

    # Step 2: Remove stopwords
    text_without_stopwords = stopword(preprocessed_text)

    # Step 3: Lemmatize the text with POS tagging
    lemmatized_text = lemmatize_sentence_with_pos(text_without_stopwords)

    return lemmatized_text


In [14]:
stock_cleaned['clean_Sentence'] = stock_cleaned['Sentence'].apply(lambda x: finalpreprocess(x))
stock_cleaned.head()

Unnamed: 0,Sentiment,Sentence,Vocabulary_Count,clean_Sentence
0,0,"According to Gran , the company has no plans t...",18,accord gran company plan move production russi...
1,1,"For the last quarter of 2010 , Componenta 's n...",30,last quarter componenta net sale double eur eu...
2,1,"In the third quarter of 2010 , net sales incre...",22,third quarter net sale increase eur mn operati...
3,1,Operating profit rose to EUR 13.1 mn from EUR ...,20,operating profit rise eur mn eur mn correspond...
4,1,"Operating profit totalled EUR 21.1 mn , up fro...",18,operating profit total eur mn eur mn represent...


## Word Embedding/Vectorization


In [15]:
"""Here use World2Vec because it better in capturing semantic info compared with 
BoW and TF-IDF, also this dataset is large enough for effective training."""

# I should've put in it in the very beginning
!pip install -U gensim
from gensim.models import Word2Vec


# Step 1: Tokenize the sentences (assuming sentences are already cleaned and are separated by spaces)
tokenized_sentences = [sentence.split() for sentence in stock_cleaned['clean_Sentence']]

# Step 2: Train the Word2Vec model
model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)

# Step 3: Function to vectorize a sentence based on the Word2Vec model
def vectorize_sentence(sentence, model):
    word_vectors = [model.wv[word] for word in sentence if word in model.wv]
    # If the sentence is empty (no words found in the model), return a zero vector
    if not word_vectors:
        return np.zeros(model.vector_size)
    else:
        # Otherwise, return the mean of the word vectors
        return np.mean(word_vectors, axis=0)

# Step 4: Vectorize each sentence in the DataFrame
stock_vector = stock_cleaned.copy()
stock_vector['sentence_vector'] = stock_vector['clean_Sentence'].apply(lambda x: vectorize_sentence(x.split(), model))



[33mDEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [16]:
# Check the first few rows of the dataframe to confirm the 'sentence_vector' column exists

stock_vector.head()
# Also I have upgraded in another notebook

Unnamed: 0,Sentiment,Sentence,Vocabulary_Count,clean_Sentence,sentence_vector
0,0,"According to Gran , the company has no plans t...",18,accord gran company plan move production russi...,"[-0.5662401, 0.5228299, 0.47695407, -0.3032332..."
1,1,"For the last quarter of 2010 , Componenta 's n...",30,last quarter componenta net sale double eur eu...,"[-0.58484614, 1.1203917, 1.2305793, -0.3818204..."
2,1,"In the third quarter of 2010 , net sales incre...",22,third quarter net sale increase eur mn operati...,"[-0.7916815, 1.5752168, 1.726137, -0.5110152, ..."
3,1,Operating profit rose to EUR 13.1 mn from EUR ...,20,operating profit rise eur mn eur mn correspond...,"[-0.6321121, 1.4723935, 1.6086259, -0.3901881,..."
4,1,"Operating profit totalled EUR 21.1 mn , up fro...",18,operating profit total eur mn eur mn represent...,"[-0.73849535, 1.550453, 1.7565956, -0.45242414..."


In [None]:
## Also here I would like to try TF-IDF vectorization 

# stock_cleaned_tfidf = stock_cleaned.copy()

# # Initialize vectorization
# tfidf_vectorizer = TfidfVectorizer()

# # Create TF-IDF features
# tfidf_matrix = tfidf_vectorizer.fit_transform(stock_cleaned_tfidf['clean_Sentence'])

# # Get feature names for the columns
# try:
#     # Try using the newer attribute available from version 0.24
#     feature_names = tfidf_vectorizer.get_feature_names_out()
# except AttributeError:
#     # Fallback for older versions
#     feature_names = tfidf_vectorizer.get_feature_names()

# # Create a DataFrame with the TF-IDF features
# tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# # Concatenate the original DataFrame with the new TF-IDF features
# stock_cleaned_tfidf.reset_index(drop=True, inplace=True)
# stock_cleaned_tfidf = pd.concat([stock_cleaned_tfidf, tfidf_df], axis=1)



"""
My Kernel always die when run this 
so I'll ignore this at this stage 
"""
'''The Kernel crashed while executing code in the the current cell or a previous cell. 
Please review the code in the cell(s) to identify a possible cause of the failure. 
Click here for more info. View Jupyter log for further details.'''

: 

## Split training data

In [17]:
# Split the sentence_vector into training data and testing data

X_train, X_val, y_train, y_val = train_test_split(stock_vector["sentence_vector"],
                                                  stock_vector["Sentiment"],
                                                  test_size=0.2,
                                                  shuffle=True)

In [18]:
X_train

# Seem it has been successfully transformed to vector

50270    [-0.19612427, 0.14071377, 0.2851384, 0.1885395...
19454    [-0.4089507, 0.6361304, 0.43058932, 0.20455606...
91022    [-0.16773327, 0.83378774, 0.22178309, -0.10981...
75853    [-0.34125158, 0.2601253, 0.3239526, 0.03328187...
68349    [-0.1710679, 0.3250221, 0.28081483, 0.17759454...
                               ...                        
60967    [-0.36933428, 0.46731466, 0.3328466, 0.0649987...
69399    [-0.49670428, 0.48818114, 0.626523, 0.12714852...
71319    [-0.17851555, 0.47110817, 0.54173595, 0.015697...
89855    [-0.07858951, 0.9477223, 0.36553153, 0.4670431...
1525     [-0.33684042, 0.31553972, 0.21755019, -0.01142...
Name: sentence_vector, Length: 87000, dtype: object

## ML Modelling 

In [19]:
# SVM

model_svm = svm.SVC(probability=True)


In [20]:
# User has asked to type out the code from the image provided into the code box.

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.model_selection import GridSearchCV



In [21]:
from sklearn.svm import SVC

# Define the SVM model
model_svm = SVC()


In [22]:
# Setup grid search parameter
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf', 'poly', 'sigmoid', 'linear']
}
grid_search_svm = GridSearchCV(model_svm, param_grid, refit=True, error_score='raise', verbose=2, cv=5, n_jobs=-1)

In [None]:
# Training and tuning hyperparameters
grid_search_svm.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.0s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.0s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.0s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.0s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.0s
[CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time=   0.0s
[CV] END .....................C=0.1, gamma=1, 

ValueError: 
All the 500 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
500 fits failed with the following error:
TypeError: only size-1 arrays can be converted to Python scalars

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/svm/_base.py", line 190, in fit
    X, y = self._validate_data(
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 622, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1146, in check_X_y
    X = check_array(
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 915, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/_array_api.py", line 380, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "/opt/anaconda3/lib/python3.9/site-packages/pandas/core/series.py", line 857, in __array__
    return np.asarray(self._values, dtype)
ValueError: setting an array element with a sequence.


In [24]:
# Save model to joblib
import joblib
from joblib import dump, load

best_svm = grid_search_svm.best_estimator_ 


AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [25]:
best_svm

NameError: name 'best_svm' is not defined

In [None]:
y_pred_svm = best_svm.predict(X_test)

In [None]:
# Save the trained model to .joblinb
joblib.dump(best_svm, 'model_svm_clothes.joblib')

In [25]:
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix, precision_score, recall_score, accuracy_score
from sklearn.metrics import roc_curve, auc, roc_auc_score

In [None]:
print(classification_report(y_test, y_pred_svm))

In [None]:
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm)
recall_svm = recall_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm)
cm_svm = confusion_matrix(y_test, y_pred_svm)


In [None]:
print('accuracy:', accuracy_svm)
print('precision:', precision_svm)
print('recall:', recall_svm)
print('f1 score:', f1_svm)


In [None]:
# Show confusion matrix of SVM model
import seaborn as sns
import matplotlib.pyplot as plt

sns.heatmap(cm_svm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
