# Text Preprocessing

In [4]:
import requests
from bs4 import BeautifulSoup
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import datetime

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import label_binarize

import ta  

In [1]:
# Function to scrape content from a given URL
def scrape_page(url):
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    content = ""

    # Extract text from specific HTML tags
    for tag in ['h3', 'li', 'p', 'b']:
        for element in soup.find_all(tag):
            content += element.get_text(strip=True) + "\n"

    return content.strip()


In [None]:
# Function to fetch closing prices for a stock on a specific date
def fetch_closing_prices(stock_symbol, date):
    stock = yf.Ticker(stock_symbol)
    end_date = pd.to_datetime(date)
    if end_date.weekday() == 0:  # Monday
        start_date = end_date - pd.Timedelta(days=3)  # Friday
    else:
        start_date = end_date - pd.Timedelta(days=1)

    # Fetch previous and current day data
    previous_day_data = stock.history(start=start_date.strftime('%Y-%m-%d'), end=end_date.strftime('%Y-%m-%d'))
    current_day_data = stock.history(start=end_date.strftime('%Y-%m-%d'), end=(end_date + pd.Timedelta(days=1)).strftime('%Y-%m-%d'))

    if previous_day_data.empty or current_day_data.empty:
        return None, None

    previous_close = previous_day_data.iloc[0]['Close']
    current_close = current_day_data.iloc[0]['Close']
    return previous_close, current_close


In [64]:
# Function to calculate RSI
def calculate_rsi(stock_data, period=14):
    try:
        rsi = ta.momentum.RSIIndicator(stock_data['Close'], window=period)
        return rsi.rsi().iloc[-1]  # Get the last value of RSI
    except Exception as e:
        print(f"Error calculating RSI: {e}")
        return None

In [65]:
# Function to calculate SMA
def calculate_sma(stock_data, period=20):
    try:
        sma = stock_data['Close'].rolling(window=period).mean()
        return sma.iloc[-1]  # Get the last value of SMA
    except Exception as e:
        print(f"Error calculating SMA: {e}")
        return None

In [66]:
# Function to fetch financial metrics (RSI and SMA) for a stock
def fetch_financial_metrics(stock_symbol, date):
    try:
        end_date = pd.to_datetime(date)
        start_date = end_date - pd.Timedelta(days=200)  # Sufficient data for SMA and RSI
        stock_data = yf.download(stock_symbol, start=start_date.strftime('%Y-%m-%d'), end=end_date.strftime('%Y-%m-%d'))
        if stock_data.empty:
            return None, None
        rsi = calculate_rsi(stock_data)
        sma = calculate_sma(stock_data)
        return rsi, sma
    except Exception as e:
        print(f"Error fetching data for {stock_symbol} on {date}: {e}")
        return None, None

In [67]:
# Function to determine stock movement (up or down)
def get_stock_movement(previous_close, current_close):
    if current_close > previous_close:
        return 'up'
    elif current_close < previous_close:
        return 'down'
    else:
        return None

In [5]:
# Load the cleaned DataFrame
subset_df = pd.read_excel('raw_analyst_ratings.xlsx')

In [6]:
# Select only relevant columns and convert date column to datetime
subset_df_cleaned = subset_df[['headline', 'url', 'publisher', 'date', 'stock']]
subset_df_cleaned['date'] = pd.to_datetime(subset_df_cleaned['date'], errors='coerce')
subset_df_cleaned = subset_df_cleaned.dropna(subset=['date'])
subset_df_cleaned['date'] = subset_df_cleaned['date'].apply(lambda x: x.replace(tzinfo=None).date() if isinstance(x, pd.Timestamp) else None)

# Filter for relevant date range
start_date = pd.to_datetime('2018-01-01').date()
end_date = pd.to_datetime('2024-12-31').date()
subset_df_cleaned = subset_df_cleaned[(subset_df_cleaned['date'] >= start_date) & (subset_df_cleaned['date'] <= end_date)]

# Drop rows with invalid dates again
subset_df_cleaned = subset_df_cleaned.dropna(subset=['date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_df_cleaned['date'] = pd.to_datetime(subset_df_cleaned['date'], errors='coerce')


In [7]:
subset_df_cleaned.head()

Unnamed: 0,headline,url,publisher,date,stock
0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05,A
1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03,A
2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26,A
3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22,A
4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22,A


In [8]:
# Select unique stock and date combinations
unique_stock_dates = subset_df_cleaned[['stock', 'url', 'date']].drop_duplicates()

In [9]:
unique_stock_dates.head()

Unnamed: 0,stock,url,date
0,A,https://www.benzinga.com/news/20/06/16190091/s...,2020-06-05
1,A,https://www.benzinga.com/news/20/06/16170189/s...,2020-06-03
2,A,https://www.benzinga.com/news/20/05/16103463/7...,2020-05-26
3,A,https://www.benzinga.com/news/20/05/16095921/4...,2020-05-22
4,A,https://www.benzinga.com/news/20/05/16095304/b...,2020-05-22


In [10]:
# Filter for preselected stocks
preselected_stocks = ['AAPL', 'JNJ', 'PG', 'AMZN', 'PXD']
subset_df_sampled = unique_stock_dates[unique_stock_dates['stock'].isin(preselected_stocks)]
subset_df_sampled.head()

Unnamed: 0,stock,url,date
6680,AAPL,https://www.benzinga.com/government/20/06/1622...,2020-06-10
6681,AAPL,https://www.benzinga.com/analyst-ratings/price...,2020-06-10
6682,AAPL,https://www.benzinga.com/short-sellers/20/06/1...,2020-06-10
6683,AAPL,https://www.benzinga.com/news/20/06/16219873/d...,2020-06-10
6684,AAPL,https://www.benzinga.com/news/20/06/16218697/a...,2020-06-10


In [None]:
# Scrape web content for the sampled stocks
subset_df_sampled['web_content'] = subset_df_sampled['url'].apply(scrape_page)
subset_df_sampled.head()

In [None]:
# Fetch closing prices and calculate movements
subset_df_sampled[['previous_close', 'closing_price']] = subset_df_sampled.apply(
    lambda row: fetch_closing_prices(row['stock'], row['date']), axis=1, result_type='expand')

In [76]:
subset_df_sampled.head()

Unnamed: 0,stock,url,date,web_content,previous_close,closing_price
6680,AAPL,https://www.benzinga.com/government/20/06/1622...,2020-06-10,Error fetching content,83.98645,86.147224
6681,AAPL,https://www.benzinga.com/analyst-ratings/price...,2020-06-10,The Only Monthly Membership With a Positive RO...,83.98645,86.147224
6682,AAPL,https://www.benzinga.com/short-sellers/20/06/1...,2020-06-10,The Only Monthly Membership With a Positive RO...,83.98645,86.147224
6683,AAPL,https://www.benzinga.com/news/20/06/16219873/d...,2020-06-10,The Only Monthly Membership With a Positive RO...,83.98645,86.147224
6684,AAPL,https://www.benzinga.com/news/20/06/16218697/a...,2020-06-10,The Only Monthly Membership With a Positive RO...,83.98645,86.147224


In [77]:
# Drop rows with missing previous_close data
subset_df_sampled = subset_df_sampled.dropna(subset=['previous_close'])
subset_df_sampled.head()

Unnamed: 0,stock,url,date,web_content,previous_close,closing_price
6680,AAPL,https://www.benzinga.com/government/20/06/1622...,2020-06-10,Error fetching content,83.98645,86.147224
6681,AAPL,https://www.benzinga.com/analyst-ratings/price...,2020-06-10,The Only Monthly Membership With a Positive RO...,83.98645,86.147224
6682,AAPL,https://www.benzinga.com/short-sellers/20/06/1...,2020-06-10,The Only Monthly Membership With a Positive RO...,83.98645,86.147224
6683,AAPL,https://www.benzinga.com/news/20/06/16219873/d...,2020-06-10,The Only Monthly Membership With a Positive RO...,83.98645,86.147224
6684,AAPL,https://www.benzinga.com/news/20/06/16218697/a...,2020-06-10,The Only Monthly Membership With a Positive RO...,83.98645,86.147224


In [78]:
# Calculate financial metrics (RSI and SMA)
subset_df_sampled[['RSI', 'SMA']] = subset_df_sampled.apply(
    lambda row: fetch_financial_metrics(row['stock'], row['date']), axis=1, result_type='expand')

# Drop rows with missing RSI or SMA data
subset_df_sampled = subset_df_sampled.dropna(subset=['RSI', 'SMA'])

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

In [79]:
subset_df_sampled.head()

Unnamed: 0,stock,url,date,web_content,previous_close,closing_price,RSI,SMA
6680,AAPL,https://www.benzinga.com/government/20/06/1622...,2020-06-10,Error fetching content,83.98645,86.147224,77.101694,79.899875
6681,AAPL,https://www.benzinga.com/analyst-ratings/price...,2020-06-10,The Only Monthly Membership With a Positive RO...,83.98645,86.147224,77.101694,79.899875
6682,AAPL,https://www.benzinga.com/short-sellers/20/06/1...,2020-06-10,The Only Monthly Membership With a Positive RO...,83.98645,86.147224,77.101694,79.899875
6683,AAPL,https://www.benzinga.com/news/20/06/16219873/d...,2020-06-10,The Only Monthly Membership With a Positive RO...,83.98645,86.147224,77.101694,79.899875
6684,AAPL,https://www.benzinga.com/news/20/06/16218697/a...,2020-06-10,The Only Monthly Membership With a Positive RO...,83.98645,86.147224,77.101694,79.899875


In [80]:
# Merge with the original cleaned dataframe
merged_df = pd.merge(subset_df_cleaned, subset_df_sampled, on=['stock', 'date'], how='inner')

In [81]:
merged_df.head()

Unnamed: 0,headline,url_x,publisher,date,stock,url_y,web_content,previous_close,closing_price,RSI,SMA
0,Tech Stocks And FAANGS Strong Again To Start D...,https://www.benzinga.com/government/20/06/1622...,JJ Kinahan,2020-06-10,AAPL,https://www.benzinga.com/government/20/06/1622...,Error fetching content,83.98645,86.147224,77.101694,79.899875
1,Tech Stocks And FAANGS Strong Again To Start D...,https://www.benzinga.com/government/20/06/1622...,JJ Kinahan,2020-06-10,AAPL,https://www.benzinga.com/analyst-ratings/price...,The Only Monthly Membership With a Positive RO...,83.98645,86.147224,77.101694,79.899875
2,Tech Stocks And FAANGS Strong Again To Start D...,https://www.benzinga.com/government/20/06/1622...,JJ Kinahan,2020-06-10,AAPL,https://www.benzinga.com/short-sellers/20/06/1...,The Only Monthly Membership With a Positive RO...,83.98645,86.147224,77.101694,79.899875
3,Tech Stocks And FAANGS Strong Again To Start D...,https://www.benzinga.com/government/20/06/1622...,JJ Kinahan,2020-06-10,AAPL,https://www.benzinga.com/news/20/06/16219873/d...,The Only Monthly Membership With a Positive RO...,83.98645,86.147224,77.101694,79.899875
4,Tech Stocks And FAANGS Strong Again To Start D...,https://www.benzinga.com/government/20/06/1622...,JJ Kinahan,2020-06-10,AAPL,https://www.benzinga.com/news/20/06/16218697/a...,The Only Monthly Membership With a Positive RO...,83.98645,86.147224,77.101694,79.899875


In [82]:
# Determine stock movement
merged_df['movement'] = merged_df.apply(lambda row: get_stock_movement(row['previous_close'], row['closing_price']), axis=1)

# Combine headline and web content into a new column 'Text'
merged_df['Text'] = merged_df['headline'] + "\n" + merged_df['web_content']

# Save the updated DataFrame
merged_df.to_excel('Processed_File.xlsx', index=False)

# Filter out rows without movement information
merged_df = merged_df.dropna(subset=['movement'])

In [83]:
# Count the occurrences of 'up' and 'down' in the 'movement' column
num_up = (merged_df['movement'] == 'up').sum()
num_down = (merged_df['movement'] == 'down').sum()
total = num_up + num_down

# Print the counts
print(f'Number of Total Movement: {total}')
print(f'Number of Up Movement: {num_up}')
print(f'Number of Down Movement: {num_down}')


Number of Total Movement: 15323
Number of Up Movement: 8318
Number of Down Movement: 7005


# Text Cleanning & Tokenization

In [84]:
def tokenize(text):
    """Tokenize the text into words."""
    return word_tokenize(text)

In [85]:
def remove_punctuation(tokens):
    """Remove punctuation tokens from a list of tokens."""
    return [token for token in tokens if token not in string.punctuation]

In [86]:
def remove_stopwords(tokens):
    """Remove stopwords from a list of tokens."""
    stopword_list = set(stopwords.words('english'))
    return [token for token in tokens if token.lower() not in stopword_list]

In [87]:
def perform_stemming(tokens):
    """Perform stemming on a list of tokens."""
    ps = PorterStemmer()
    return [ps.stem(token) for token in tokens]

In [88]:
def perform_lemmatization(tokens):
    """Perform lemmatization on a list of tokens."""
    wn = WordNetLemmatizer()
    return [wn.lemmatize(token, 'v') for token in tokens]

In [89]:
def stemming_lemmatization(tokens):
    """Perform stemming and lemmatization on a list of tokens."""
    tokens_stemmed = perform_stemming(tokens)
    tokens_lemmatized = perform_lemmatization(tokens_stemmed)
    return tokens_lemmatized

In [90]:
def clean_text(text):
    """Clean and preprocess a document."""
    tokens = tokenize(text)
    tokens_no_punctuation = remove_punctuation(tokens)
    tokens_no_stopwords = remove_stopwords(tokens_no_punctuation)
    tokens_processed = stemming_lemmatization(tokens_no_stopwords)
    return ' '.join(tokens_processed)

In [91]:
# Clean and preprocess the text data
cleaned_texts = [clean_text(text) for text in merged_df['Text']]
y = merged_df['movement'].astype(str)

In [92]:
len(cleaned_texts)

15323

In [93]:
len(y)

15323

In [94]:
# Vectorize the cleaned text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_texts)
tfidf_matrix

<15323x16470 sparse matrix of type '<class 'numpy.float64'>'
	with 1453826 stored elements in Compressed Sparse Row format>

In [95]:
# Dimensionality reduction using Truncated SVD
svd = TruncatedSVD(n_components=1500)
X_reduced = svd.fit_transform(tfidf_matrix)
X_reduced.data = np.clip(X_reduced.data, a_min=0, a_max=None)

  X_reduced.data = np.clip(X_reduced.data, a_min=0, a_max=None)


In [96]:
# Add financial features (RSI and SMA) to the reduced text features
financial_features = merged_df[['RSI', 'SMA']].values
X_combined = np.hstack([X_reduced, financial_features])

In [97]:
# Print explained variance ratio for TF-IDF features
print(f"Explained variance by 1500 components in TF-IDF: {np.sum(svd.explained_variance_ratio_)}")

Explained variance by 1500 components in TF-IDF: 0.9905638806228432


# Model Training and Evaluation


In [103]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=2)

In [106]:
# Number of folds
k = 5

# Initialize KFold
kf = KFold(n_splits=k, shuffle=True, random_state=2)

In [109]:
# Models to evaluate
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(probability=True)
}

In [110]:
# Dictionary to hold cross-validation results
cv_results = {}

# Perform K-Fold Cross Validation for each model
for model_name, model in models.items():
    scores = cross_val_score(model, X_combined, y, cv=kf, scoring='accuracy')
    cv_results[model_name] = scores
    print(f"{model_name} - Mean Accuracy: {scores.mean()}, Standard Deviation: {scores.std()}")

# Example of accessing the results
for model_name, scores in cv_results.items():
    print(f"{model_name}: {scores}")

Logistic Regression - Mean Accuracy: 0.8162240660365706, Standard Deviation: 0.008961586603087787
Naive Bayes - Mean Accuracy: 0.5764532390034884, Standard Deviation: 0.009663718268349513
Random Forest - Mean Accuracy: 0.9788554129628289, Standard Deviation: 0.0032058040075617494
SVM - Mean Accuracy: 0.6731062616332807, Standard Deviation: 0.005105495047806261
Logistic Regression: [0.82414356 0.81957586 0.80358891 0.80776762 0.82604439]
Naive Bayes: [0.57650897 0.58401305 0.57585644 0.55907311 0.58681462]
Random Forest: [0.98270799 0.9771615  0.97520392 0.97650131 0.98270235]
SVM: [0.67797716 0.66655791 0.6675367  0.6785248  0.67493473]
