# EDA (Exploratory Data Analysis)

In [None]:
#import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.plotting import scatterplotmatrix
import numpy as np
from mlxtend.plotting import heatmap
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

# Fetch only the corpus used below (English stopwords). A bare nltk.download()
# opens NLTK's interactive downloader instead of fetching a specific resource.
nltk.download("stopwords")

In [None]:
# Read the file: load the cleaned dataset into `df`.
# Later cells read its "Sentence" and "Sentiment" columns.
df = pd.read_csv('financial_data_cleaned.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Donut chart of the sentiment distribution.
# Colours are keyed by sentiment name (same palette as the per-sentiment bigram
# charts) so each wedge keeps its colour regardless of frequency order.
sentiment_palette = {"neutral": "#4F6272", "positive": "#B7C3F3", "negative": "#DD7596"}
sentiment_counts = df.Sentiment.value_counts().head(3)

# Legend entries must follow the plotted order. The labels used to come from a
# set literal, whose iteration order is arbitrary, so names could land on the
# wrong wedge.
legend_labels = [str(label) for label in sentiment_counts.index]
colors = [sentiment_palette.get(label, "#CCCCCC") for label in legend_labels]

fig, ax1 = plt.subplots(ncols=1, figsize=(10, 5))
sentiment_counts.plot(
    kind='pie',
    labels=None,
    autopct='%.2f',
    ax=ax1,
    wedgeprops={'linewidth': 1, 'edgecolor': 'white'},
    colors=colors,
)
# Pair the wedges with their labels explicitly (done before the centre circle
# is added, so the axes' patches are exactly the wedges).
ax1.legend(handles=list(ax1.patches), labels=legend_labels)

# White circle in the middle turns the pie into a donut.
central_circle = plt.Circle((0, 0), 0.4, color='white')
ax1.add_artist(central_circle)
plt.rc('font', size=12)
plt.title('% of sentiments', size=15)
plt.tight_layout()
plt.savefig('images/donut_chart.png', dpi=300)
plt.show()

In [None]:
# Polarity scores: one TextBlob polarity value per sentence, shown as a histogram.
def _sentence_polarity(sentence):
    """Return TextBlob's sentiment polarity for a single sentence."""
    return TextBlob(sentence).sentiment.polarity


df["Polarity"] = df["Sentence"].map(_sentence_polarity)

polarity_ax = df["Polarity"].plot.hist(
    bins=15,
    linewidth=1,
    color="salmon",
    figsize=(10, 5),
)
polarity_ax.set_title("Polarity Score in Reviews", pad=15)
polarity_ax.set_xlabel("Polarity", labelpad=15)
polarity_ax.set_ylabel("Amount of Reviews", labelpad=15)
polarity_ax.figure.savefig('images/polarity_score.png', dpi=300)
plt.show()

In [None]:
# Length of reviews: number of characters in each sentence (after str conversion).
sentence_text = df["Sentence"].astype(str)
df["Length"] = sentence_text.map(len)

length_ax = df["Length"].plot.hist(
    bins=15,
    linewidth=1,
    color="teal",
    figsize=(10, 5),
)
length_ax.set_title("Length of Reviews", pad=20)
length_ax.set_xlabel("Length", labelpad=15)
length_ax.set_ylabel("Amount of Reviews", labelpad=20)
length_ax.figure.savefig('images/length_of_reviews.png', dpi=300)
plt.show()

In [None]:
# Word counts: whitespace-separated tokens per sentence, shown as a histogram.
def _count_words(sentence):
    """Return how many whitespace-separated tokens str(sentence) contains."""
    return len(str(sentence).split())


df["Word Counts"] = df["Sentence"].map(_count_words)

word_count_ax = df["Word Counts"].plot.hist(
    bins=15,
    linewidth=1,
    color="plum",
    figsize=(10, 5),
)
word_count_ax.set_title("Word Counts in Reviews", pad=20)
word_count_ax.set_xlabel("Word Counts", labelpad=15)
word_count_ax.set_ylabel("Amount of Reviews", labelpad=20)
word_count_ax.figure.savefig('images/word_counts.png', dpi=300)
plt.show()

In [None]:
#WordCloud
# English stopwords minus "not", so negations stay visible in the cloud and in
# the n-gram analysis that reuses this set.
Stopwords = set(nltk.corpus.stopwords.words("english")) - {"not"}

# Build the corpus from every sentence. str(df["Sentence"]) would only give the
# truncated printed representation of the Series, not the full text.
text = " ".join(df["Sentence"].dropna().astype(str))

wc = WordCloud(
    background_color="white",
    random_state=1,
    max_words=2000,
    width=3000,
    height=1500,
    stopwords=Stopwords,
)
# Generate once, from the corpus defined above (previously a second generate()
# call referenced an undefined `text` and raised NameError).
wc.generate(text)

plt.figure(figsize=(15, 15))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.savefig('images/wordcloud.png', dpi=300)
plt.show()

In [None]:
#N-Gram Analysis
def Gram_Analysis(Corpus, Gram, N, stop_words=None):
    """Return the N most frequent n-grams of size Gram found in Corpus.

    Args:
        Corpus: Iterable of text documents handed to
            ``CountVectorizer.fit_transform``.
        Gram: n-gram size; used as both bounds of ``ngram_range``.
        N: Number of top entries to return.
        stop_words: Optional collection of words to ignore (or the string
            ``"english"``). Defaults to the notebook-level ``Stopwords`` set.

    Returns:
        A list of ``(ngram, count)`` tuples sorted by count, highest first,
        truncated to N entries.
    """
    if stop_words is None:
        stop_words = Stopwords

    # CountVectorizer documents stop_words as 'english', a list, or None, so a
    # set (such as Stopwords) is converted to a list before being passed on.
    if not isinstance(stop_words, str):
        stop_words = sorted(stop_words)

    # Vectorizer restricted to exactly Gram-sized n-grams.
    Vectorizer = CountVectorizer(stop_words=stop_words, ngram_range=(Gram, Gram))

    # Document-by-n-gram count matrix.
    ngrams = Vectorizer.fit_transform(Corpus)

    # Column sums give the total occurrences per n-gram; flatten to a 1-D array
    # so it can be indexed directly by vocabulary index.
    Count = np.asarray(ngrams.sum(axis=0)).ravel()

    # Pair every n-gram with its total count.
    words = [(word, int(Count[idx])) for word, idx in Vectorizer.vocabulary_.items()]

    # Sort descending by count.
    words.sort(key=lambda pair: pair[1], reverse=True)

    return words[:N]

In [None]:
# Bigrams (2-grams): top 5 in sentences labelled "positive"
df_positive = df.loc[df["Sentiment"] == "positive"].dropna()
words_p = Gram_Analysis(df_positive["Sentence"], 2, 5)
Bigram = pd.DataFrame(words_p, columns=["Words", "Counts"])

# Horizontal bar chart of the summed counts, sorted ascending
color_p = ['#B7C3F3']
positive_totals = Bigram.groupby("Words")["Counts"].sum().sort_values()
positive_ax = positive_totals.plot.barh(color=color_p, figsize=(10, 5))
positive_ax.set_title(
    "2-gram of Reviews with Positive Sentiments",
    loc="center",
    fontsize=15,
    pad=25,
)
positive_ax.set_xlabel("Total Counts", fontsize=15, labelpad=20)
plt.xticks(rotation=0)
positive_ax.set_ylabel("Top Words", fontsize=15, labelpad=20)
positive_ax.figure.savefig('images/p_2gram.png', dpi=300)
plt.show()

In [None]:
# Bigrams (2-grams): top 5 in sentences labelled "negative"
df_negative = df.loc[df["Sentiment"] == "negative"].dropna()
words_n = Gram_Analysis(df_negative["Sentence"], 2, 5)
Bigram = pd.DataFrame(words_n, columns=["Words", "Counts"])

# Horizontal bar chart of the summed counts, sorted ascending
color_n = ['#DD7596']
negative_totals = Bigram.groupby("Words")["Counts"].sum().sort_values()
negative_ax = negative_totals.plot.barh(color=color_n, figsize=(10, 5))
negative_ax.set_title(
    "2-gram of Reviews with Negative Sentiments",
    loc="center",
    fontsize=15,
    pad=25,
)
negative_ax.set_xlabel("Total Counts", fontsize=15, labelpad=20)
plt.xticks(rotation=0)
negative_ax.set_ylabel("Top Words", fontsize=15, labelpad=20)
negative_ax.figure.savefig('images/n_2gram.png', dpi=300)
plt.show()

In [None]:
# Bigrams (2-grams): top 5 in sentences labelled "neutral"
df_neutral = df.loc[df["Sentiment"] == "neutral"].dropna()
words_nl = Gram_Analysis(df_neutral["Sentence"], 2, 5)
Bigram = pd.DataFrame(words_nl, columns=["Words", "Counts"])

# Horizontal bar chart of the summed counts, sorted ascending
color_nl = ['#4F6272']
neutral_totals = Bigram.groupby("Words")["Counts"].sum().sort_values()
neutral_ax = neutral_totals.plot.barh(color=color_nl, figsize=(10, 5))
neutral_ax.set_title(
    "2-gram of Reviews with Neutral Sentiments",
    loc="center",
    fontsize=15,
    pad=25,
)
neutral_ax.set_xlabel("Total Counts", fontsize=15, labelpad=20)
plt.xticks(rotation=0)
neutral_ax.set_ylabel("Top Words", fontsize=15, labelpad=20)
neutral_ax.figure.savefig('images/nl_2gram.png', dpi=300)
plt.show()

# Model Building

In [None]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
# Pull the data: reload the cleaned dataset so the modelling section starts
# from the CSV contents rather than from columns added during the EDA cells.
df = pd.read_csv('financial_data_cleaned.csv')

In [None]:
#Feature Engineering
# Encode the Sentiment column as integer class codes, in place.
# NOTE(review): the resampled pie-chart cell treats 0/1/2 as
# negative/neutral/positive — confirm against Encoder.classes_.
Encoder = LabelEncoder() 
df["Sentiment"] = Encoder.fit_transform(df["Sentiment"])  
# Displayed as the cell output: number of rows per encoded class.
df["Sentiment"].value_counts()

In [None]:
# Term Frequency - Inverse Document Frequency (TF-IDF) vectorizer.
# ngram_range=(2, 2) means only bigrams become features; max_features caps the
# vocabulary at 5000 entries.
tfidf = TfidfVectorizer(max_features = 5000, ngram_range = (2, 2))
X = tfidf.fit_transform(df["Sentence"])
# Displayed as the cell output: (number of sentences, number of features).
X.shape

In [None]:
# Target vector: the label-encoded Sentiment column.
y = df['Sentiment']
y

In [None]:
# Balance the imbalanced dataset: first show the class counts before resampling.
Counter(y)

In [None]:
# Resample the TF-IDF features and labels with SMOTE (fixed seed for repeatability).
Balancer = SMOTE(random_state = 42)
X_final, y_final = Balancer.fit_resample(X, y)

In [None]:
# Class counts after resampling, for comparison with Counter(y).
Counter(y_final)

In [None]:
# Donut chart of the class balance after resampling.
# Colours are keyed by sentiment name (same palette as the per-sentiment bigram
# charts) so each wedge keeps its colour regardless of plotting order.
sentiment_palette = {"neutral": "#4F6272", "positive": "#B7C3F3", "negative": "#DD7596"}
resampled_counts = y_final.value_counts().head(3)

# The target is label-encoded, so decode the plotted class codes back to their
# names. Deriving the legend from the plotted index keeps labels and wedges
# aligned; a fixed label order does not, because value_counts() decides the
# wedge order.
legend_labels = [str(name) for name in Encoder.inverse_transform(resampled_counts.index)]
colors = [sentiment_palette.get(name, "#CCCCCC") for name in legend_labels]

fig, ax1 = plt.subplots(ncols=1, figsize=(10, 5))
resampled_counts.plot(
    kind='pie',
    labels=None,
    autopct='%.2f',
    ax=ax1,
    wedgeprops={'linewidth': 1, 'edgecolor': 'white'},
    colors=colors,
)
# Pair the wedges with their labels explicitly (done before the centre circle
# is added, so the axes' patches are exactly the wedges).
ax1.legend(handles=list(ax1.patches), labels=legend_labels)

# White circle in the middle turns the pie into a donut.
central_circle = plt.Circle((0, 0), 0.4, color='white')
ax1.add_artist(central_circle)
plt.rc('font', size=12)
plt.title('% of sentiments after resampling', size=15)
plt.tight_layout()
plt.savefig('images/donut_chart_balanced_data.png', dpi=300)
plt.show()

In [None]:
#Model Selection
# Split the resampled data: 80% train / 20% test, fixed seed.
# NOTE(review): the split happens after SMOTE, so the test split is drawn from
# resampled data — confirm this is intended, since it may make test scores optimistic.
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size = 0.20, random_state = 42)

In [None]:
# Model building: compare candidate classifiers with 10-fold cross-validation.
dt = DecisionTreeClassifier()
lr = LogisticRegression()
# Bound to a lowercase name so the imported SVC class is not overwritten;
# `SVC = SVC()` made this cell fail when run a second time.
svc = SVC()
rf = RandomForestClassifier()
Bayes = BernoulliNB()
KNN = KNeighborsClassifier()

Models = [dt, lr, svc, rf, Bayes, KNN]
Models_Dict = {0: "Decision Tree", 1: "Logistic Regression", 2: "SVC", 3: "Random Forest", 4: "Naive Bayes", 5: "K-Neighbors"}

# Score on the training split (the same data the grid search below is fitted
# on) instead of the raw, unbalanced X / y, so the comparison and the tuning
# step agree. The reported figure is a cross-validation mean, not a test score.
for i, model in enumerate(Models):
    cv_accuracy = cross_val_score(model, X_train, y_train, cv=10, scoring="accuracy").mean()
    print("{} CV Accuracy: {}".format(Models_Dict[i], cv_accuracy))

In [None]:
# Hyperparameter tuning for logistic regression (10-fold CV, accuracy).
# Each penalty is paired with a solver that supports it: the default "lbfgs"
# solver only accepts 'l2', so searching 'l1' with it made every l1 candidate
# fail to fit.
C_values = np.logspace(-4, 4, 50)
Param = [
    {"C": C_values, "penalty": ["l2"], "solver": ["lbfgs"]},
    {"C": C_values, "penalty": ["l1"], "solver": ["saga"]},
]
grid_search = GridSearchCV(
    estimator=LogisticRegression(random_state=42),
    param_grid=Param,
    scoring="accuracy",
    cv=10,
    verbose=0,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_

print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

In [None]:
# Best model: fit logistic regression on the training split and predict the test split.
# NOTE(review): C and penalty are hardcoded — presumably copied from an earlier
# grid-search run; confirm they still match grid_search.best_params_.
Classifier = LogisticRegression(random_state = 42, C = 6866.488450042998, penalty = 'l2')
Classifier.fit(X_train, y_train)

Prediction = Classifier.predict(X_test)

In [None]:
# Metrics: accuracy of the predictions on the test split (shown as the cell output).
accuracy_score(y_test, Prediction)

In [None]:
# Confusion matrix of true test labels (first argument) against predictions.
ConfusionMatrix = confusion_matrix(y_test, Prediction)

In [None]:
# Plotting Function for Confusion Matrix
colors = ['#4F6272', '#B7C3F3', '#DD7596']

def plot_cm(cm, classes, title, normalized = False, cmap = plt.cm.BuPu):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Args:
        cm: Square confusion matrix (array-like); rows are treated as the real
            labels and columns as the predicted labels.
        classes: Tick labels, in the same order as the rows/columns of cm.
        title: Title placed above the plot.
        normalized: If True, every row is divided by its total so cells show
            per-row fractions instead of raw counts.
        cmap: Matplotlib colormap used for the heat map.

    Returns:
        None. The plot is drawn with pyplot; the caller saves or shows it.
    """
    cm = np.asarray(cm)

    # Normalise before drawing so the colours and the printed numbers describe
    # the same matrix.
    if normalized:
        # Column vector of row totals; [:, np.newaxis] makes the division
        # apply row by row.
        row_totals = cm.sum(axis = 1)[:, np.newaxis]
        # A row with no samples would divide by zero; dividing by 1 leaves it all zeros.
        row_totals[row_totals == 0] = 1
        cm = cm.astype('float') / row_totals
        print("Normalized Confusion Matrix")
    else:
        print("Unnormalized Confusion Matrix")

    plt.imshow(cm, interpolation = "nearest", cmap = cmap)
    plt.title(title, pad = 20)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text so they stay readable.
    threshold = cm.max() / 2
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            # Fractions are shown with two decimals; raw counts are shown as they are.
            cell_text = format(cm[i, j], ".2f") if normalized else cm[i, j]
            plt.text(j, i, cell_text, horizontalalignment = "center", color = "white" if cm[i, j] > threshold else "black")

    plt.tight_layout()
    plt.xlabel("Predicted Label", labelpad = 20)
    plt.ylabel("Real Label", labelpad = 20)

In [None]:
# Rows/columns of the confusion matrix follow the encoded label order, and the
# label codes map 0 -> negative, 1 -> neutral, 2 -> positive, so the tick
# labels must be listed in that order (they were previously reversed, swapping
# Positive and Negative).
plot_cm(ConfusionMatrix, classes = ["Negative", "Neutral", "Positive"], title = "Confusion Matrix of Sentiment Analysis")
plt.tight_layout()
plt.savefig('images/confusion_matrix.png', dpi=300)

# Per-class precision / recall / F1 for the test split.
print(classification_report(y_test, Prediction))
