# Multi-Model Sentiment Analysis Prototype
## Jonathan Visona
### CPSC 8985-02 - FA2024

### ML References

In [None]:
# https://www.ibm.com/docs/en/watson-studio-local/1.2.3?topic=notebooks-markdown-jupyter-cheatsheet

## Imports

In [None]:
"""
visona-sentiment-analysis-prototype.ipynb
Jonathan Visona
CPSC 8985-02 - FA2024
"""

# utility imports
    # https://docs.python.org/3/c-api/index.html

#import string # unused?
from string import punctuation as STR_punctuation
import re as REGEX
import warnings as WARN

# visualization imports
    # https://matplotlib.org/stable/api/index.html
    # https://seaborn.pydata.org/api.html

import matplotlib.pyplot as MPLTLIB_pyplt
#import seaborn as sb # affects some aesthetic defaults of matplotlib without invocation

# math and data science imports
    # https://numpy.org/doc/2.1/reference/index.html
    # https://pandas.pydata.org/docs/reference/index.html

#import numpy as np # included for expansible functionality
import pandas as PNDS

# nltk imports
    # https://www.nltk.org/api/nltk.html

from nltk.tokenize import word_tokenize as NLTK_word_tokenize
from nltk.stem import LancasterStemmer as NLTK_lancaster_stemer
from nltk.corpus import stopwords as NLTK_stopwords
from nltk.stem.wordnet import WordNetLemmatizer as NLTK_wordnet_lemmatizer
from nltk.probability import FreqDist as NLTK_freq_dist

# sci-kit imports
    # https://scikit-learn.org/stable/api/index.html

#from sklearn.metrics import accuracy_score # unused
from sklearn.metrics import accuracy_score as SKLN_accuracy_score
from sklearn.metrics import classification_report as SKLN_classification_report
from sklearn.metrics import ConfusionMatrixDisplay as SKLN_confusion_matrix_display
from sklearn.model_selection import train_test_split as SKLN_train_test_split
from sklearn.linear_model import LogisticRegression as SKLN_logistic_regression
from sklearn.ensemble import RandomForestClassifier as SKLN_random_forest_classifier
from sklearn.feature_extraction.text import TfidfVectorizer as SKLN_tfidf_vectorizer
from sklearn.tree import DecisionTreeClassifier as SKLN_decision_tree_classifier

## Global Data

In [None]:
# Dataset source (Kaggle): https://www.kaggle.com/datasets/abhi8923shriv/sentiment-analysis-dataset
# Warning suppression switch; flip to True once debugging is finished
WARN_OFF = False
if WARN_OFF:
    WARN.filterwarnings( 'ignore' )

# File locations and the encoding used when reading the CSV files
ENCODING, REL_PATH = 'latin1', '../data/'
TRAIN_FILE, TEST_FILE = 'train.csv', 'test.csv'

# pandas display limits and the number of rows shown when previewing
MAX_COLS, MAX_ROWS, HEAD_SIZE = 10, 3_000_000, 25

# set_option accepts alternating (option, value) pairs in a single call
PNDS.set_option( 'display.max_columns', MAX_COLS, 'display.max_rows', MAX_ROWS )

## Training and Testing Data

In [None]:
# Read the Kaggle train and test CSV files (in that order) into two DataFrames
training_data, test_data = (
    PNDS.read_csv( REL_PATH + csv_name, encoding=ENCODING )
    for csv_name in ( TRAIN_FILE, TEST_FILE )
)

In [None]:
# Combine the training and testing data into a single frame.
# ignore_index=True gives the combined rows a fresh 0..n-1 index; without it each
# source frame keeps its own labels, so row labels repeat and label-based
# lookups on the combined frame become ambiguous.
dframe = PNDS.concat( [ training_data, test_data ], ignore_index=True )

In [None]:
# Preview the first HEAD_SIZE rows of the combined frame
dframe.head( HEAD_SIZE )

In [None]:
# Print the full (verbose) summary pandas produces for the combined frame
dframe.info( verbose=True )

## Primary Dataframe Construction and Development

In [None]:
# Remove the identifier and metadata columns that the rest of the notebook
# does not use; every other column is kept.
dframe = dframe.drop(
    labels=[
        'textID',
        'Time of Tweet',
        'Age of User',
        'Country',
        'Population -2020',
        'Land Area (Km²)',
        'Density (P/Km²)',
    ],
    axis='columns',
)

In [None]:
# Count the missing values in each column before any rows are dropped
nan_count_per_column = dframe.isnull().sum()
print( nan_count_per_column )

In [None]:
# Drop rows containing NaNs, then recount to confirm none remain.
# Reassigning (rather than inplace=True) keeps the cell safe to re-run.
dframe = dframe.dropna()
nan_count_per_column = dframe.isna().sum()
print(nan_count_per_column)

In [None]:
# Show which columns remain in the frame
column_list = list( dframe.columns )
print( column_list )


# Data Conditioning

### Regex

In [None]:
# https://docs.python.org/3/library/re.html
# Ordered cleanup rules for alphanumericize. Order matters:
#   - markup and URLs go first, while the '<', ':', '/' and '.' they rely on still exist
#   - whitespace is collapsed last, so gaps left by earlier removals are closed too
_ALPHANUMERICIZE_RULES = (
    ( REGEX.compile( r'<.*?>' ), '' ),                                   # HTML-style tags
    ( REGEX.compile( r'https?://\S+|www\.\S+' ), '' ),                   # URLs
    ( REGEX.compile( r'[^a-zA-Z0-9\s]' ), '' ),                          # non-alphanumerics
    ( REGEX.compile( r'[%s]' % REGEX.escape( STR_punctuation ) ), '' ),  # ASCII punctuation
    ( REGEX.compile( r'\w*\d\w*' ), '' ),                                # words containing a digit
    ( REGEX.compile( r'\s+' ), ' ' ),                                    # runs of whitespace (incl. newlines)
)

def alphanumericize( txt: str ) -> str:
    """
    Clean up text by removing tags, URLs, non-alphanumerics and digit-bearing words.

    The input is converted with str() first, so non-string values (including
    NaN) are cleaned as their string form. Whitespace runs, newlines included,
    become a single space and the result has no leading or trailing whitespace.

    Returns the cleaned text, or "" if cleaning raised an exception (the error
    is printed rather than propagated).
    """
    txt = str( txt )
    try:
        for pattern, replacement in _ALPHANUMERICIZE_RULES:
            txt = pattern.sub( replacement, txt )
        return txt.strip()
    except Exception as e:
        print( f"Error alphanumericizing: {e}" )
        return ""
# Store the cleaned form of every raw tweet in its own column
dframe[ 'alphanumeric' ] = dframe[ 'text' ].map( alphanumericize )

### Tokenization

In [None]:
# tokenize the sentences so they can be processed further in the application
def tokenize( txt:str ) -> list:
    """
    Split text into a list of tokens with NLTK's word tokenizer.

    The input is converted with str() first. If tokenizing raises, the error is
    printed and an empty list is returned, so callers always receive a list.
    """
    try:
        return NLTK_word_tokenize( str( txt ) )
    except Exception as e:
        print( f"Error tokenizing: {e}" )
        return []
# Store the token list of every raw tweet in its own column
dframe[ 'tokens' ] = dframe[ 'text' ].map( tokenize )

In [None]:
# Take a look at the results of tokenization.
# Only the first HEAD_SIZE rows are printed: display.max_rows is set very high
# in this notebook, so printing the whole column would dump every row.
print( dframe[ 'tokens' ].head( HEAD_SIZE ) )

#NB currently contains , '*', fragment of URL, breaks up contractions (I've, I'd, couldn't, etc.), shows NaNs

### Normalization

In [None]:
# Should this be combined with primary regex activity?
# lowercase the text, strip punctuation, and squeeze whitespace
def normalize( txt: str ) -> str:
    """
    Return a lowercased copy of the text with every character that is neither a
    word character nor whitespace removed, whitespace runs collapsed to one
    space, and no leading or trailing whitespace.

    The input is converted with str() first. If a step raises, the error is
    printed and "" is returned.
    """
    raw_text = str( txt )
    try:
        without_punctuation = REGEX.sub( r'[^\w\s]', '', raw_text.lower() )
        single_spaced = REGEX.sub( r'\s+', ' ', without_punctuation )
        return single_spaced.strip()
    except Exception as e:
        print( f"Error normalizing: {e}" )
        return ""
# Store the normalized form of every raw tweet in its own column
dframe['normalized_text'] = dframe['text'].map( normalize )

### Stopword Removal

In [None]:
# https://www.geeksforgeeks.org/removing-stop-words-nltk-python/
# https://en.wikipedia.org/wiki/Stop_word
# Stopword sets keyed by language, filled on first use so the NLTK corpus is
# read once instead of once per word.
_STOPWORD_CACHE = {}

def _stopword_set( language: str = 'english' ) -> frozenset:
    """Return the NLTK stopword list for `language` as a frozenset, loading it once."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[ language ] = frozenset( NLTK_stopwords.words( language ) )
    return _STOPWORD_CACHE[ language ]

def remove_stopwords( txt: str ) -> str:
    """
    Return the text with English stopwords removed.

    The input is converted with str() and split on whitespace; a word is dropped
    when its lowercase form is in the NLTK English stopword list. Surviving
    words keep their original case and are joined with single spaces.

    If loading the stopwords or filtering raises, the error is printed and ""
    is returned.
    """
    txt = str( txt )
    try:
        stopword_set = _stopword_set()
        filtered_words = [ word for word in txt.split() if word.lower() not in stopword_set ]
    except Exception as e:
        print( f"Error removing stopwords: {e}" )
        return ""
    return ' '.join( filtered_words )
# Store the stopword-free form of every raw tweet in its own column
dframe[ 'no_stopwords' ] = dframe[ 'text' ].map( remove_stopwords )

### Lancaster Stemming

In [None]:
# https://www.nltk.org/api/nltk.stem.lancaster.html
# https://en.wikipedia.org/wiki/Stemming
# Set up for Lancaster stemming: the removal list (English stopwords plus ASCII
# punctuation), the stemmer instance, and the raw texts as a plain list.
# TODO(review): `stemmer` and `stuff_to_be_removed` are created here but not yet
# applied to the texts in this cell.
stuff_to_be_removed = list( NLTK_stopwords.words( 'english' ) ) + list( STR_punctuation )
stemmer = NLTK_lancaster_stemer()
text_list = dframe[ 'text' ].tolist()
print( len( text_list ) )
# Preview only the first HEAD_SIZE texts instead of printing every document
for each_txt in text_list[ :HEAD_SIZE ]:
    print( each_txt )

## Data Conditioning Evaluation

In [None]:
# Report the memory used by each column, index included
dframe.memory_usage( index=True )

In [None]:
 # Bar chart of how many rows carry each sentiment label
dframe[ 'sentiment' ].value_counts().plot.bar()

In [None]:
# Pie chart of each sentiment label's share; the trailing ';' hides the Axes repr
dframe[ 'sentiment' ].value_counts().plot.pie();

In [None]:
# Tally how many rows carry each sentiment label (neutral, positive, negative)
dframe[ 'sentiment' ].value_counts()

In [None]:
# Show word frequency data for the tweet text.
# The frequencies come from the 'text' column: the 'sentiment' column holds only
# the class labels, whose counts are already shown in the cells above.
# Missing values are skipped and entries cast to str so the join cannot fail on NaN.
corpus_text = ' '.join( dframe[ 'text' ].dropna().astype( str ) )
word_frq = NLTK_freq_dist( NLTK_word_tokenize( corpus_text ) )
MPLTLIB_pyplt.figure( figsize=( 10, 6 ) )
word_frq.plot( 20, cumulative=False )  # 20 most frequent tokens
MPLTLIB_pyplt.title( 'Word Frequency Distribution' )
MPLTLIB_pyplt.xlabel( 'Word')
MPLTLIB_pyplt.ylabel( 'Frequency' )
MPLTLIB_pyplt.show()

In [None]:
# Build a separate two-column frame for exploration: the raw text cast to str,
# plus the sentiment labels taken positionally (via .values), on a fresh index.
final_corpus = dframe[ 'text' ].astype( str ).tolist()
data_eda = PNDS.DataFrame(
    {
        'text': final_corpus,
        'sentiment': dframe[ 'sentiment' ].values,
    }
)
data_eda.head()

## Train and Test Split Model

In [None]:
# https://scikit-learn.org/1.5/modules/generated/sklearn.model_selection.train_test_split.html#sklearn.model_selection.train_test_split
# https://en.wikipedia.org/wiki/Training,_validation,_and_test_data_sets
# Keep only rows that have both a feature text and a label: the TF-IDF
# vectorizer rejects NaN documents and a NaN label cannot be learned from.
model_rows = dframe.dropna( subset=[ 'selected_text', 'sentiment' ] )
X = model_rows[ 'selected_text' ]
Y = model_rows[ 'sentiment' ]
# 80/20 split with a fixed seed so the split is reproducible
X_train, X_test, Y_train, Y_test = SKLN_train_test_split( X, Y, test_size=0.2, random_state=42 )

In [None]:
# https://scikit-learn.org/1.5/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
# https://en.wikipedia.org/wiki/Tf%E2%80%93idf
# Fit the TF-IDF vectorizer on the training split only, then reuse the fitted
# vectorizer to transform the test split.
vectorizer = SKLN_tfidf_vectorizer()
XV_train = vectorizer.fit_transform( X_train )
XV_test = vectorizer.transform( X_test )

In [None]:
# https://scikit-learn.org/1.5/modules/model_evaluation.html
# Baseline score: the share of the most common sentiment label in dframe,
# i.e. the accuracy of always predicting that label.
sentiment_shares = dframe[ 'sentiment' ].value_counts( normalize=True )
score_baseline = sentiment_shares.max()
#print( score_baseline )

## Logistic Regression Model

In [None]:
# https://scikit-learn.org/1.5/modules/linear_model.html#logistic-regression
# https://en.wikipedia.org/wiki/Logistic_regression
# Fit logistic regression on the TF-IDF training matrix (fit returns the
# estimator itself), predict the test split, and report accuracy.
log_reg = SKLN_logistic_regression( n_jobs=-1 ).fit( XV_train, Y_train )
pred_log_reg = log_reg.predict( XV_test )
score_log_reg = SKLN_accuracy_score( y_true=Y_test, y_pred=pred_log_reg )
print( score_log_reg )

### Classification Report

In [None]:
# https://scikit-learn.org/1.5/modules/generated/sklearn.metrics.classification_report.html#sklearn.metrics.classification_report
# https://en.wikipedia.org/wiki/Statistical_classification
# Classification report comparing the test labels with the logistic regression predictions
print( SKLN_classification_report( Y_test, pred_log_reg ) )

### Confusion Matrix Display

In [None]:
# https://scikit-learn.org/1.5/modules/generated/sklearn.metrics.ConfusionMatrixDisplay.html#sklearn.metrics.ConfusionMatrixDisplay
# https://en.wikipedia.org/wiki/Confusion_matrix
# Confusion matrix for the logistic regression predictions
SKLN_confusion_matrix_display.from_predictions( Y_test, pred_log_reg )

## Decision Tree Classifier Model

In [None]:
# https://scikit-learn.org/1.5/modules/tree.html
# https://en.wikipedia.org/wiki/Decision_tree_learning
# Fit a decision tree on the TF-IDF training matrix, predict the test split,
# and report accuracy. random_state is fixed (as for the split and the random
# forest) so the tree, and therefore its score, is the same on every run.
decsn_tree = SKLN_decision_tree_classifier( random_state=0 )
decsn_tree.fit( XV_train, Y_train )
pred_decsn_tree = decsn_tree.predict( XV_test )
score_decsn_tree = decsn_tree.score( XV_test, Y_test )
print( score_decsn_tree )

### Classification Report

In [None]:
# https://scikit-learn.org/1.5/modules/generated/sklearn.metrics.classification_report.html#sklearn.metrics.classification_report
# https://en.wikipedia.org/wiki/Statistical_classification
# Classification report comparing the test labels with the decision tree predictions
print( SKLN_classification_report( Y_test, pred_decsn_tree ) )

### Confusion Matrix Display

In [None]:
# https://scikit-learn.org/1.5/modules/generated/sklearn.metrics.ConfusionMatrixDisplay.html#sklearn.metrics.ConfusionMatrixDisplay
# https://en.wikipedia.org/wiki/Confusion_matrix
# Confusion matrix for the decision tree predictions
SKLN_confusion_matrix_display.from_predictions( Y_test, pred_decsn_tree )

## Random Forest Classifier Model

In [None]:
# Fit a random forest (fixed seed) on the TF-IDF training matrix (fit returns
# the estimator itself), predict the test split, and report accuracy.
rnd_forst_class = SKLN_random_forest_classifier( random_state=0 ).fit( XV_train, Y_train )
pred_rnd_forst_class = rnd_forst_class.predict( XV_test )
score_rnd_forst_class = rnd_forst_class.score( X=XV_test, y=Y_test )
print( score_rnd_forst_class )

In [None]:
# Classification report comparing the test labels with the random forest predictions
print( SKLN_classification_report( Y_test, pred_rnd_forst_class) ) # the report is one multi-line string, so print() renders it as a table

In [None]:
# Confusion matrix for the random forest predictions
SKLN_confusion_matrix_display.from_predictions( Y_test, pred_rnd_forst_class )

## Cross-Model Evaluation

In [None]:
# Compare the baseline with the accuracy of each model, one score per line.
# sep='\n' puts each entry on its own line; embedding '\n' in the strings
# instead would let print's default space separator indent every later line.
print( f'Baseline model = {score_baseline}',
       f'Logistic regression = {score_log_reg}',
       f'Decision Tree Classification = {score_decsn_tree}',
       f'Random Forest Classifier = {score_rnd_forst_class}',
       sep='\n' )

## Test and Validation Sets on Models

In [None]:
# Make human-friendly
# Numeric sentiment codes and the label each one stands for
_SENTIMENT_LABELS = { 0: "negative", 1: "neutral", 2: "positive" }

def label_sentiment(n):
    """
    Translate a numeric sentiment code into its label.

    0 -> "negative", 1 -> "neutral", 2 -> "positive".

    Raises ValueError for any other value, so an unknown code can never come
    back silently as None.
    """
    try:
        return _SENTIMENT_LABELS[ n ]
    except ( KeyError, TypeError ):
        # KeyError: unknown code; TypeError: unhashable value such as a list
        raise ValueError( f"illegal sentiment classification: {n!r}" ) from None
    
def test_item( new_items ):
    """
    Classify one piece of text with all three fitted models and print each verdict.

    The text is vectorized with the notebook's fitted `vectorizer`, then passed
    to `log_reg`, `decsn_tree` and `rnd_forst_class`. A prediction that is
    already a string label is printed as-is; a numeric code is translated with
    `label_sentiment`. Returns None.

    NOTE(review): a second `test_item` is defined later in this notebook and
    replaces this one once that cell runs.
    """
    # A single document still has to be passed as a one-element collection
    new_xv_test = vectorizer.transform( [ new_items ] )
    fitted_models = (
        ( 'Logistic regression', log_reg ),
        ( 'Decision Tree Classification', decsn_tree ),
        ( 'Random Forest Classifier', rnd_forst_class ),
    )
    for model_name, model in fitted_models:
        # One input row gives exactly one prediction, at index 0
        predicted = model.predict( new_xv_test )[ 0 ]
        label = predicted if isinstance( predicted, str ) else label_sentiment( predicted )
        print( f'{model_name}: {label}' )

In [None]:
# utility function
def to_upper( txt: str ) -> str:
    """Return `txt` converted to upper case via its `upper()` method."""
    return txt.upper()

def test_item( new_items ):
    """
    Upper-case one piece of text and return its representation from the
    notebook's fitted `vectorizer`.

    The text is wrapped in a one-row DataFrame, an 'upper' column is derived
    with `to_upper`, and that column is passed to `vectorizer.transform`, whose
    result is returned unchanged.

    NOTE(review): this redefines an earlier `test_item` in the notebook; after
    Run All this later definition is the one in effect.
    """
    single_row = PNDS.DataFrame( { "text": [ new_items ] } )
    single_row[ 'upper' ] = single_row[ 'text' ].apply( to_upper )
    return vectorizer.transform( single_row[ 'upper' ] )

# Smoke test: run test_item on one sample sentence and print what it returns
print( test_item( "I am happy" ) )