In [None]:
"""
visona-sentiment-analysis-prototype.ipynb
Jonathan Visona
CPSC 8985-02 - FA2024
"""

# utility imports
    # https://docs.python.org/3/c-api/index.html

#import string # unused?
from string import punctuation as STR_punctuation
import re as REGEX
import warnings as WARN

# visualization imports
    # https://matplotlib.org/stable/api/index.html

import matplotlib.pyplot as MPLTLIB_pyplt
    # https://seaborn.pydata.org/api.html

#import seaborn as sb # not used? I think defaults change matplotlib outputs

# math and data science imports
    # https://numpy.org/doc/2.1/reference/index.html
    # https://pandas.pydata.org/docs/reference/index.html

#import numpy as np # not used?
import pandas as PNDS

# nltk imports
    # https://www.nltk.org/api/nltk.html

from nltk.tokenize import word_tokenize as NLTK_word_tokenize
from nltk.stem import LancasterStemmer as NLTK_lancaster_stemer
from nltk.corpus import stopwords as NLTK_stopwords
from nltk.stem.wordnet import WordNetLemmatizer as NLTK_wordnet_lemmatizer
from nltk.probability import FreqDist as NLTK_freq_dist

# sci-kit imports
    # https://scikit-learn.org/stable/api/index.html

from sklearn.metrics import accuracy_score
#from sklearn.metrics import accuracy_score as SKLN_accuracy_score
from sklearn.metrics import classification_report as SKLN_classification_report
from sklearn.metrics import ConfusionMatrixDisplay as SKLN_confusion_matrix_display
from sklearn.model_selection import train_test_split as SKLN_train_test_split
from sklearn.linear_model import LogisticRegression as SKLN_logistic_regression
from sklearn.ensemble import RandomForestClassifier as SKLN_random_forest_classifier
from sklearn.feature_extraction.text import TfidfVectorizer as SKLN_tfidf_vectorizer
from sklearn.tree import DecisionTreeClassifier as SKLN_decision_tree_classifier

In [None]:
# Data source: Kaggle, https://www.kaggle.com/datasets/abhi8923shriv/sentiment-analysis-dataset

# --- settings a reader might tune ---
WARN_OFF = False # set to True to silence warnings once debugging is finished
ENCODING = 'latin1'
REL_PATH = '../data/'
TRAIN_FILE = 'train.csv'
TEST_FILE = 'test.csv'
MAX_COLS = 20
MAX_ROWS = 3_000_000
HEAD_SIZE = 25

if WARN_OFF:
    WARN.filterwarnings( 'ignore' )

# pandas display limits for wide / long frames
PNDS.set_option( 'display.max_columns', MAX_COLS )
PNDS.set_option( 'display.max_rows', MAX_ROWS )

# read the two CSV files that make up the dataset
training_data = PNDS.read_csv( f'{REL_PATH}{TRAIN_FILE}', encoding=ENCODING )
test_data = PNDS.read_csv( f'{REL_PATH}{TEST_FILE}', encoding=ENCODING )

In [None]:
# Combine the training and testing data into a single frame.
# ignore_index=True gives every row a unique 0..n-1 label; without it the row
# labels of the second file repeat those of the first.
dframe = PNDS.concat( [ training_data, test_data ], ignore_index=True )
#DEBUG print( f"{type(dframe)=}, {len(dframe)=}" )

In [None]:
# Show the top rows of the data
dframe.head( n=HEAD_SIZE )

In [None]:
# Provide general information on dataset
dframe.info( verbose=True )

In [None]:
# drop irrelevant data from dframe
dframe = dframe.drop( columns=[ 'textID','Time of Tweet', 'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)', 'Density (P/Km²)' ] )

In [None]:
# peek at current cols
column_list = dframe.columns.tolist()
print( column_list )

In [None]:
# Solve gaps in data by dropping NaNs on current Dataframe with counts pre- and post-process
nan_count_per_column = dframe.isna().sum()
print(nan_count_per_column)

In [None]:
dframe.dropna( inplace=True )
nan_count_per_column = dframe.isna().sum()
print(nan_count_per_column)

In [None]:
# peek at current cols
column_list = dframe.columns.tolist()
print( column_list )


In [None]:
# clean up text to deal with links, non-alphanumerics, URLs, whitespace, etc.
def alphanumericize( txt: str ) -> str:
    """Reduce a text to letters, digits and single spaces.

    The input is coerced with ``str()`` and then passed through three
    substitutions, in this order:

    1. remove anything that looks like a tag (``<...>``),
    2. remove every character that is not an ASCII letter, a digit or whitespace,
    3. collapse each run of whitespace into one space.

    URLs get no special treatment; their punctuation is removed by step 2.

    Args:
        txt: the value to clean; non-string values are converted with ``str()``.

    Returns:
        The cleaned string, without leading or trailing whitespace.
    """
    # A tuple of pairs keeps the order explicit: tags must be removed while
    # their angle brackets are still present.
    substitutions = (
        ( r'<.*?>', '' ),
        ( r'[^a-zA-Z0-9\s]', '' ),
        ( r'\s+', ' ' ),
    )
    cleaned = str( txt )
    for pattern, replacement in substitutions:
        # strip before each pass so whitespace exposed by the previous pass
        # never survives at the ends of the result
        cleaned = REGEX.sub( pattern, replacement, cleaned.strip() )
    return cleaned
# store the cleaned form of every raw tweet in its own column
dframe['alphanumeric'] = dframe['text'].apply( func=alphanumericize )

In [None]:
# tokenize the sentences so they can be processed further in the application
def tokenize( txt:str ) -> list:
    """Split a text into word tokens with NLTK's ``word_tokenize``.

    Args:
        txt: the value to tokenize; it is converted with ``str()`` first.

    Returns:
        The list of tokens. If tokenization raises, the error is printed and
        an empty list is returned, so the result is always a list.
    """
    try:
        return NLTK_word_tokenize( str( txt ) )
    except Exception as e:
        print( f"Error tokenizing: {e}" )
        # keep the declared return type: an empty token list, not an empty string
        return []
dframe[ 'tokens' ] = dframe[ 'text' ].apply( tokenize )

In [None]:
# Take a look at the results of tokenization
print( dframe[ 'tokens' ] )

#NB currently contains , '*', fragment of URL, breaks up contractions (I've, I'd, couldn't, etc.), shows NaNs

In [None]:
# convert to lowercase and remove redundant space around tokens
def normalize( txt: str ) -> str:
    """Lowercase a text, drop its punctuation and tidy its whitespace.

    The input is coerced with ``str()``. Characters that are neither word
    characters nor whitespace are removed, runs of whitespace are collapsed to
    one space, and the ends are trimmed.

    Returns:
        The normalized string, or "" if an error occurs (the error is printed).
    """
    raw = str( txt )
    try:
        without_punctuation = REGEX.sub( r'[^\w\s]', '', raw.lower() )
        single_spaced = REGEX.sub( r'\s+', ' ', without_punctuation )
        return single_spaced.strip()
    except Exception as e:
        print( f"Error normalizing: {e}" )
        return ""
# store the normalized form of every raw tweet in its own column
dframe[ 'normalized_text' ] = dframe[ 'text' ].apply( func=normalize )

In [None]:
# Remove stopwords provided by the nltk corpus
def _english_stopwords() -> frozenset:
    """Return NLTK's English stopwords as a set, reading the corpus only once."""
    cached = getattr( _english_stopwords, '_cache', None )
    if cached is None:
        cached = frozenset( NLTK_stopwords.words( 'english' ) )
        # remember the set on the function so later calls skip the corpus read
        _english_stopwords._cache = cached
    return cached

def remove_stopwords( text: str ) -> str:
    """Remove English stopwords (from the NLTK corpus) from a text.

    The input is coerced with ``str()`` and split on whitespace; a word is
    dropped when its lowercase form is an NLTK English stopword. The surviving
    words keep their original casing and are re-joined with single spaces.

    Args:
        text: the value to filter; non-string values are converted with ``str()``.

    Returns:
        The filtered text, or "" if an error occurs (the error is printed).
    """
    text = str( text )
    try:
        stopword_set = _english_stopwords()
        filtered_words = [ word for word in text.split() if word.lower() not in stopword_set ]
    except Exception as e:
        print( f"Error removing stopwords: {e}" )
        return ""
    return ' '.join( filtered_words )
# store the stopword-free form of every raw tweet in its own column
dframe['text_without_stopwords'] = dframe['text'].apply( func=remove_stopwords )

In [None]:
# provide memory usage of columns
dframe.memory_usage()

In [None]:
 # Look at bar chart see counts as well as relative size
dframe[ 'sentiment' ].value_counts().plot( kind='bar' )

In [None]:
# Look at a pie graph to get relative size of three polarities
dframe[ 'sentiment' ].value_counts().plot( kind='pie' );

In [None]:
# Examine the tally of neutral, positive, and negative sentiments
dframe[ 'sentiment' ].value_counts()

In [None]:
#
dframe[ 'sentiment_code' ] = dframe[ 'sentiment' ].astype( 'category' ).cat.codes
#sentiment_distribution = dframe[ 'sentiment_code '].value_counts()
#sentiment_distribution.plot( kind='bar' )
MPLTLIB_pyplt.show()

In [None]:
# inspect stopwords and punctuation here

In [None]:
# Prepare the Lancaster stemmer plus a combined list of English stopwords and
# punctuation characters, then take a quick look at the raw text
stuff_to_be_removed = [ *NLTK_stopwords.words( 'english' ), *STR_punctuation ]
stemmer = NLTK_lancaster_stemer()
text_list = list( dframe['text'] )
print( len( text_list ) )
#for each_txt
print( text_list[1] )

In [None]:
# Show word frequency data for the tweet text.
# The words come from the 'text' column; the 'sentiment' column only holds the
# class labels, so counting it would just repeat the class tally.
word_frq = NLTK_freq_dist( NLTK_word_tokenize( ' '.join( dframe[ 'text' ].astype( str ) ) ) )
MPLTLIB_pyplt.figure( figsize=( 10, 6 ) )
# show=False stops FreqDist.plot from displaying the figure straight away, so
# the title and axis labels below are applied to the same chart
word_frq.plot( 20, cumulative=False, show=False )
MPLTLIB_pyplt.title( 'Word Frequency Distribution' )
MPLTLIB_pyplt.xlabel( 'Word')
MPLTLIB_pyplt.ylabel( 'Frequency' )
MPLTLIB_pyplt.show()

In [None]:
# Separate two-column frame (text as str, sentiment) with a fresh 0..n-1 index.
# Open question from the author: why another DataFrame?
final_corpus = dframe['text'].astype( str ).tolist()
data_eda = PNDS.DataFrame( {
    'text': final_corpus,
    'sentiment': dframe['sentiment'].values,
} )
data_eda.head()

In [None]:
# Numeric encoding of the tweet metadata columns.
# An earlier cell drops these columns from dframe, so each step runs only when
# its column is still present; this keeps the cell from raising KeyError.
# Open question from the author: is this data needed at all?
for metadata_column in ( 'Time of Tweet', 'Country' ):
    if metadata_column in dframe.columns:
        # replace each distinct value with its integer category code
        dframe[ metadata_column ] = dframe[ metadata_column ].astype( 'category' ).cat.codes
if 'Age of User' in dframe.columns:
    # replace each age bracket with one representative age
    dframe[ 'Age of User' ] = dframe[ 'Age of User' ].replace( {'0-20':18,'21-30':25,'31-45':38,'46-60':53,'60-70':65,'70-100':80} )

In [None]:
# This seems redundant
def wp( text ):
    """Strip URLs, tags, punctuation, newlines and digit-bearing words from a text.

    The removals run in this order: http(s)/www URLs, ``<...>`` tags, every
    character in ``string.punctuation``, newline characters, and any word that
    contains a digit. Whitespace left behind by a removal is not collapsed.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    text = REGEX.sub( r'https?://\S+|www\.\S+', '', text )
    text = REGEX.sub( r'<.*?>+', '', text )
    # STR_punctuation is string.punctuation, imported at the top of the
    # notebook; it is escaped so it can sit safely inside a character class
    text = REGEX.sub( '[%s]' % REGEX.escape( STR_punctuation ), '', text )
    text = REGEX.sub( r'\n', '', text )
    text = REGEX.sub( r'\w*\d\w*', '', text )
    return text
# overwrite selected_text with its cleaned form
dframe['selected_text'] = dframe['selected_text'].apply( func=wp )

In [None]:
#
X = dframe[ 'selected_text' ]
Y = dframe[ 'sentiment' ]
X_train, X_test, Y_train, Y_test = SKLN_train_test_split( X, Y, test_size=0.2, random_state=42 )

In [None]:
# 
vectorizer = SKLN_tfidf_vectorizer()
XV_train = vectorizer.fit_transform( X_train )
XV_test = vectorizer.transform( X_test )

In [None]:
#
score_baseline = dframe[ 'sentiment' ].value_counts( normalize=True ).max()
score_baseline

In [None]:
#
lr = SKLN_logistic_regression( n_jobs=-1 )
lr.fit( XV_train,Y_train )

In [None]:
#
pred_lr = lr.predict( XV_test )

In [None]:
# Accuracy of logistic regression on the test split.
# accuracy_score is imported unaliased at the top of the notebook; the
# SKLN_accuracy_score alias is commented out there, so it must not be used here.
score_lr = accuracy_score( Y_test, pred_lr )
print( score_lr )

In [None]:
# not defined; prob sklern
print( SKLN_classification_report( Y_test, pred_lr ) )

In [None]:
# 
SKLN_confusion_matrix_display.from_predictions( Y_test, pred_lr )

In [None]:
# Fit a decision tree on the TF-IDF training matrix.
# random_state is fixed (same seed as the random forest in this notebook) so
# that a fresh Restart & Run All reproduces the same tree and the same scores.
dt = SKLN_decision_tree_classifier( random_state=0 )
dt.fit( XV_train, Y_train )

In [None]:
# 
pred_dt = dt.predict(XV_test)

In [None]:
# 
score_dt = dt.score( XV_test, Y_test )
score_dt

In [None]:
#
print( SKLN_classification_report( Y_test, pred_dt ) )

In [None]:
# Confusion matrix for the decision tree, using the ConfusionMatrixDisplay
# alias that is already imported at the top of the notebook
SKLN_confusion_matrix_display.from_predictions( Y_test, pred_dt )

In [None]:
# Fit a random forest on the TF-IDF training matrix (fixed seed), using the
# RandomForestClassifier alias that is already imported at the top of the notebook
rfc = SKLN_random_forest_classifier( random_state=0 )
rfc.fit( XV_train, Y_train )

In [None]:
# 
pred_rfc = rfc.predict( XV_test )

In [None]:
# 
score_rfc = rfc.score( XV_test, Y_test )
score_rfc

In [None]:
#
print( SKLN_classification_report( Y_test, pred_rfc) )

In [None]:
# 
SKLN_confusion_matrix_display.from_predictions( Y_test, pred_rfc )

In [None]:
# Side-by-side summary of the baseline and the three model scores.
# Every line is an f-string so each score is interpolated, and the pieces are
# concatenated into one argument so print() adds no stray leading spaces.
print( f'Baseline model = {score_baseline}\n'
       f'Logistic regression = {score_lr}\n'
       f'Decision Tree Classification = {score_dt}\n'
       f'Random Forest Classifier = {score_rfc}' )

In [None]:
#
def label_sentiment(n):
    """Translate a numeric sentiment code into a readable label.

    Args:
        n: the sentiment code: 0 (negative), 1 (neutral) or 2 (positive).
            NOTE(review): presumably these match the category codes stored in
            dframe['sentiment_code'] -- confirm the ordering.

    Returns:
        "negative sentiment", "neutral sentiment" or "positive sentiment".

    Raises:
        ValueError: if ``n`` is not one of the three known codes. The check is
            an explicit raise rather than an ``assert`` so it cannot be
            disabled and never falls through to an implicit ``None``.
    """
    labels = {
        0: "negative sentiment",
        1: "neutral sentiment",
        2: "positive sentiment",
    }
    try:
        return labels[ n ]
    except KeyError:
        raise ValueError( f"illegal sentiment classification: {n!r}" ) from None
    
def test_item( news ):
    """Classify one piece of text with the fitted logistic-regression model.

    The text is cleaned with ``wp``, vectorized with the already-fitted
    ``vectorizer`` and passed to ``lr``; the resulting label is printed.

    NOTE: a later cell in this notebook defines another ``test_item``; once
    that cell has run, it replaces this function.

    Args:
        news: the text to classify.

    Returns:
        The printed label, e.g. "negative sentiment".
    """
    new_def_test = PNDS.DataFrame( { 'text': [ news ] } )
    new_x_test = new_def_test[ 'text' ].apply( wp )
    new_xv_test = vectorizer.transform( new_x_test )
    # exactly one row was vectorized, so the only valid index is 0
    prediction = lr.predict( new_xv_test )[ 0 ]
    # lr is fitted on Y_train, taken from dframe['sentiment'] -- presumably
    # string labels, which are used directly; integer codes are translated
    # with label_sentiment
    if isinstance( prediction, str ):
        label = f"{prediction} sentiment"
    else:
        label = label_sentiment( prediction )
    print( label )
    return label

In [None]:
#
def to_upper( txt ):
    """Return ``txt`` converted to upper case via its ``upper()`` method."""
    shouted = txt.upper()
    return shouted

def test_item( news ):
    """Upper-case one piece of text and vectorize it with the fitted ``vectorizer``.

    NOTE: this definition shares its name with the ``test_item`` defined in an
    earlier cell and replaces it once this cell has run.

    Args:
        news: the text to vectorize.

    Returns:
        Whatever ``vectorizer.transform`` returns for the single-row input.
    """
    single_row = PNDS.DataFrame( { "text": [ news ] } )
    single_row[ 'text' ] = single_row[ 'text' ].apply( to_upper )
    return vectorizer.transform( single_row[ 'text' ] )

# Try the helper on a short sample sentence
test_txt = "I am Sad"
test_item( news=test_txt )