# Sentiment Analysis

### 1) Classifier per Category

In [1]:
## Scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay

## Librerias para graficación
import matplotlib.pyplot as plt
import seaborn as sns

# NLTK es una librería particular para PLN. Tiene muchas funcionalidades entre ellas stemming y lista de palabras de parada.
import nltk
from nltk.corpus import stopwords
import re
import pandas as pd
import os
import csv

stemmer = nltk.stem.SnowballStemmer('english') # Vamos a utlizar el Snowball Stemmer para realizar stemming (nos permite llevar las palabras a una forma estandar).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
#Preprocessing of words
def processing_text(texto):
    processed_feature = re.sub(r'\W', ' ', str(texto))
    processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)
    processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature)
    processed_feature = re.sub(r'[0-9]+', ' ', processed_feature)

    return processed_feature

In [3]:
# Define function to read and preprocess the reviews
def read_and_preprocess_reviews(folder_path):
    
    dictionary_training = {}
    dictionary_test = {}

    #Define stopwords
    stopwords_english = set(stopwords.words('english'))

    for category in os.listdir(folder_path):
        category_path = os.path.join(folder_path, category) #Category path
        if os.path.isdir(category_path):
            for sentiment in ["positive","negative"]:
                sentiment_path = os.path.join(category_path, sentiment + '.review') #Sentiment path
                if os.path.isfile(sentiment_path):
                    with open(sentiment_path, 'r',encoding='utf-8') as file:
                        review_text = file.read()
                        # Preprocess input data
                        preprocess = processing_text(review_text)
                        words = nltk.word_tokenize(preprocess)
                        string = ""
                        train_reviews = []
                        train_labels = []
                        
                        for word in words:
                            if word == "positive" or word == "negative":
                                train_labels.append(word)
                                train_reviews.append(string)
                                string = ""
                            else:
                                string += word + " "
                        
        # list of reviews per category with their respective class(posituive or negative)     
        dictionary_training[category] = (train_reviews,train_labels)

        test_reviews = []
        test_label = []
        #Test set
        sentiment_test_path = os.path.join(category_path, 'unlabeled.review') #Sentiment path test
        if os.path.isfile(sentiment_test_path):
            with open(sentiment_test_path, 'r',encoding='utf-8') as file:
                review_text = file.read()
                # Preprocess and remove stopwords
                preprocess = processing_text(review_text)
                words = nltk.word_tokenize(preprocess)
                for word in words:
                    if word == "positive" or word == "negative":
                        test_label.append(word)
                        test_reviews.append(string)
                        string = ""
                    else:
                        string += word + " "
                        
        dictionary_test[category] = (test_reviews,test_label)

    return dictionary_training,dictionary_test
                

data_folder = './Multi Domain Sentiment/processed_acl'
reviews_training_set = read_and_preprocess_reviews(data_folder)[0]
unlabeled_test_text = read_and_preprocess_reviews(data_folder)[1]

In [4]:
print(reviews_training_set["kitchen"][0][0])
#print(len(unlabeled_test_text))

right_after guess dog well fill to_work it_seems keep other_small work_ num working_very care barks cairn small_dog time and fairly sure_why is_supposed emits and_barks just_looks either_does such_a for_my either away emits and such sure it_either my they re_getting again it to_care the_spray care_if does_not a_persistent getting_sprayed just my_other sprays_as we re_not looks sprays if_they re are_just is_not my_yappy too_stubborn doesn t_work looks_away when dog_who barker barking they re we_fill fill_it not_sure work sprayed_i long it_does also seems also_doesn small cairn_from we sprayed again it_also yappy persistent_barker where not_such persistent why terriers isn t_working long it who if just_too very num _of time and_we re from_where right work_well he very_long it well_for supposed_to too when_we terriers_are we re after_but keep_my num other getting after stubborn yappy_cairn not the_time and as_many stubborn_to supposed where_the well_right fairly_well doesn work_fairly man

In [5]:
#Bag of words
vectorizer = CountVectorizer(max_features=2500, stop_words=stopwords.words('english'))
# Vocabulario y tambien transforme nuestro texto
for categories in reviews_training_set:
    
    reviews = reviews_training_set[categories][0]
    labels = reviews_training_set[categories][1]
    
    unlabeled_reviews = unlabeled_test_text[categories][0]
    unlabeled_labels = unlabeled_test_text[categories][1]

    texto_features = vectorizer.fit_transform(reviews).toarray()

    #Entrenar el modelo naive bayes
    nb = MultinomialNB()
    nb.fit(texto_features, labels)

    #Evaluar que tan buneo es el modelo
    predictions = nb.predict(unlabeled_reviews)

    # Ahora calculamos el score de accuracy enviando las predicciónes y los valores reales de polaridad.
    print(accuracy_score(unlabeled_labels, predictions))

ValueError: Expected 2D array, got 1D array instead:
array=['avid your horrible_book wasted use_it the_entire money i_lit i_read lit i_would relationship read a_ num reader_and reader suffering fire_one i_had year_old gotten horrible lit_this world don my one_star headache_the this_book mom was_horrible friend book_horrible star_i back avid_reader than_one life copy rate_it rate my_mom man book_was half on_fire and_then reading_this so lower i_could num _year than time half_of time_spent then book and_picked possible spent old_man up_after one horrible_if one_less part was entire less_copy to_rate my_life about_the your_money an_avid if the_relationship use a_headache fire lower_than reading a_friend picked purposes then_got waste_your after_my friend_i old man_and and_i world don t_waste book_on part_about copy_in book_back book_wasted have_i time_and the_world don better if_it star got mom_had read_half waste after about could_use had_gotten was_possible year it_lower relationship_the wasted_my wish wish_i boy purposes_this got_to the_time it_was back_so suffering_from spent_reading book_up less better_purposes headache possible_to money i_wish for_better it_suffering the_part gotten_it picked_this entire_time old_boy i_am the_ num boy_had num so_i label '
 'to_use shallow found he_castigates castigates_for items_which and_panders so play_for person_with was reviews offer_this perceptions_about usually for_dissenting when_purchasing smug world_this this_work offer no with_little tone_throughout a_person bashers_of portrayal_of use_the play review i_found i_like popularity world the_amazon self indulgence me it_smug i_offer the_family seriously like_to and_self indulgent questions_the shrink _bashers work_ books_especially disuades_me especially_for which_usually purchasing_books throughout_the family_therapist work distasteful self serving distasteful_to example alert_for reaches_for dissenting_perceptions review_that dissenting found_it book_was people selection_so higly_rated the_world usually_disuades and_reaches little throughout implausible amazon for_effect panders_to popularity_of bashers for_example when especially use very therapist_seems no_empathy the_book very_distasteful this_review amazon_reviews a_selection self serving_and reviews_when little_or smug_self serving panders purchasing he written_by self indulgent_written shrink especially_alert empathy_especially selection castigates self indulgent perceptions written people_he reaches seriously_questions family about_higly questions rated empathy portrayal about books his rated_items higly shallow_self indulgence alert tone person seems_implausible items effect_tone disuades book like to_me his_portrayal effect was_very the_popularity or_no effect_and the_ shrink me_from therapist implausible_and that_seriously this_play seems which the_people so_i example_his label '
 'avid your horrible_book wasted use_it the_entire money i_lit i_read lit i_would relationship read a_ num reader_and reader suffering fire_one i_had year_old gotten horrible lit_this world don my one_star headache_the this_book mom was_horrible friend book_horrible star_i back avid_reader than_one life copy rate_it rate my_mom man book_was half on_fire and_then reading_this so lower i_could num _year than time half_of time_spent then book and_picked possible spent old_man up_after one horrible_if one_less part was entire less_copy to_rate my_life about_the your_money an_avid if the_relationship use a_headache fire lower_than reading a_friend picked purposes then_got waste_your after_my friend_i old man_and and_i world don t_waste book_on part_about copy_in book_back book_wasted have_i time_and the_world don better if_it star got mom_had read_half waste after about could_use had_gotten was_possible year it_lower relationship_the wasted_my wish wish_i boy purposes_this got_to the_time it_was back_so suffering_from spent_reading book_up less better_purposes headache possible_to money i_wish for_better it_suffering the_part gotten_it picked_this entire_time old_boy i_am the_ num boy_had num so_i label '
 ...
 'favorable_reviews heard straight book pamphlet disappointed was reviews is_good read x _pages pages_with completely many_cartoons provide_excercises misled_by is_completely provide x does_not favorable excercises_to improve alignment too it_straight good_and simply posture_get posture_there better_alternatives huge it_simply in_information font improve_it i_was and_recommend for_ posture posture reviews_it great this_book pamphlet information_it much posture and_bad heard_of are_much completely_lacking successfully_defines lacking defines not_provide have_heard to_improve huge_font get what_is get_it being_misled good alternatives_i successfully better was_extremely alignment_and recommend bad_posture what practice excercises extremely_disappointed much_much it_successfully disappointed_by pages too_many alternatives great_reviews misled num _ x defines_what and_too num other cartoons has_ num with_huge read_posture not lacking_in have_read extremely book pamphlet_being recommend_that to_ practice posture_alignment other_favorable states_to i_have much_better states font_and posture_but information many good_posture simply_states practice _good of_great bad the_other reviews_for cartoons_it label '
 'helpful this_one substance_and pages_devoted ideas package something_you this_book person that_what for_one of_time amount_of time gets_ num restauranteur has_very effort read you_don to_entrepreneurship this_type num play_to business_ideas the_typical similar_books some which book_is otherwise relates_to idea otherwise_i have_not little_substance relates great time_or in_addition helpful_in what_may will_not require_a value devoted_to doing or_having of_business author_does addition many the_ideas masonry entrepreneurship_has for_ideas entrepreneurship ideas_listed the_case not_read typical_package one stars_because pages person_may a_tremendous title devoted tremendous putting_in that_require in_many book_gets where very_little play very deal not_require however_this few_pages may_be to_another listed are_numerous num _stars is_sparse of_work the_time be_play work for_example this a_great work_for idea_is package_of gets don t_mind because_the the_few business has_some this_relates addition_there book be_work deal_of what each_idea book_will state another_however cases great_deal having_a case_for where_you take_a not_the books mind_putting information_under enjoy you example this_book similar may stars the_author require type because mind for_many passion books_of which_is ideas masonry some_value many_of author value_for numerous pass_on putting not_helpful ideas_the or_work to_doing or_effort effort_which okay you_enjoy passion_where one_person a_passion little numerous_ideas take suggests_that listed_in is_not something pass the_title sparse_and however state_that another substance i_would ideas_that enjoy_or cases_not many_cases example this typical not sparse would_take suggests having work_the ideas masonry_restauranteur case type_otherwise few does_state the_information okay_for doing_something the_business title_suggests a_pass amount book_has restauranteur_for the_book don information read_similar tremendous_amount label '
 'see_other written_by objective_as columnist significant to_uncritically back_and this_book author s_congregation hallelujah bet_you documentaries _like bill_harry calls those_who already clear say_that movie didn t_know and_racist just_laziness enough a_member proclaimed work_but who_are movie_was himself friend significant_misrepresentations jew_or agenda focused_books agenda focused learn_history uncritically_accept it s_foolish and_you just movie_to reefer_madness let thompson_it book_or eternal_jew himself_now way_i ll moore factually was_written hunting _need challenged think_they like are_credulous care_you ll moore the_way already_a accept shout need_to great easterners_who you_didn image political well known eye opening be_honest typically challenged_fahrenheit member thinkers_understand truth revealing_work the_world objective as_objective followup_movie easterners friend_of point just_reefer produced now_calls _a foolish political_books pro clinton and_films many columnist_and was_produced have_ecstatically world laziness_these ecstatically who whether_intentional jew history_through presented understand_that congregation seen_moore middle_easterners the_representations know either thinkers see it s_just middle the_shallow point_is misrepresentations_if strongly shouldn or_reefer strongly_pro clinton if_you re calls_ satire clear_thinkers or_just image_he accept_the say triumph_of an_eye opening satire _by you re_already factually_challenged outraged know_that about_as films outraged_at books congregation_you to_think credulous these_dime dozen like_triumph films_like foolish_to fahrenheit work_that a_work hunting _has eye opening_truth revealing intentional he_presented the_movie many_middle representations that_moore ecstatically_proclaimed and_culture fahrenheit_ understand books_typically work reefer_madness proclaimed_this harry_thompson either_this madness harry they learn fob book seen books_and way was that_ hunting has_given of_their inhale several_reviewers ll_bet people_this laziness in_either through shallow credulous_enough followup you world_great dime dozen_political to_go fob_friend whether shouldn t_inhale several need shout_ hallelujah contain won honest_people truth revealing uncritically satire reviewers author it s_about you re given_the their_people reviewers_have documentaries through_agenda focused that_many inhale several other by_well known madness let s_be but_clear the_point hunting contain_significant representations_in culture_the reefer is_whether and_see go_back great_truths bet shallow_stereotypical madness let hallelujah _and stereotypical_and the_author written truths_but you_shouldn member_of if the_followup moore s_movie racist moore_himself you ll_shout truths he other_ documentaries culture stereotypical ll go enough_to won t_care presented_of the_factually book_was misrepresentations like_ hunting are_outraged now didn have_seen of_bill typically_contain pro clinton_columnist you ll and_say about produced_by back and_movie bill people book_and intentional_or the_eternal history triumph you_won well known_fob honest who_have people_and care a_strongly thompson given think they_learn eternal racist_image their movie_are dime dozen label '].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.