In [1]:
import spacy
nlp = spacy.load("en_core_web_sm")



In [2]:
from textblob import TextBlob
import pandas as pd

In [3]:
import re

In [8]:
def categorise_polarity(row):
    """Translate a numeric polarity score into a sentiment label.

    Scores greater than zero map to 'positive', a score equal to zero maps
    to 'neutral', and every other value maps to 'negative'.
    """
    if row > 0:
        return 'positive'
    return 'neutral' if row == 0 else 'negative'

In [52]:
class Text:
    """Wrap a raw text and expose paragraph- and sentence-level NLP helpers.

    ``sentences`` builds a DataFrame with one row per sentence. The other
    DataFrame methods (``get_sentiment``, ``get_ner``, ``tokens``) add their
    columns to the frame they are given, so they can be applied one after
    another to the same frame.
    """

    def __init__(self, text: str):
        self.text = text

    def paragraphs(self) -> list:
        """Return the text split on newline characters.

        Blank lines and a leading/trailing newline yield empty strings in
        the returned list.
        """
        return self.text.split("\n")

    def sentences(self) -> pd.DataFrame:
        """Split the text into sentences using the module-level ``nlp`` pipeline.

        Returns:
            A DataFrame with the columns ``sentence_id`` (1-based position)
            and ``sentences`` (the sentence text as a string).
        """
        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        # "\s+" already matches newlines, so one substitution is enough.
        # Stripping removes the leading/trailing space left by surrounding
        # newlines, which would otherwise reach the parser as extra whitespace.
        text_clean = re.sub(r"\s+", " ", self.text).strip()
        doc = nlp(text_clean)
        sent_list = [str(sentence) for sentence in doc.sents]
        return pd.DataFrame(
            {
                "sentence_id": range(1, len(sent_list) + 1),
                "sentences": sent_list,
            }
        )

    def get_sentiment(self, sent_df: pd.DataFrame) -> pd.DataFrame:
        """Add ``polarity``, ``polarity_tag`` and ``subjectivity`` columns.

        Args:
            sent_df: frame with a ``sentences`` column of strings. It is
                modified in place.

        Returns:
            The same ``sent_df`` object with the three columns added.
        """
        # Analyse each sentence once and reuse the result for both scores.
        sentiments = [TextBlob(sentence).sentiment for sentence in sent_df["sentences"]]
        sent_df["polarity"] = [sentiment.polarity for sentiment in sentiments]
        sent_df["polarity_tag"] = sent_df["polarity"].apply(categorise_polarity)
        # Subjectivity quantifies the amount of personal opinion and factual information contained in the text.
        # The higher subjectivity means that the text contains personal opinion rather than factual information
        sent_df["subjectivity"] = [sentiment.subjectivity for sentiment in sentiments]
        return sent_df

    def get_ner(self, sent_df: pd.DataFrame) -> pd.DataFrame:
        """Add an ``entities`` column holding the entities found in each sentence.

        Args:
            sent_df: frame with a ``sentences`` column of strings. It is
                modified in place and also stored on ``self.sent_df``.

        Returns:
            The same ``sent_df`` object with the ``entities`` column added;
            each cell is a list of the ``ents`` of the parsed sentence.
        """
        self.sent_df = sent_df
        sent_df["entities"] = sent_df["sentences"].apply(
            lambda sentence: list(nlp(sentence).ents)
        )
        # TODO: also expose the label of each entity.
        return sent_df

    def tokens(self, sent_df: pd.DataFrame) -> pd.DataFrame:
        """Return one row per token of each sentence.

        Args:
            sent_df: frame with a ``sentences`` column of strings. A
                ``tokens`` column holding the parsed sentence is added to it
                in place.

        Returns:
            A new frame in which the ``tokens`` column has been exploded, so
            the other columns (and the index) repeat for every token.
        """
        # Map over the column itself: a row-wise DataFrame.apply gives back a
        # DataFrame instead of a Series when the frame has no rows, which
        # makes the column assignment fail.
        sent_df['tokens'] = sent_df['sentences'].apply(nlp)
        # TODO: build a new DataFrame with tokens, sentence_id and token_id.
        return sent_df.explode('tokens')
        
# TODO: convert to a DataFrame
# TODO: method to report the sentiment
# TODO: NER
# TODO: emotions

In [53]:
# Sample input: four short sentences, one per line, with a leading and a
# trailing newline inside the literal.
string = """
I love Chicago.
it was fine.
There is a horrible bag in my closet from Gucci.
I hate Smith Jones.
"""

In [54]:
# Wrap the sample text and build the sentence-level DataFrame
# (columns: sentence_id, sentences).
f = Text(string)
df= f.sentences()
# Last expression of the cell: displays the frame.
df

Unnamed: 0,sentence_id,sentences
0,1,I love Chicago.
1,2,it was fine.
2,3,There is a horrible bag in my closet from Gucci.
3,4,I hate Smith Jones.


In [55]:
# Adds polarity, polarity_tag and subjectivity columns to `df` itself and
# displays the returned frame.
f.get_sentiment(df)

Unnamed: 0,sentence_id,sentences,polarity,polarity_tag,subjectivity
0,1,I love Chicago.,0.5,positive,0.6
1,2,it was fine.,0.416667,positive,0.5
2,3,There is a horrible bag in my closet from Gucci.,-1.0,negative,1.0
3,4,I hate Smith Jones.,-0.8,negative,0.9


In [47]:
# Adds an `entities` column to `df` itself and displays the returned frame.
# NOTE(review): this cell's execution count is out of sequence and the saved
# output has no `subjectivity` column, so the output looks stale - re-run
# the notebook top to bottom to confirm.
f.get_ner(df)

Unnamed: 0,sentence_id,sentences,polarity,polarity_tag,entities
0,1,I love Chicago.,0.5,positive,[(Chicago)]
1,2,it was fine.,0.416667,positive,[]
2,3,There is a horrible bag in my closet from Gucci.,-1.0,negative,[(Gucci)]
3,4,I hate Smith Jones.,-0.8,negative,"[(Smith, Jones)]"


In [16]:
# Displays one row per token; the other columns repeat for every token.
# NOTE(review): the saved output shows sentences that differ from the current
# `string` (e.g. "it was terrible."), so it appears to come from an earlier
# run - re-run the notebook top to bottom to confirm.
f.tokens(df)

Unnamed: 0,sentence_id,sentences,polarity,polarity_tag,entities,tokens
0,1,I love Chicago.,0.5,positive,[(Chicago)],
0,1,I love Chicago.,0.5,positive,[(Chicago)],I
0,1,I love Chicago.,0.5,positive,[(Chicago)],love
0,1,I love Chicago.,0.5,positive,[(Chicago)],Chicago
0,1,I love Chicago.,0.5,positive,[(Chicago)],.
1,2,it was terrible.,-1.0,negative,[],it
1,2,it was terrible.,-1.0,negative,[],was
1,2,it was terrible.,-1.0,negative,[],terrible
1,2,it was terrible.,-1.0,negative,[],.
2,3,There is a shitty bag in my closet from Gucci.,0.0,neutral,[(Gucci)],There
