In [1]:
import pandas as pd
import numpy as np
import plotly
import re
import math
import textblob
from textblob import TextBlob
from textblob import Blobber
from textblob.wordnet import Synset

In [2]:
# Make sure the NLTK stop-word corpus is available locally (the download is
# skipped when the package is already up to date), then expose the corpus reader.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Julie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Show the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Render matplotlib figures inside the notebook output.
%matplotlib inline

# To do: 
- Separate by "or" and ";" also! - Done!
- *Polarity check before meaning check because of stop words processing should be better 'v' (kinda done)*

#### Reading the pre-cleaned data

In [4]:
# Load the pre-cleaned flower/meaning table (semicolon-separated).
# `on_bad_lines="warn"` replaces `error_bad_lines=False`, which was deprecated in
# pandas 1.3 and removed in 2.0: malformed rows are still skipped with a warning.
flower_data = pd.read_csv("../Data/General/flower_cleaned.csv", on_bad_lines="warn", delimiter=";")

In [5]:
# Peek at 10 random rows (no random_state is set, so the rows differ between runs).
flower_data.sample(10)

Unnamed: 0,Flower,Meaning
28,Azalea,take care
210,Mignonette,worth
284,Rose [yellow],extreme betrayal
243,Primrose,eternal love
112,Cypress,death
84,Carnation [solid color],affirmative
236,Peony,bravery (in japan)
250,Poppy [red],remembrance
147,Fungus,resilience
299,Rose [thornless],love at first sight


---
## - NLP towards network graph building

---
#### Using AFINN DICTIONARY for sentiment analysis

In [6]:
# AFINN-111 word list: read as a tab-separated file without a header row,
# naming the two columns "Word" and "Score".
path = "../External/afinn-master/afinn/data/AFINN-111.txt"
word_data_afinn = pd.read_csv(path, delimiter="\t", header=None, names=["Word", "Score"])

In [7]:
# Peek at 10 random AFINN entries (unseeded sample).
word_data_afinn.sample(n = 10)

Unnamed: 0,Word,Score
433,combat,-1
1969,sceptical,-2
1303,infuriates,-2
2264,traumatic,-3
1092,grace,1
313,brave,2
1601,nifty,2
2419,welcome,2
2094,spiritless,-2
629,derision,-2


In [8]:
def afinn_score(sentence):
    """Return the summed AFINN score of the words found in `sentence`.

    The sentence is split on single spaces and matched against the "Word"
    column of the module-level `word_data_afinn` table. Each matching AFINN
    entry contributes its score once, however often the word occurs in the
    sentence. The result is 0 when nothing matches.
    """
    tokens = sentence.split(" ")
    matching_rows = word_data_afinn["Word"].isin(tokens)
    return word_data_afinn.loc[matching_rows, "Score"].sum()

In [9]:
# Left disabled: uncomment to add an 'AfinnScore' column computed with afinn_score.
#flower_data['AfinnScore'] = flower_data.Meaning.apply(afinn_score)

In [10]:
# Another unseeded 10-row sample of the flower table.
flower_data.sample(n=10)

Unnamed: 0,Flower,Meaning
11,Anemone,sickness (negative)
34,Baby's breath,innocence
192,Lily [orange],desire
216,Narcissus,selfishness
141,Fennel,strength or
59,Camellia japonica,unpretending excellence
288,Rose [light pink],passion
23,Asphodel,my regrets follow you to the grave
354,Willow (creeping),love forsaken
76,Carnation [yellow],disappointment


---
#### Using TEXTBLOB for semantic meaning comparison

In [11]:
# Quick TextBlob check: part-of-speech tags for a sample sentence.
wiki = TextBlob("Python is a high-level, general-purpose programming language.")
wiki.tags

[('Python', 'NNP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('high-level', 'JJ'),
 ('general-purpose', 'JJ'),
 ('programming', 'NN'),
 ('language', 'NN')]

In [12]:
# Compare TextBlob's sentiment for the neutral sample sentence and for one flower meaning.
wiki.sentiment
TextBlob("my regrets follow you to the grave").sentiment

Sentiment(polarity=0.0, subjectivity=0.0)

Sentiment(polarity=-0.1, subjectivity=0.2)

In [13]:
# Tokenised words of the sample sentence (punctuation is dropped).
a = wiki.words
a

WordList(['Python', 'is', 'a', 'high-level', 'general-purpose', 'programming', 'language'])

In [14]:
# a[5] is 'programming'; lemmatize it as a verb ('v').
a[5].lemmatize('v')

'program'

Get the highest similarity score across all synset pairs of two words.
Limit the comparison to nouns.

In [15]:
# Show the WordNet definitions of both words (both lists are displayed).
textblob.Word("like").definitions
textblob.Word("Love").definitions

# NOTE: despite its name, `neverendsyn` holds the synsets of "like".
neverendsyn = textblob.Word("like").synsets
lovesyn = textblob.Word("Love").synsets

# Path similarity between the 7th synset of "Love" and the 1st synset of "like".
print("Similarity:", lovesyn[6].path_similarity(neverendsyn[0]))

['a similar kind',
 'a kind of person',
 'prefer or wish to do something',
 'find enjoyable or agreeable',
 'be fond of',
 'feel about or towards; consider, evaluate, or regard',
 'want to have',
 'resembling or similar; having the same or some of the same characteristics; often used in combination',
 'equal in amount or value',
 'having the same or similar characteristics',
 'conforming in every respect']

['a strong positive emotion of regard and affection',
 'any object of warm affection or devotion; ',
 'a beloved person; used as terms of endearment',
 'a deep feeling of sexual desire and attraction',
 'a score of zero in tennis or squash',
 'sexual activities (often including sexual intercourse) between two people',
 'have a great affection or liking for',
 'get pleasure from',
 'be enamored or in love with',
 'have sexual intercourse with']

Similarity: 0.08333333333333333


In [16]:
# Two different words can share a WordNet definition: compare both lists.
textblob.Word("thriftiness").definitions
textblob.Word("economy").definitions

['frugality in the expenditure of money or resources']

['the system of production and distribution and consumption',
 'the efficient use of resources',
 'frugality in the expenditure of money or resources',
 'an act of economizing; reduction in cost']

In [17]:
# English stop words from NLTK, stored as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

In [18]:
def clean_stopwords(sentence, stopwords):
    """Return `sentence` with every stop word removed.

    Parameters
    ----------
    sentence : str
        Text to clean. It is split on any run of whitespace.
    stopwords : container of str
        Lower-case words to drop (e.g. the NLTK English stop-word set).

    Returns
    -------
    str
        The remaining words, in their original casing, joined by single spaces.
        An empty string if nothing remains.
    """
    # Compare in lower case: the stop-word list is lower-case, so "My" or "The"
    # would otherwise slip through.
    kept_words = [word for word in sentence.split() if word.lower() not in stopwords]
    return " ".join(kept_words)

In [19]:
# Check stop-word removal on a flower meaning, and TextBlob tokenisation on a sample sentence.
clean_stopwords("my regrets follow you to the grave", stop_words)
ablob = TextBlob("Love between two woman.")
ablob.words

'regrets follow grave'

WordList(['Love', 'between', 'two', 'woman'])

In [20]:
# Returns a similarity score between two sentences (0 means no measurable similarity).
def compare_meaning_sentence(sentence1, sentence2, stopwords):
    """Score how close the meanings of two sentences are.

    Stop words are removed from both sentences, then every word of the first
    sentence is compared with every word of the second one through
    `compare_meaning_word`. The pairwise scores are summed and divided by the
    total number of remaining words in both sentences.

    Because of this normalisation the result is not capped at 1. For example,
    two identical one-word sentences score 1 / (1 + 1) = 0.5, assuming the
    word's similarity to itself is 1.

    Parameters
    ----------
    sentence1, sentence2 : str
        Sentences to compare.
    stopwords : container of str
        Words ignored in both sentences.

    Returns
    -------
    float
        The normalised similarity, or 0.0 when no word is left in either
        sentence after stop-word removal.
    """
    words1 = TextBlob(clean_stopwords(sentence1, stopwords)).words
    words2 = TextBlob(clean_stopwords(sentence2, stopwords)).words
    nb_words = len(words1) + len(words2)

    # Nothing left to compare (both sentences empty or made of stop words only):
    # avoid dividing by zero.
    if nb_words == 0:
        return 0.0

    similarity_score = sum(
        compare_meaning_word(word1, word2)
        for word1 in words1
        for word2 in words2
    )
    return similarity_score / nb_words
                             
# Returns the highest similarity score between two words (with 1 being very similar, 0 being different).
def compare_meaning_word(word1, word2):
    """Return the best path similarity over all synset pairs of two words.

    Every synset of `word1` is compared with every synset of `word2` using
    `path_similarity`. Pairs for which no similarity is available (None) are
    ignored. The result is 0 when either word has no synsets or no pair
    yields a positive score.
    """
    pair_scores = (
        first_sense.path_similarity(second_sense)
        for first_sense in word1.synsets
        for second_sense in word2.synsets
    )

    best_score = 0
    for score in pair_scores:
        if score is None:
            continue
        if score > best_score:
            best_score = score
    return best_score



In [21]:
# Builds the edge list of the network connecting flowers whose meanings are similar.
def index_flower_similarity(similarity_threshold, stopwords = stop_words):
    """Find, for every flower, the later flowers with a similar meaning.

    Each row of the module-level `flower_data` is compared with every row
    after it, so each unordered pair is evaluated once, using
    `compare_meaning_sentence` on the 'Meaning' column.

    Parameters
    ----------
    similarity_threshold : float
        Exclusive lower bound: a pair is linked only when its score is
        strictly greater than this value.
    stopwords : container of str, optional
        Words ignored when comparing meanings. Defaults to the notebook's
        `stop_words` set.

    Returns
    -------
    list of list of int
        One entry per row of `flower_data`. Entry `i` holds the positions
        `j > i` of the rows whose meaning is similar to row `i`.
    """
    # Read the column once instead of indexing the DataFrame inside the double loop.
    meanings = flower_data['Meaning'].tolist()
    similarities = []

    for i, meaning in enumerate(meanings):
        # Pass the `stopwords` argument; the global `stop_words` was previously
        # used here, which silently ignored the caller's choice.
        similar_indexes = [
            j
            for j in range(i + 1, len(meanings))
            if compare_meaning_sentence(meaning, meanings[j], stopwords) > similarity_threshold
        ]
        similarities.append(similar_indexes)
    return similarities

#### Adding to the csv the similarity nodes with a threshold of 0.4

In [27]:
# Compute the links for the 0.4 threshold in this cell. With the call commented out,
# `network_links` held whatever an earlier-executed cell had produced (or was
# undefined on a fresh kernel), so 'SimilarIndex40' could store links computed
# for another threshold.
network_links = index_flower_similarity(0.4)
flower_data['SimilarIndex40'] = pd.Series(network_links)
# Note: this overwrites the CSV that the notebook reads as its input.
flower_data.to_csv("../Data/General/flower_cleaned.csv", sep=";", index = False)

In [23]:
# First 10 rows of the flower table.
flower_data.head(10)

Unnamed: 0,Flower,Meaning
0,Acacia,secret love
1,Acanthus,art
2,Aconite,misanthropy
3,Agrimony,thankfulness
4,Aloe,grief
5,Almond,promise
6,Amaranth (Globe),immortal love
7,Amaranth,immortality
8,Amaryllis,pride
9,Ambrosia,love is reciprocated


#### Adding to the csv the similarity nodes with a threshold of 0.3

In [24]:
# Links between flowers whose meaning similarity is strictly above 0.3,
# stored as one list of row positions per flower.
network_links = index_flower_similarity(0.3)
flower_data['SimilarIndex30'] = pd.Series(network_links)

In [25]:
# Persist the table with its new column. This overwrites the same CSV the notebook reads as input.
flower_data.to_csv("../Data/General/flower_cleaned.csv", sep=";", index = False)

#### Adding to the csv the similarity nodes with a threshold of 0.2

In [26]:
# Links between flowers whose meaning similarity is strictly above 0.2,
# stored as one list of row positions per flower.
network_links = index_flower_similarity(0.2)
flower_data['SimilarIndex20'] = pd.Series(network_links)
# Persist the table. This overwrites the same CSV the notebook reads as input.
flower_data.to_csv("../Data/General/flower_cleaned.csv", sep=";", index = False)