In [9]:
import pandas as pd
import numpy as np
import plotly
import re
import math
import textblob
from textblob import TextBlob
from textblob import Blobber
from textblob.wordnet import Synset

In [10]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Julie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

#### Reading the pre-cleaned data

In [5]:
# Load the pre-cleaned, semicolon-separated flower table.
# `on_bad_lines="warn"` skips malformed rows and reports them; it replaces
# `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed in 2.0.
flower_data = pd.read_csv("../Data/General/flower_cleaned.csv", on_bad_lines="warn", delimiter=";")

In [6]:
flower_data.sample(10)

Unnamed: 0,Flower,Meaning,AfinnScore,SimilarIndex40
150,Geranium,determination,0,[]
214,Oats,music,0,[]
252,Rainflower,i will never forget you,-1,[]
98,Clove,undying love,3,"[274, 311]"
216,Orchid,refined beauty,0,[]
290,Rose [thornless],love at first sight,3,"[311, 313, 334, 341, 351]"
348,Wheat,wealth and prosperity,3,[]
87,Cherry blossom,gentleness,0,[]
141,Fungus,resilience,0,[]
245,Poppy [white],dreams,1,[340]


---
## - NLP towards network graph building

---
#### Using AFINN DICTIONARY for sentiment analysis

In [9]:
# Location of the AFINN-111 word list.
# NOTE(review): "../Externaafinn-master" looks like it may be missing a path separator
# (e.g. "../External/afinn-master") — confirm against the actual folder layout.
path = "../Externaafinn-master/afinn/data/AFINN-111.txt"
# The file is read as tab-separated with no header row; the two columns are named "Word" and "Score".
word_data_afinn = pd.read_csv(path, delimiter="\t", header=None, names=["Word", "Score"])

In [6]:
word_data_afinn.sample(n = 10)

Unnamed: 0,Word,Score
482,contagion,-2
1369,itchy,-2
1621,obsessed,2
1503,mature,2
1270,inaction,-2
1851,refusing,-2
1412,landmark,2
2302,underestimates,-1
1013,forgiving,1
1239,ignored,-2


In [7]:
def afinn_score(sentence):
    """Return the AFINN sentiment score of a sentence.

    The sentence is lower-cased and split on whitespace; the scores of all
    lexicon entries in ``word_data_afinn`` whose ``Word`` appears among the
    tokens are summed. Each distinct lexicon word contributes once, however
    many times it occurs in the sentence.

    Parameters
    ----------
    sentence : str
        Text to score. Non-string values (e.g. NaN for a missing meaning)
        are treated as carrying no sentiment.

    Returns
    -------
    int
        Sum of the matching ``Score`` values, or 0 when nothing matches.
    """
    # Missing meanings arrive as float NaN / None when applied over a column.
    if not isinstance(sentence, str):
        return 0

    # split() without an argument ignores repeated/leading/trailing whitespace;
    # lower() lets capitalised words match the lexicon entries.
    tokens = sentence.lower().split()
    matches = word_data_afinn.Word.isin(tokens)
    return word_data_afinn.Score[matches].sum()

In [8]:
#flower_data['AfinnScore'] = flower_data.Meaning.apply(afinn_score)

In [20]:
flower_data.sample(n=10)

Unnamed: 0,Flower,Meaning,AfinnScore
317,Tulip [pink],joyful occasions,3
91,Chestnut,chastity,0
287,Rose [red and white together],united,1
158,Heather [white],protection,0
259,Rose [white],virtue,0
0,Acacia,secret love,3
21,Arum,faith,1
309,Thyme,thriftiness,0
15,Anthurium,hospitality,0
148,Gardenia,good luck,6


---
#### Using TEXTBLOB for semantic meaning comparison

In [141]:
wiki = TextBlob("Python is a high-level, general-purpose programming language.")
wiki.tags

[('Python', 'NNP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('high-level', 'JJ'),
 ('general-purpose', 'JJ'),
 ('programming', 'NN'),
 ('language', 'NN')]

In [139]:
wiki.sentiment
TextBlob("my regrets follow you to the grave").sentiment

Sentiment(polarity=0.0, subjectivity=0.0)

Sentiment(polarity=-0.1, subjectivity=0.2)

In [14]:
a = wiki.words
a

WordList(['Python', 'is', 'a', 'high-level', 'general-purpose', 'programming', 'language'])

In [135]:
# Programming
a[5].lemmatize('v')

'program'

Get the highest comparison score across the synsets of a word.
Limit to noun words.

In [133]:
# Show the definitions TextBlob returns for each word (both lists are displayed,
# since ast_node_interactivity is set to "all").
textblob.Word("like").definitions
textblob.Word("Love").definitions

# NOTE(review): `neverendsyn` actually holds the synsets of "like" — the name
# looks like a leftover from an earlier experiment.
neverendsyn = textblob.Word("like").synsets
lovesyn = textblob.Word("Love").synsets

# Path similarity between one hand-picked sense of "Love" (index 6) and the first sense of "like".
print("Similarity:", lovesyn[6].path_similarity(neverendsyn[0]))

['a similar kind',
 'a kind of person',
 'prefer or wish to do something',
 'find enjoyable or agreeable',
 'be fond of',
 'feel about or towards; consider, evaluate, or regard',
 'want to have',
 'resembling or similar; having the same or some of the same characteristics; often used in combination',
 'equal in amount or value',
 'having the same or similar characteristics',
 'conforming in every respect']

['a strong positive emotion of regard and affection',
 'any object of warm affection or devotion; ',
 'a beloved person; used as terms of endearment',
 'a deep feeling of sexual desire and attraction',
 'a score of zero in tennis or squash',
 'sexual activities (often including sexual intercourse) between two people',
 'have a great affection or liking for',
 'get pleasure from',
 'be enamored or in love with',
 'have sexual intercourse with']

Similarity: 0.08333333333333333


In [11]:
stop_words = set(stopwords.words('english'))

In [12]:
def clean_stopwords(sentence, stopwords):
    """Remove stop words from a sentence.

    Parameters
    ----------
    sentence : str
        Text to filter; it is tokenised on whitespace.
    stopwords : collection of str
        Words to drop, expected in lower case.

    Returns
    -------
    str
        The remaining words, in their original casing and order, joined by
        single spaces.
    """
    # Compare in lower case so capitalised stop words ("My", "The") are removed
    # too; split() without an argument also collapses repeated whitespace.
    return " ".join(word for word in sentence.split() if word.lower() not in stopwords)

In [13]:
clean_stopwords("my regrets follow you to the grave", stop_words)
ablob = TextBlob("Love between two woman.")
ablob.words

WordList(['Love', 'between', 'two', 'woman'])

In [14]:
# Returns a similarity score between two sentences (0 being different, higher being more similar).
def compare_meaning_sentence(sentence1, sentence2, stopwords):
    """Score how similar the meanings of two sentences are.

    Both sentences are stripped of stop words, then every word of the first
    is compared with every word of the second via ``compare_meaning_word``.
    The result is the sum of those pairwise scores divided by the total
    number of remaining words in both sentences, so it is an aggregate score
    rather than a maximum and is not capped at 1.

    Parameters
    ----------
    sentence1, sentence2 : str
        The sentences to compare.
    stopwords : collection of str
        Words removed from both sentences before the comparison.

    Returns
    -------
    float
        The aggregate similarity score; 0.0 when no words remain in either
        sentence after stop-word removal.
    """
    words1 = TextBlob(clean_stopwords(sentence1, stopwords)).words
    words2 = TextBlob(clean_stopwords(sentence2, stopwords)).words
    nb_words = len(words1) + len(words2)

    # Both sentences consisted only of stop words (or were empty): nothing to
    # compare, and dividing by nb_words would raise ZeroDivisionError.
    if nb_words == 0:
        return 0.0

    similarity_score = sum(
        compare_meaning_word(word1, word2)
        for word1 in words1
        for word2 in words2
    )
    return similarity_score / nb_words
                             
# Returns the highest similarity score between two words (with 1 being very similar, 0 being different).
def compare_meaning_word(word1, word2):
    """Return the best path similarity found between any senses of two words.

    Every synset of ``word1`` is compared with every synset of ``word2``
    using ``path_similarity``. Pairs for which no similarity is available
    (``None``) are ignored. If no pair yields a score above 0, the result
    is 0.
    """
    pair_scores = (
        sense_a.path_similarity(sense_b)
        for sense_a in word1.synsets
        for sense_b in word2.synsets
    )

    best = 0
    for score in pair_scores:
        if score is None:
            continue
        # max() keeps `best` on ties, matching a strict "greater than" update.
        best = max(best, score)
    return best

In [15]:
# Builds the edge network connection with respect to sentences similarities.
# The similarity_threshold refers to the lower limit of similarity score that is considered similar.
def index_flower_similarity(similarity_threshold, stopwords=stop_words):
    """Build, for every flower, the list of later flowers with a similar meaning.

    Each row of ``flower_data`` is compared with every row after it using
    ``compare_meaning_sentence`` on the ``Meaning`` column, so each pair is
    evaluated once.

    Parameters
    ----------
    similarity_threshold : float
        A pair is linked only when its score is strictly greater than this.
    stopwords : collection of str, optional
        Stop words removed before comparing; defaults to ``stop_words``.

    Returns
    -------
    list of list of int
        One entry per row of ``flower_data``; entry ``i`` holds the
        positional indexes ``j > i`` of the rows judged similar to row ``i``.
    """
    # Read the column once instead of on every inner-loop iteration.
    meanings = flower_data['Meaning'].tolist()
    nb_flowers = len(meanings)

    similarities = []
    for i in range(nb_flowers):
        similar_indexes = [
            j
            for j in range(i + 1, nb_flowers)
            # Pass the caller-supplied stop words, not the module-level default.
            if compare_meaning_sentence(meanings[i], meanings[j], stopwords) > similarity_threshold
        ]
        similarities.append(similar_indexes)
    return similarities

#### Adding to the csv the similarity nodes with a threshold of 0.4

In [124]:
# Kept commented out (presumably a one-off, slow step whose result is stored in the csv — confirm).
# NOTE(review): this writes to "General/flower_cleaned.csv", whereas the data is read from
# "../Data/General/flower_cleaned.csv" — confirm the intended output path.
#network_links = index_flower_similarity(0.4)
#flower_data['SimilarIndex40'] = pd.Series(network_links)
#flower_data.to_csv("General/flower_cleaned.csv", sep=";", index = False)

[[145, 181],
 [],
 [],
 [],
 [112, 201, 263, 293],
 [100, 221, 328],
 [20, 62, 147, 181, 250],
 [],
 [],
 [13,
  20,
  39,
  44,
  62,
  67,
  94,
  98,
  106,
  115,
  120,
  138,
  145,
  147,
  154,
  163,
  172,
  181,
  200,
  205,
  206,
  208,
  236,
  250,
  253,
  256,
  274,
  286,
  290,
  311,
  313,
  334,
  341,
  344,
  351],
 [344],
 [],
 [],
 [98, 274, 311],
 [],
 [],
 [],
 [],
 [],
 [],
 [36, 62, 95, 154, 181, 256, 286, 290, 341],
 [122, 192, 331],
 [34, 66, 119, 121, 152, 184, 196, 257, 260],
 [82, 350],
 [62, 145, 147, 181, 205, 256, 286, 290],
 [],
 [62, 181, 250],
 [186, 192, 278, 284, 328, 331],
 [116, 200, 256, 286, 290, 314, 315],
 [],
 [],
 [59, 60, 277, 305],
 [63, 94, 106, 187, 199, 279, 285, 312],
 [],
 [66, 119, 121, 184, 196, 257, 260],
 [257],
 [],
 [91],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [168],
 [89, 126, 341],
 [],
 [237],
 [190, 262],
 [],
 [99],
 [248],
 [],
 [],
 [60, 277, 305],
 [277, 305],
 [62, 92, 181, 199, 256, 286, 290],


# To do: 
- Do the polarity check before the meaning check; because of the stop-word processing, this should give better results 'v'