In [2]:
import pandas as pd
import numpy as np
import plotly
import re
import math
import textblob
from textblob import TextBlob
from textblob import Blobber
from textblob.wordnet import Synset

In [3]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Julie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

# To do: 
- Separate by "or" and ";" also! - Done!
- *Polarity check before meaning check because of stop words processing should be better 'v' (kinda done)*

#### Reading the pre-cleaned data

In [5]:
flower_data = pd.read_csv("../Data/General/flower_cleaned.csv", error_bad_lines=False, delimiter=";")

In [6]:
flower_data.sample(10)

Unnamed: 0,Flower,Meaning
158,Lily [white],purity
370,Zinnia,loyalty
72,Coreopsis,always cheerful
179,Mullein,good-nature
226,Sunflower,loyalty
69,China aster,love of variety
363,Sweet Pea,goodbye
218,Snowdrop,consolation
155,Lavender,distrust
287,Aster tataricus,remembrance


---
## - NLP towards network graph building

---
#### Using AFINN DICTIONARY for sentiment analysis

In [8]:
path = "../External/afinn-master/afinn/data/AFINN-111.txt"
word_data_afinn = pd.read_csv(path, delimiter="\t", header=None, names=["Word", "Score"])

In [9]:
word_data_afinn.sample(n = 10)

Unnamed: 0,Word,Score
2124,stolen,-2
1691,perfectly,3
726,disputing,-2
737,disruptive,-2
1502,matters,1
1489,made-up,-1
1409,lagging,-2
1191,homesick,-2
1747,praised,3
836,endorses,2


In [10]:
def afinn_score(sentence):
    if type(sentence) != float:
        split_sentence = sentence.split(" ")
        total_score = word_data_afinn.Score[word_data_afinn.Word.isin(split_sentence)].sum()
        return total_score

In [11]:
flower_data['AfinnScore'] = flower_data.Meaning.apply(afinn_score)

In [25]:
flower_data.sample(n=10)

Unnamed: 0,Flower,Meaning,AfinnScore
219,Snowdrop,hope,2.0
304,Cherry Blossom,transience of life,0.0
83,Crocus,lupercalia,0.0
31,Azalea,gratitude,0.0
50,Bird of paradise flower,good perspective,3.0
348,Pansy,caring,0.0
18,Apple blossom,preference,0.0
124,Gardenia,sweet love,5.0
46,Bells of Ireland,luck,3.0
27,Aster,trusting,0.0


---
#### Using TEXTBLOB for semantic meaning comparison

In [26]:
wiki = TextBlob("Python is a high-level, general-purpose programming language.")
wiki.tags

[('Python', 'NNP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('high-level', 'JJ'),
 ('general-purpose', 'JJ'),
 ('programming', 'NN'),
 ('language', 'NN')]

In [27]:
wiki.sentiment
TextBlob("my regrets follow you to the grave").sentiment

Sentiment(polarity=0.0, subjectivity=0.0)

Sentiment(polarity=-0.1, subjectivity=0.2)

In [28]:
a = wiki.words
a

WordList(['Python', 'is', 'a', 'high-level', 'general-purpose', 'programming', 'language'])

In [29]:
# Programming
a[5].lemmatize('v')

'program'

Get the highest comparison score for each synsets of a word
Limit to nouns words

In [30]:
textblob.Word("like").definitions
textblob.Word("Love").definitions

neverendsyn = textblob.Word("like").synsets
lovesyn = textblob.Word("Love").synsets

print("Similarity:", lovesyn[6].path_similarity(neverendsyn[0]))

['a similar kind',
 'a kind of person',
 'prefer or wish to do something',
 'find enjoyable or agreeable',
 'be fond of',
 'feel about or towards; consider, evaluate, or regard',
 'want to have',
 'resembling or similar; having the same or some of the same characteristics; often used in combination',
 'equal in amount or value',
 'having the same or similar characteristics',
 'conforming in every respect']

['a strong positive emotion of regard and affection',
 'any object of warm affection or devotion; ',
 'a beloved person; used as terms of endearment',
 'a deep feeling of sexual desire and attraction',
 'a score of zero in tennis or squash',
 'sexual activities (often including sexual intercourse) between two people',
 'have a great affection or liking for',
 'get pleasure from',
 'be enamored or in love with',
 'have sexual intercourse with']

Similarity: 0.08333333333333333


In [31]:
textblob.Word("thriftiness").definitions
textblob.Word("economy").definitions

['frugality in the expenditure of money or resources']

['the system of production and distribution and consumption',
 'the efficient use of resources',
 'frugality in the expenditure of money or resources',
 'an act of economizing; reduction in cost']

In [12]:
stop_words = set(stopwords.words('english'))

In [13]:
def clean_stopwords(sentence, stopwords):
    clean_sentence = []
    for word in sentence.split(" "):
        if word not in stopwords:
            clean_sentence.append(word)
    return " ".join(clean_sentence)

In [14]:
clean_stopwords("my regrets follow you to the grave", stop_words)
ablob = TextBlob("Love between two woman.")
ablob.words

'regrets follow grave'

WordList(['Love', 'between', 'two', 'woman'])

In [15]:
# Returns the highest similarity score between two sentences (with 1 being very similar, 0 being different).
def compare_meaning_sentence(sentence1, sentence2, stopwords):
    blob1 = TextBlob(clean_stopwords(sentence1, stopwords))
    blob2 = TextBlob(clean_stopwords(sentence2, stopwords))
    nb_words = len(blob1.words) + len(blob2.words)
    
    similarity_score = 0
    for word1 in blob1.words:
        for word2 in blob2.words:
            similarity_score += compare_meaning_word(word1, word2)
    return similarity_score / nb_words
                             
# Returns the highest similarity score between two words (with 1 being very similar, 0 being different).                             
def compare_meaning_word(word1, word2):
    highest_similarity_score = 0
    
    for synset1 in word1.synsets:
        for synset2 in word2.synsets:
            similarity_score = synset1.path_similarity(synset2)
            
            if(similarity_score != None and similarity_score > highest_similarity_score):
                #print(synset1, synset2, similarity_score)
                highest_similarity_score = similarity_score
                
    return highest_similarity_score



In [16]:
numbers = {}
numbers['a'] = 4
numbers['b'] = 2
numbers['c'] = 7
sorted(numbers, key = numbers.get, reverse=True)
#sorted(numbers.values())

['c', 'a', 'b']

In [37]:
# Builds the edge network connection with respect to sentences similarities.
# The similarity_threshold refers to the lower limit of similarity score that is considered similar.
def index_flower_similarity(similarity_threshold, stopwords = stop_words, link_limit = 6):
    similarities = []
    
    for i in range(0, len(flower_data)):
        similar_indexes = []
        similar_index_score = {}
        
        for j in range(i+1, len(flower_data)):
            #print("indexes:", i, j)
            score = compare_meaning_sentence(flower_data['Meaning'].iloc[i], flower_data['Meaning'].iloc[j], stop_words)
            
            if score > similarity_threshold:
                similar_indexes.append(j)
                similar_index_score[j] = score
                
        similar_index_score = sorted(similar_index_score, key = similar_index_score.get, reverse=True)
        del similar_index_score[link_limit:]
        
        similarities.append(similar_index_score)
        
    return similarities

#### Adding to the csv the similarity nodes with a threshold of 0.4

In [38]:
#network_links = index_flower_similarity(0.4)
flower_data['SimilarIndex40'] = pd.Series(network_links)
flower_data.to_csv("../Data/General/flower_cleaned.csv", sep=";", index = False)

NameError: name 'network_links' is not defined

In [None]:
flower_data.head(10)

#### Adding to the csv the similarity nodes with a threshold of 0.3

In [None]:
network_links = index_flower_similarity(0.3)
flower_data['SimilarIndex30'] = pd.Series(network_links)

In [None]:
flower_data.to_csv("../Data/General/flower_cleaned.csv", sep=";", index = False)

#### Adding to the csv the similarity nodes with a threshold of 0.2

In [None]:
network_links = index_flower_similarity(0.2)
flower_data['SimilarIndex20'] = pd.Series(network_links)
flower_data.to_csv("../Data/General/flower_cleaned.csv", sep=";", index = False)