In [None]:
# ! pip install spacy

In [None]:
# ! python -m spacy download en_core_web_sm

## spaCy's part-of-speech and dependency tags

The list is:

* ADJ: adjective
* ADP: adposition
* ADV: adverb
* AUX: auxiliary verb
* CONJ: coordinating conjunction
* DET: determiner
* INTJ: interjection
* NOUN: noun
* NUM: numeral
* PART: particle
* PRON: pronoun
* PROPN: proper noun
* PUNCT: punctuation
* SCONJ: subordinating conjunction
* SYM: symbol
* VERB: verb
* X: other

https://spacy.io/api/annotation

The list of other attributes for tokens can be found at https://spacy.io/api/token

## Dependency Tokens

spaCy's dependency tag scheme is based upon the ClearNLP project; the meanings of the tags can be found at https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md:

* ACL: Clausal modifier of noun
* ACOMP: Adjectival complement
* ADVCL: Adverbial clause modifier
* ADVMOD: Adverbial modifier
* AGENT: Agent
* AMOD: Adjectival modifier
* APPOS: Appositional modifier
* ATTR: Attribute
* AUX: Auxiliary
* AUXPASS: Auxiliary (passive)
* CASE: Case marker
* CC: Coordinating conjunction
* CCOMP: Clausal complement
* COMPOUND: Compound modifier
* CONJ: Conjunct
* CSUBJ: Clausal subject
* CSUBJPASS: Clausal subject (passive)
* DATIVE: Dative
* DEP: Unclassified dependent
* DET: Determiner
* DOBJ: Direct Object
* EXPL: Expletive
* INTJ: Interjection
* MARK: Marker
* META: Meta modifier
* NEG: Negation modifier
* NOUNMOD: Modifier of nominal
* NPMOD: Noun phrase as adverbial modifier
* NSUBJ: Nominal subject
* NSUBJPASS: Nominal subject (passive)
* NUMMOD: Number modifier
* OPRD: Object predicate
* PARATAXIS: Parataxis
* PCOMP: Complement of preposition
* POBJ: Object of preposition
* POSS: Possession modifier
* PRECONJ: Pre-correlative conjunction
* PREDET: Pre-determiner
* PREP: Prepositional modifier
* PRT: Particle
* PUNCT: Punctuation
* QUANTMOD: Modifier of quantifier
* RELCL: Relative clause modifier
* ROOT: Root
* XCOMP: Open clausal complement

https://spacy.io/models/en#en_core_web_sm

In [1]:
import spacy

# Load the English tokenizer, tagger, parser, NER and word vectors
# • The nlp object is used to create documents, access linguistic annotations and different nlp properties
# • en_core_web_sm is an statistical model of the English class with the model weights loaded in, and built-in pipeline, so
#   spaCy can predict part-of-speech tags, dependency labels and named entities 
# • noun_chunks and dependency tagging requires the dependency parser and tagger, respectfully
nlp = spacy.load("en_core_web_sm")
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x1f4a80472e8>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x1f4a97a3ee8>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x1f4a97a3f48>)]

## Named Entity Recognition 

https://spacy.io/api/annotation#named-entities

| TYPE | DESCRIPTION |
|-----|-----|
| PERSON | People, including fictional  |
| NORP | Nationalities or religious or political groups |
| FAC |	Buildings, airports, highways, bridges, etc.  |
| ORG |	Companies, agencies, institutions, etc. |
| GPE |	Countries, cities, states |
| LOC |	Non-GPE locations, mountain ranges, bodies of water |
| PRODUCT |	Objects, vehicles, foods, etc. (Not services.) |
| EVENT | Named hurricanes,  battles, wars, sports events, etc. |
| WORK_OF_ART |	Titles of books, songs, etc. |
| LAW |	Named documents made into laws |
| LANGUAGE | Any named language |
| DATE | Absolute or relative dates or periods |
| TIME | Times smaller than a day |
| PERCENT |	Percentage, including ”%“ |
| MONEY | Monetary values, including unit |
| QUANTITY | Measurements, as of weight or distance |
| ORDINAL |	“first”, “second”, etc. |
| CARDINAL | Numerals that do not fall under another type |

In [2]:
# Process whole documents
text = ('When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took \
him seriously. “I can tell you very senior CEOs of major American car companies would shake my hand and turn away because I \
wasn’t worth talking to,” said Thrun, in an interview with Recode earlier this week.')
doc = nlp(text)

# Analyze syntax
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("\nVerbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

print('\nEntities:')
# Find named entities, phrases and concepts (places, people, organizations, and languages)
for entity in doc.ents:
    print('\t', entity.text, entity.label_)

Noun phrases: ['Sebastian Thrun', 'self-driving cars', 'Google', 'few people', 'the company', 'him', 'I', 'you', 'very senior CEOs', 'major American car companies', 'my hand', 'I', 'Thrun', 'an interview', 'Recode']

Verbs: ['start', 'work', 'drive', 'take', 'can', 'tell', 'would', 'shake', 'turn', 'be', 'talk', 'say']

Entities:
	 Sebastian Thrun PERSON
	 Google ORG
	 2007 DATE
	 American NORP
	 Thrun PERSON
	 Recode ORG
	 earlier this week DATE


## Text Classification in Python Using spaCy

<span style="color:red">**Important Article**:  </span>[Tutorial: Text Classification in Python Using spaCy](https://www.dataquest.io/blog/tutorial-text-classification-in-python-using-spacy/)

In [3]:
# To get the meaning of a dependency tag, use explain()
print('nsubj:', spacy.explain('nsubj'))
print('dobj:', spacy.explain('dobj'))
print('pobj:', spacy.explain('pobj'))

nsubj: nominal subject
dobj: direct object
pobj: object of preposition


In [4]:
docp = nlp('In pursuit of a wall, President Trump ran into one.')

for chunk in docp.noun_chunks:
   print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)

pursuit pursuit pobj In
a wall wall pobj of
President Trump Trump nsubj ran


In [5]:
# For visualization of Entity detection importing displacy from spaCy
from spacy import displacy

displacy.render(docp, style = "dep", jupyter = True)

In [6]:
nytimes = nlp(u'New York City on Tuesday declared a public health emergency and ordered mandatory measles vaccinations amid an \
outbreak, becoming the latest national flash point over refusals to inoculate against dangerous diseases.  At least 285 people \
have contracted measles in the city since September, mostly in Brooklyn’s Williamsburg neighborhood. The order covers four Zip \
codes there, Mayor Bill de Blasio (D) said Tuesday.  The mandate orders all unvaccinated people in the area, including a \
concentration of Orthodox Jews, to receive inoculations, including for children as young as 6 months old.  Anyone who resists \
could be fined up to $1,000.')

entities=[(i, i.label_, i.label) for i in nytimes.ents]
entities

[(New York City, 'GPE', 384),
 (Tuesday, 'DATE', 391),
 (At least 285, 'CARDINAL', 397),
 (September, 'DATE', 391),
 (Brooklyn, 'GPE', 384),
 (Williamsburg, 'GPE', 384),
 (four, 'CARDINAL', 397),
 (Bill de Blasio, 'PERSON', 380),
 (Tuesday, 'DATE', 391),
 (Orthodox Jews, 'NORP', 381),
 (6 months old, 'DATE', 391),
 (up to $1,000, 'MONEY', 394)]

In [7]:
displacy.render(nytimes, style = "ent",jupyter = True)

## Word Vector Representation

Using spaCy‘s en_core_web_sm model, let’s take a look at the length of a vector for a single word, and what that vector 
looks like using .vector and .shape.

In [8]:
mango = nlp(u'mango')
print(mango.vector.shape)
print(mango.vector)

(96,)
[ 1.0466383  -1.5323697  -0.72177905 -2.4700649  -0.2715162   1.1589639
  1.7113379  -0.31615403 -2.0978343   1.837553    1.4681302   2.728043
 -2.3457408  -5.17184    -4.6110015  -0.21236466 -0.3029521   4.220028
 -0.6813917   2.4016762  -1.9546705  -0.85086954  1.2456163   1.5107994
  0.4684736   3.1612053   0.15542296  2.0598564   3.780035    4.6110964
  0.6375268  -1.078107   -0.96647096 -1.3939928  -0.56914186  0.51434743
  2.3150034  -0.93199825 -2.7970662  -0.8540115  -3.4250052   4.2857723
  2.5058174  -2.2150877   0.7860181   3.496335   -0.62606215 -2.0213525
 -4.47421     1.6821622  -6.0789204   0.22800982 -0.36950028 -4.5340714
 -1.7978683  -2.080299    4.125556    3.1852438  -3.286446    1.0892276
  1.017115    1.2736416  -0.10613725  3.5102775   1.1902348   0.05483437
 -0.06298041  0.8280688   0.05514218  0.94817173 -0.49377063  1.1512338
 -0.81374085 -1.6104267   1.8233354  -2.278403   -2.1321895   0.3029334
 -1.4510616  -1.0584296  -3.5698352  -0.13046083 -0.266833

## Text Classification With Machine Learning and SpaCy

[Amazon Alexa Reviews](https://www.kaggle.com/sid321axn/amazon-alexa-reviews/home)

In [9]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

In [10]:
# Loading TSV file
df_amazon = pd.read_csv ("../data/amazon_alexa.tsv", sep = "\t")
print(df_amazon.shape, '\n')
print(df_amazon.info())
df_amazon.head()

(3150, 5) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 5 columns):
rating              3150 non-null int64
date                3150 non-null object
variation           3150 non-null object
verified_reviews    3150 non-null object
feedback            3150 non-null int64
dtypes: int64(2), object(3)
memory usage: 123.1+ KB
None


Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


## Compute Null Accuracy

In [11]:
df_amazon.feedback.value_counts()

1    2893
0     257
Name: feedback, dtype: int64

In [12]:
df_amazon.feedback.value_counts()/df_amazon.feedback.count()

1    0.918413
0    0.081587
Name: feedback, dtype: float64

## Tokening the Data With spaCy

Example where nlp = spacy.load("en_core_web_sm"), while parser = English()

In [27]:
# Import the English language class
from spacy.lang.en import English

# https://stackoverflow.com/questions/51072516/pos-in-spacy-is-not-returning-any-results-in-python
#
# Load the English language class dictionary, which includes the language-specific data like tokenization rules, but doesn't 
# actually load a model, which enables spaCy to predict part-of-speech tags and other linguistic annotations.  It contains the
# language data and tokenizer, but doesn't have a statistical model.  This means that spaCy will tokenize the text, but its 
# pipeline is empty; it doesn't include a tagger or parser, and it doesn't have any weights or attributes available either
So the  will be empty
parser = English()
parser.pipeline

[]

In [28]:
# Process text.  The nlp object is used to create documents with linguistic annotations
doc = parser("This is a sentence.")

# Print the document text
print(doc.text)

This is a sentence.


In [31]:
from spacy.lang.en.stop_words import STOP_WORDS
import string

# Create our list of punctuation marks
punctuations = string.punctuation

# Build a list of stopwords to use to filter
stop_words = list(STOP_WORDS)
print('Number of stop words: %d' % len(stop_words))
stop_words

Number of stop words: 326


['beside',
 'yourself',
 'formerly',
 'him',
 'often',
 'own',
 'themselves',
 'seems',
 'front',
 're',
 'eight',
 '‘ve',
 'mine',
 'n’t',
 'three',
 'upon',
 'beforehand',
 'one',
 'have',
 'next',
 'still',
 'in',
 'other',
 'out',
 'once',
 'under',
 'might',
 'she',
 'be',
 'nowhere',
 '‘ll',
 'sixty',
 'make',
 'as',
 'become',
 'everything',
 'rather',
 '’m',
 'others',
 'former',
 'our',
 'towards',
 'see',
 'give',
 'on',
 'its',
 'above',
 'we',
 'seemed',
 'around',
 '’re',
 'something',
 "'s",
 'how',
 'therein',
 'below',
 'do',
 'thereupon',
 'became',
 'anywhere',
 'am',
 'though',
 'among',
 'anyone',
 'call',
 'third',
 'many',
 'somehow',
 'say',
 'been',
 'not',
 'quite',
 'thus',
 'was',
 'put',
 'regarding',
 'her',
 'is',
 'were',
 'even',
 'whom',
 'amount',
 'the',
 'none',
 'whereby',
 'before',
 'a',
 'enough',
 'his',
 'perhaps',
 'until',
 'bottom',
 'who',
 'against',
 'hereby',
 'when',
 'whereas',
 'your',
 'us',
 'five',
 'onto',
 'get',
 'last',
 'any',

In [14]:
# Create spacy tokenizer that accepts a sentence as input and processes the sentence into tokens (which can also be replaced 
# by word vectors), performing lemmatization, lowercasing, and removing stop words and punctuation 
def spacy_tokenizer(sentence):
    # Creating token object, which is used to create documents with linguistic annotations
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words and punctuation
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # Return preprocessed list of tokens
    return mytokens

In [18]:
ex1 = 'He was walking with the walker in the Wall; he may had sat and run with the runner, too'
spacy_tokenizer(ex1)

['walk', 'walker', 'wall', 'sit', 'run', 'runner']

## Defining a Custom Transformer

To further clean our text data, we’ll also want to create a custom transformer for removing initial and end spaces and 
converting text into lower case.  Here, we create a custom predictors class which inherits the TransformerMixin class. 
This class overrides the transform, fit and get_parrams methods.  We also create a clean_text() function that removes 
spaces and converts text into lowercase.

In [19]:
# Custom transformer using spaCy
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        # Cleaning Text
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

# Basic function to clean the text
def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower()

In [23]:
from sklearn.model_selection import train_test_split

X = df_amazon['verified_reviews'] # the features we want to analyze
ylabels = df_amazon['feedback'] # the labels, or answers, we want to test against

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3)

## Vectorization Feature Engineering (TF-IDF)

To represent our text numerically, convert text into the matrix of occurrence of words within a given document. It generates 
a matrix that we refer to as a Bag of Words matrix or a document term matrix (DTM).  Generate a DTM by using scikit-learn‘s CountVectorizer.  In the code below, CountVectorizer uses the custom spacy_tokenizer function as its tokenizer, and define the ngram range we want (combinations of adjacent words).

TF-IDF (Term Frequency-Inverse Document Frequency) – simply a way of normalizing our Bag of Words by looking at each word’s frequency in comparison to the document frequency; how important a particular term is based on how many times the term appears and how many other documents that same term appears in.  The higher the TF-IDF, the more important that term is to that document.  We can represent this with the following mathematical equation:

![tfidf equation](../data/tfidf_eqn.jpg)

In [24]:
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

from sklearn.model_selection import train_test_split

X = df_amazon['verified_reviews'] # the features we want to analyze
ylabels = df_amazon['feedback'] # the labels, or answers, we want to test against

## Creating a Pipeline and Generating the Model

Create a pipeline with three components: a cleaner, a vectorizer, and a classifier. 
* The cleaner uses our predictors class object to clean and preprocess the text
* The vectorizer uses countvector objects to create the bag of words matrix for our text
* The classifier is an object that performs the logistic regression to classify the sentiments

Once this pipeline is built, we’ll fit the pipeline components using fit().

[How Pipelines Work](https://spacy.io/usage/processing-pipelines/#pipeline)

### Built-in pipeline components

 |  STRING NAME |	COMPONENT | DESCRIPTION
|-----------|----------|----------|
| tagger | Tagger |	Assign part-of-speech-tags |
| parser | DependencyParser | Assign dependency labels |
| ner | EntityRecognizer | Assign named entities |
| textcat | TextCategorizer | Assign text categories |
| entity_ruler | EntityRuler | Assign named entities based on pattern rules |
| sentencizer | Sentencizer	 | Add rule-based sentence segmentation without the dependency parse |
| merge_noun_chunks | merge_noun_chunks | Merge all noun chunks into a single token. Should be added after the tagger and parser |
| merge_entities | merge_entities | Merge all entities into a single token. Should be added after the entity recognizer |
| merge_subtokens | merge_subtokens | Merge subtokens predicted by the parser into single tokens. Should be added after the parser |

In [25]:
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', tfidf_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)



Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x000001F4AECC6080>), ('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
    ...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

## Evaluating the Model

Use various functions of the metrics module to look at our model’s accuracy, precision, and recall.

* [Accuracy](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html#sklearn.metrics.accuracy_score) refers to the percentage of the total predictions our model makes that are completely correct.
* [Precision](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html#sklearn.metrics.precision_score) describes the ratio of true positives to true positives plus false positives in our predictions.
* [Recall](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html#sklearn.metrics.recall_score) describes the ratio of true positives to true positives plus false negatives in our predictions.

In [26]:
from sklearn import metrics

predicted = pipe.predict(X_test)

# Model Accuracy
print("Logistic Regression Accuracy:", metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:", metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:", metrics.recall_score(y_test, predicted))

# • Our model correctly identified a comment’s sentiment 94.1% of the time
# • When it predicted a review was positive, that review was actually positive 95% of the time
# • When handed a positive review, our model identified it as positive 98.6% of the time

Logistic Regression Accuracy: 0.9164021164021164
Logistic Regression Precision: 0.9162248144220573
Logistic Regression Recall: 1.0


## Getting Lemma

https://github.com/Jcharis/Natural-Language-Processing-Tutorials/blob/master/Text%20Classification%20With%20Machine%20Learning,SpaCy,Sklearn(Sentiment%20Analysis)/Text%20Classification%20&%20Sentiment%20Analysis%20with%20SpaCy,Sklearn.ipynb

In [32]:
docx = nlp('This is how Johnny Walker was walking.  He was also running beside the lawn.')

# Lemmatizing of tokens
for word in docx:
    print('word =', word.text, "| lemma = ", word.lemma_)

word = This | lemma =  This
word = is | lemma =  be
word = how | lemma =  how
word = Johnny | lemma =  Johnny
word = Walker | lemma =  Walker
word = was | lemma =  be
word = walking | lemma =  walk
word = . | lemma =  .
word =   | lemma =   
word = He | lemma =  He
word = was | lemma =  be
word = also | lemma =  also
word = running | lemma =  run
word = beside | lemma =  beside
word = the | lemma =  the
word = lawn | lemma =  lawn
word = . | lemma =  .


In [39]:
# Lemma that are not pronouns
for word in docx:
    if word.lemma_ != '-PRON-':
        print(word.lemma_.lower().strip())

when
learn
datum
science
,
you
should
not
get
discourage
!

challenges
and
setback
be
not
failure
,
be
just
part
of
the
journey
.
have
get
this
!


In [40]:
# List Comprehensions of our Lemma
[word.lemma_.lower().strip() if word.lemma_ != '-PRON-' else word.lower_ for word in docx]

['when',
 'learn',
 'datum',
 'science',
 ',',
 'you',
 'should',
 'not',
 'get',
 'discourage',
 '!',
 '',
 'challenges',
 'and',
 'setback',
 'be',
 'not',
 'failure',
 ',',
 'they',
 'be',
 'just',
 'part',
 'of',
 'the',
 'journey',
 '.',
 'you',
 'have',
 'get',
 'this',
 '!']

In [41]:
# Filtering out Stopwords and Punctuations
for word in docx:
    if word.is_stop == False and not word.is_punct:
        print(word)

learning
data
science
discouraged
 
Challenges
setbacks
failures
journey
got


In [42]:
# Stop words and Punctuation In list Comprehension
[word for word in docx if word.is_stop == False and not word.is_punct]

[learning,
 data,
 science,
 discouraged,
  ,
 Challenges,
 setbacks,
 failures,
 journey,
 got]

In [43]:
text = "When learning data science, you shouldn't get discouraged!  Challenges and setbacks aren't failures, they're \
just part of the journey. You've got this!"

docx = nlp(text)

# Create list of word tokens
token_list = []
for token in docx:
    token_list.append(token.text)
print(token_list)

# spaCy recognizes that contractions such as shouldn’t, they're, and aren't actually represent two distinct words, and it has 
# thus broken them down into two distinct tokens

['When', 'learning', 'data', 'science', ',', 'you', 'should', "n't", 'get', 'discouraged', '!', ' ', 'Challenges', 'and', 'setbacks', 'are', "n't", 'failures', ',', 'they', "'re", 'just', 'part', 'of', 'the', 'journey', '.', 'You', "'ve", 'got', 'this', '!']


In [49]:
docs = nlp(u"All is well that ends well.")

for word in docs:
    print(word.text, word.pos_)
 
# Count the number of words in the sentence, excluding only punctuation marks
mytokens = [word for word in docs if not word.is_punct]
print(len(mytokens))

All 
is 
well 
that 
ends 
well 
. 
6


In [45]:
# Start over with the English parser that does not contain a pipeline, and thus doesn't contain a sentencizer

# By default, the English spaCy parser breaks the text into tokens.  We can also break the text into sentences rather than 
# words - sentence tokenization - where the tokenizer looks for specific characters that fall between sentences, like periods, 
# exclaimation points, and newline characters. Sentence tokenization requires a preprocessing pipeline because sentence 
# preprocessing using spaCy includes a tokenizer, a tagger, a parser and an entity recognizer to correctly identify what’s a 
# sentence and what isn’t

# Create the pipeline 'sentencizer' component
sbd = parser.create_pipe('sentencizer')

# Add the component to the pipeline
parser.add_pipe(sbd)

docx = parser(text)

# create list of sentence tokens
sents_list = []
for sent in docx.sents:
    sents_list.append(sent.text)
print(sents_list)

["When learning data science, you shouldn't get discouraged!", " Challenges and setbacks aren't failures, they're just part of the journey.", "You've got this!"]


## Unsupervised Parameter Search Function

* This function finds the parameters that produce# the highest Normalized Mutual Infomation score from our clusters. This score is a good baseline from which to compare clustering vs classification because it correlates with good clutering as well as higher accuracy scores.
* It prints the relevant statistics as well as a contingency matrix of the result and stores the results in a DataFrame.

In [None]:
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot
# define training data
sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'],
             ['this', 'is', 'the', 'second', 'sentence'],
             ['yet', 'another', 'sentence'],
             ['one', 'more', 'sentence'],
             ['and', 'the', 'final', 'sentence']]
# train model
model = Word2Vec(sentences, min_count=1)
# fit a 2d PCA model to the vectors
X = model[model.wv.vocab]
pca = PCA(n_components=2)
result = pca.fit_transform(X)
# create a scatter plot of the projection
pyplot.scatter(result[:, 0], result[:, 1])
words = list(model.wv.vocab)
for i, word in enumerate(words):
    pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
pyplot.show()

In [None]:
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot
# define training data
sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'],
             ['this', 'is', 'the', 'second', 'sentence'],
             ['yet', 'another', 'sentence'],
             ['one', 'more', 'sentence'],
             ['and', 'the', 'final', 'sentence']]
# train model
model = Word2Vec(sentences, min_count=1)
model.wv.vocab

In [None]:
# fit a 2d PCA model to the vectors
X = model[model.wv.vocab]
print(X.shape)
X

In [None]:
pca = PCA(n_components=2)
result = pca.fit_transform(X)
print(result.shape)
result

In [None]:
# create a scatter plot of the projection
pyplot.scatter(result[:, 0], result[:, 1])
words = list(model.wv.vocab)
for i, word in enumerate(words):
    pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
pyplot.show()

In [None]:
len(X[0])

In [None]:
df['url'] = df['url'].str.replace('foodnewsfeed', 'fsrmagazine')