## Basic features and readability scores


In [None]:
# One-hot encoding

# Print the features of df1 before encoding, for comparison with the output below
print(df1.columns)

# Perform one-hot encoding of the 'feature 5' column.
# NOTE(review): df1 is overwritten with the encoded frame, so this cell is not
# idempotent — re-running it without re-creating df1 is expected to fail because
# 'feature 5' is no longer a column.
df1 = pd.get_dummies(df1, columns=['feature 5'])

# Print the new features of df1
print(df1.columns)

# Print first five rows of df1
print(df1.head())

In [None]:
# Character count of Russian tweets

# Create a feature char_count.
# .str.len() is the vectorised equivalent of .apply(len); unlike apply(len) it
# yields NaN for a missing tweet instead of raising TypeError on the float NaN.
tweets['char_count'] = tweets['content'].str.len()

# Print the average character count (mean() skips NaN entries)
print(tweets['char_count'].mean())

"""
<script.py> output:
    103.462
    
Notice that the average character count of these tweets is approximately 103, which is much higher than the overall 
average tweet length of around 40 characters. Depending on what you're working on, this may be something worth 
investigating. For your information, there is research that indicates that fake news articles tend to 
have longer titles! Therefore, even extremely basic features such as character counts can prove to be very useful in 
certain applications.
"""

In [None]:
# Word count of TED talks

# Function that returns number of words in a string
def count_words(string):
    """Return the number of whitespace-separated words in ``string``."""
    # str.split() with no separator splits on any run of whitespace and
    # never produces empty items, so the list length is the word count.
    return len(string.split())

# Create a new feature word_count: count_words() is applied to every transcript
ted['word_count'] = ted['transcript'].apply(count_words)

# Print the average word count of the talks
print(ted['word_count'].mean())

"""
<script.py> output:
    1987.1
    
You now know how to compute the number of words in a given piece of text. Also, notice that the average length 
of a talk is close to 2000 words. You can use the word_count feature to compute its correlation with other variables 
such as number of views, number of comments, etc. and derive extremely interesting insights about TED.
"""

In [None]:
# Hashtags and mentions in Russian tweets

# Function that returns number of hashtags in a string
def count_hashtags(string):
    """Return how many whitespace-separated words in ``string`` start with '#'."""
    total = 0
    for token in string.split():
        # A word counts as a hashtag purely by its leading '#'
        if token.startswith('#'):
            total += 1
    return total

# Create a feature hashtag_count (one count_hashtags() result per tweet)
tweets['hashtag_count'] = tweets['content'].apply(count_hashtags)
# Plot the distribution of the per-tweet counts as a histogram
tweets['hashtag_count'].hist()
plt.title('Hashtag count distribution')
plt.show()

##################################################################################

# Function that returns number of mentions in a string
def count_mentions(string):
    """Return how many whitespace-separated words in ``string`` start with '@'."""
    # Each matching word contributes 1 to the total; no matches gives 0.
    return sum(1 for token in string.split() if token.startswith('@'))

# Create a feature mention_count (one count_mentions() result per tweet)
tweets['mention_count'] = tweets['content'].apply(count_mentions)
# Plot the distribution of the per-tweet counts as a histogram
tweets['mention_count'].hist()
plt.title('Mention count distribution')
plt.show()

In [None]:
# Readability of 'The Myth of Sisyphus'

# Import Textatistic
from textatistic import Textatistic

# Compute the readability scores; the result is indexed by score name below
readability_scores = Textatistic(sisyphus_essay).scores

# Print the Flesch reading ease score, formatted to two decimals
flesch = readability_scores['flesch_score']
print("The Flesch Reading Ease is %.2f" % (flesch))

In [None]:
# Readability of various publications

# Import Textatistic
from textatistic import Textatistic

# List of excerpts (each name is defined outside this cell)
excerpts = [forbes, harvard_law, r_digest, time_kids]

# Loop through excerpts and compute the Gunning fog index of each one;
# scores are appended in the same order as `excerpts`
gunning_fog_scores = []
for excerpt in excerpts:
  readability_scores = Textatistic(excerpt).scores
  gunning_fog = readability_scores['gunningfog_score']
  gunning_fog_scores.append(gunning_fog)

# Print the Gunning fog indices
print(gunning_fog_scores)

## Text preprocessing, POS tagging and NER


In [None]:
"""
Identifying lemmas
Identify the list of words from the choices which do not have the same lemma.

Car, Bike, Truck, Bus
"""

In [None]:
# Tokenizing the Gettysburg Address

import spacy

# Load the en_core_web_sm model
nlp = spacy.load('en_core_web_sm')

# Create a Doc object by running the pipeline on the text
doc = nlp(gettysburg)

# Generate the tokens: the raw text of every token in the Doc
tokens = [token.text for token in doc]
print(tokens)

In [None]:
# Lemmatizing the Gettysburg address

# Print the Gettysburg address (the input, for comparison with the lemmatized output)
print(gettysburg)

##################################################################################

import spacy

# Load the en_core_web_sm model
nlp = spacy.load('en_core_web_sm')

# Create a Doc object
doc = nlp(gettysburg)

# Generate lemmas: the lemma string of every token, punctuation included
lemmas = [token.lemma_ for token in doc]

# Join the lemmas into a single space-separated string and print it
print(' '.join(lemmas))

"""
Input:
    Four score and seven years ago our fathers brought forth on this continent, a new nation, 
    conceived in Liberty, and dedicated to the proposition that all men are created equal. 
    Now we're engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, 
    can long endure. We're met on a great battlefield of that war. We've come to dedicate a portion of that field, 
    as a final resting place for those who here gave their lives that that nation might live. It's altogether fitting 
    and proper that we should do this. But, in a larger sense, we can't dedicate - we can not consecrate - we can not hallow 
    - this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add 
    or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. 
    It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so 
    nobly advanced. It's rather for us to be here dedicated to the great task remaining before us - that from these honored 
    dead we take increased devotion to that cause for which they gave the last full measure of devotion - that we here highly
    resolve that these dead shall not have died in vain - that this nation, under God, shall have a new birth of freedom - 
    and that government of the people, by the people, for the people, shall not perish from the earth.

Output:
    four score and seven year ago -PRON- father bring forth on this continent , a new nation , conceive in liberty ,
    and dedicate to the proposition that all man be create equal . now -PRON- be engage in a great civil war , 
    test whether that nation , or any nation so conceive and so dedicated , can long endure . -PRON- be meet on a great 
    battlefield of that war . -PRON- have come to dedicate a portion of that field , as a final resting place for those who 
    here give -PRON- life that that nation may live . -PRON- be altogether fitting and proper that -PRON- should do this . 
    but , in a large sense , -PRON- can not dedicate - -PRON- can not consecrate - -PRON- can not hallow - this ground . 
    the brave man , living and dead , who struggle here , have consecrate -PRON- , far above -PRON- poor power to add or 
    detract . the world will little note , nor long remember what -PRON- say here , but -PRON- can never forget what -PRON- 
    do here . -PRON- be for -PRON- the living , rather , to be dedicate here to the unfinished work which -PRON- who fight 
    here have thus far so nobly advanced . -PRON- be rather for -PRON- to be here dedicate to the great task remain before 
    -PRON- - that from these honor dead -PRON- take increase devotion to that because for which -PRON- give the last full 
    measure of devotion - that -PRON- here highly resolve that these dead shall not have die in vain - that this nation , 
    under god , shall have a new birth of freedom - and that government of the people , by the people , for the people , 
    shall not perish from the earth .

"""

In [None]:
# Cleaning a blog post

# Load model and create Doc object
nlp = spacy.load('en_core_web_sm')
doc = nlp(blog)

# Generate lemmatized tokens
lemmas = [token.lemma_ for token in doc]

# Remove stopwords and non-alphabetic tokens.
# isalpha() rejects any lemma containing digits or punctuation (e.g. "U.S.").
# NOTE(review): `stopwords` is not defined in this cell — it must already exist
# in the notebook namespace.
a_lemmas = [lemma for lemma in lemmas 
            if lemma.isalpha() and lemma not in stopwords]

# Print string after text cleaning
print(' '.join(a_lemmas))

"""
Input:
Twenty-first-century politics has witnessed an alarming rise of populism in the U.S. and Europe. 
The first warning signs came with the UK Brexit Referendum vote in 2016 swinging in the way of Leave. 
This was followed by a stupendous victory by billionaire Donald Trump to become the 45th President of the United States 
in November 2016. Since then, Europe has seen a steady rise in populist and far-right parties that have capitalized on
Europe’s Immigration Crisis to raise nationalist and anti-Europe sentiments. Some instances include Alternative 
for Germany (AfD) winning 12.6% of all seats and entering the Bundestag, thus upsetting Germany’s political order for 
the first time since the Second World War, the success of the Five Star Movement in Italy and the surge in popularity of 
neo-nazism and neo-fascism in countries such as Hungary, Czech Republic, Poland and Austria.

Output:
century politic witness alarming rise populism europe warning sign come uk brexit referendum vote 
swinging way leave follow stupendous victory billionaire donald trump president united states november 
europe steady rise populist far right party capitalize europe immigration crisis raise nationalist anti 
europe sentiment instance include alternative germany afd win seat enter bundestag upset germany political 
order time second world war success star movement italy surge popularity neo nazism neo fascism country hungary 
czech republic poland austria

Note:
Take a look at the cleaned text; it is lowercased and devoid of numbers, punctuation and commonly used stopwords. 
Also, note that the word U.S. was present in the original text. Since it had periods in between, our text cleaning 
process completely removed it. This may not be ideal behavior. It is always advisable to use your custom functions 
in place of isalpha() for more nuanced cases.
"""

In [None]:
# Cleaning TED talks in a dataframe

# Function to preprocess text
def preprocess(text):
    """Return ``text`` reduced to a space-joined string of cleaned lemmas.

    The text is run through the notebook-level ``nlp`` pipeline with the
    'ner' and 'parser' components disabled. A lemma is kept only if it is
    purely alphabetic and is not in the notebook-level ``stopwords``.
    """
    kept = []
    for token in nlp(text, disable=['ner', 'parser']):
        lemma = token.lemma_
        # Drop stopwords and anything containing digits or punctuation
        if lemma.isalpha() and lemma not in stopwords:
            kept.append(lemma)

    return ' '.join(kept)
  
# Apply preprocess to ted['transcript']
# NOTE(review): the raw transcripts are overwritten in place, so re-running this
# cell preprocesses already-preprocessed text; the print shows the whole column.
ted['transcript'] = ted['transcript'].apply(preprocess)
print(ted['transcript'])

In [None]:
# POS tagging in Lord of the Flies

# Load the en_core_web_sm model
nlp = spacy.load('en_core_web_sm')

# Create a Doc object
doc = nlp(lotf)

# Generate (token text, part-of-speech tag) pairs for every token
pos = [(token.text, token.pos_) for token in doc]
print(pos)

In [None]:
# Counting nouns in a piece of text

nlp = spacy.load('en_core_web_sm')

# Returns number of proper nouns
def proper_nouns(text, model=nlp):
    """Return the number of tokens in ``text`` tagged as proper nouns.

    ``model`` is called on ``text`` and each resulting token's ``pos_``
    tag is compared against 'PROPN'. It defaults to the notebook-level
    ``nlp`` object as it exists when this function is defined.
    """
    return sum(1 for token in model(text) if token.pos_ == 'PROPN')

print(proper_nouns("Abdul, Bill and Cathy went to the market to buy apples.", nlp))

"""<script.py> output:
    3"""

##################################################################################

nlp = spacy.load('en_core_web_sm')

# Returns number of other nouns
def nouns(text, model=nlp):
    """Return the number of tokens in ``text`` tagged 'NOUN'.

    ``model`` is called on ``text``; it defaults to the notebook-level
    ``nlp`` object as it exists when this function is defined.
    """
    noun_total = 0
    for token in model(text):
        # Only the 'NOUN' tag is counted; 'PROPN' tokens are not included
        if token.pos_ == 'NOUN':
            noun_total += 1
    return noun_total

print(nouns("Abdul, Bill and Cathy went to the market to buy apples.", nlp))

"""<script.py> output:
    2"""

In [None]:
# Noun usage in fake news

# Count proper nouns in every headline title (proper_nouns uses its default model)
headlines['num_propn'] = headlines['title'].apply(proper_nouns)

# Compute mean of proper nouns separately for rows labelled 'REAL' and 'FAKE'
real_propn = headlines[headlines['label'] == 'REAL']['num_propn'].mean()
fake_propn = headlines[headlines['label'] == 'FAKE']['num_propn'].mean()

# Print results, two decimals each (real first, then fake)
print("Mean no. of proper nouns in real and fake headlines are %.2f and %.2f respectively"%(real_propn, fake_propn))

"""    Mean no. of proper nouns in real and fake headlines are 2.46 and 4.86 respectively
"""

##################################################################################

# Count 'NOUN'-tagged tokens in every headline title (nouns uses its default model)
headlines['num_noun'] = headlines['title'].apply(nouns)

# Compute mean of other nouns separately for rows labelled 'REAL' and 'FAKE'
real_noun = headlines[headlines['label'] == 'REAL']['num_noun'].mean()
fake_noun = headlines[headlines['label'] == 'FAKE']['num_noun'].mean()

# Print results, two decimals each (real first, then fake)
print("Mean no. of other nouns in real and fake headlines are %.2f and %.2f respectively"%(real_noun, fake_noun))

"""    Mean no. of other nouns in real and fake headlines are 2.30 and 1.44 respectively
"""

In [None]:
# Named entities in a sentence

# Load the required model
nlp = spacy.load('en_core_web_sm')

# Create a Doc instance
text = 'Sundar Pichai is the CEO of Google. Its headquarters is in Mountain View.'
doc = nlp(text)

# Print all named entities and their labels, one "text label" pair per line
for ent in doc.ents:
    print(ent.text, ent.label_)

"""
Sundar Pichai ORG
    Google ORG
    Mountain View GPE
    
Notice how the model correctly predicted the labels of Google and Mountain View but mislabeled Sundar Pichai as 
an organization. As discussed in the video, the predictions of the model depend strongly on the data it is trained on. 
It is possible to train spaCy models on your custom data. You will learn to do this in more advanced NLP courses.
"""

In [None]:
# Identifying people mentioned in a news article

def find_persons(text):
    """Return the text of every entity labelled 'PERSON' in ``text``.

    ``text`` is processed with the notebook-level ``nlp`` pipeline; the
    result is a list in the order the entities appear in ``doc.ents``.
    """
    persons = []
    for ent in nlp(text).ents:
        if ent.label_ == 'PERSON':
            persons.append(ent.text)
    return persons

print(find_persons(tc))

"""
['Sheryl Sandberg', 'Mark Zuckerberg']

The article was related to Facebook and our function correctly identified both the people mentioned. 
You can now see how NER could be used in a variety of applications. Publishers may use a technique like this to 
classify news articles by the people mentioned in them. A question answering system could also use something like 
this to answer questions such as 'Who are the people mentioned in this passage?'. With this, we come to an end of 
this chapter. In the next, we will learn how to conduct vectorization on documents.
"""

## N-Gram models


In [None]:
"""
Word vectors with a given vocabulary
You have been given a corpus of documents and you have computed the vocabulary of the corpus to be the following: 
V: a, an, and, but, can, come, evening, forever, go, i, men, may, on, the, women

Which of the following corresponds to the bag of words vector for the document "men may come and men may go but i go 
on forever"?

(0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 2, 2, 1, 0, 0)
"""

In [None]:
# BoW model for movie taglines

# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Create CountVectorizer object (default settings)
vectorizer = CountVectorizer()

# Learn the vocabulary of `corpus` and generate its matrix of word-count vectors
bow_matrix = vectorizer.fit_transform(corpus)

# Print the shape of bow_matrix
print(bow_matrix.shape)

In [None]:
# Analyzing dimensionality and preprocessing

# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Create CountVectorizer object (default settings)
vectorizer = CountVectorizer()

# Generate matrix of word vectors for `lem_corpus`
# (presumably the lemmatized version of the corpus — it is defined outside this cell)
bow_lem_matrix = vectorizer.fit_transform(lem_corpus)

# Print the shape of bow_lem_matrix
print(bow_lem_matrix.shape)

In [None]:
# Mapping feature indices with feature names

# Create CountVectorizer object
vectorizer = CountVectorizer()

# Generate matrix of word vectors
bow_matrix = vectorizer.fit_transform(corpus)

# Look up the vocabulary in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back to the old name so the
# cell still runs on pre-1.0 installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Convert bow_matrix into a DataFrame whose columns are the vocabulary terms
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=feature_names)

# Print bow_df
print(bow_df)

In [None]:
# BoW vectors for movie reviews

# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Create a CountVectorizer object that lowercases text and drops English stop words
vectorizer = CountVectorizer(lowercase=True, stop_words='english')

# Fit and transform X_train: the vocabulary is learned from the training set only
X_train_bow = vectorizer.fit_transform(X_train)

# Transform X_test with the already-fitted vocabulary (no re-fitting)
X_test_bow = vectorizer.transform(X_test)

# Print shape of X_train_bow and X_test_bow
print(X_train_bow.shape)
print(X_test_bow.shape)

"""
 You now have a good idea of preprocessing text and transforming them into their bag-of-words representation 
 using CountVectorizer. In this exercise, you have set the lowercase argument to True. However, note that this 
 is the default value of lowercase and passing it explicitly is not necessary. Also, note that both X_train_bow 
 and X_test_bow have 8158 features. There were words present in X_test that were not in X_train. CountVectorizer 
 chose to ignore them in order to ensure that the dimensions of both sets remain the same.
"""

In [1]:
# Predicting the sentiment of a movie review

# Create a MultinomialNB object
# NOTE(review): MultinomialNB is not imported anywhere in the visible notebook —
# confirm it is imported before this cell runs.
clf = MultinomialNB()

# Fit the classifier on the bag-of-words training matrix
clf.fit(X_train_bow, y_train)

# Measure the accuracy on the held-out test matrix
accuracy = clf.score(X_test_bow, y_test)
print("The accuracy of the classifier on the test set is %.3f" % accuracy)

# Predict the sentiment of a negative review.
# The review is wrapped in a list because transform() expects a collection of
# documents; [0] takes the single prediction back out.
review = "The movie was terrible. The music was underwhelming and the acting mediocre."
prediction = clf.predict(vectorizer.transform([review]))[0]
print("The sentiment predicted by the classifier is %i" % (prediction))

In [None]:
# n-gram models for movie tag lines

# Generate n-grams up to n=1 (unigrams only)
vectorizer_ng1 = CountVectorizer(ngram_range=(1,1))
ng1 = vectorizer_ng1.fit_transform(corpus)

# Generate n-grams up to n=2 (unigrams and bigrams)
vectorizer_ng2 = CountVectorizer(ngram_range=(1,2))
ng2 = vectorizer_ng2.fit_transform(corpus)

# Generate n-grams up to n=3 (unigrams, bigrams and trigrams)
vectorizer_ng3 = CountVectorizer(ngram_range=(1, 3))
ng3 = vectorizer_ng3.fit_transform(corpus)

# Print the number of features (matrix columns) for each model
print("ng1, ng2 and ng3 have %i, %i and %i features respectively" % (ng1.shape[1], ng2.shape[1], ng3.shape[1]))

"""
    ng1, ng2 and ng3 have 6614, 37100 and 76881 features respectively

 You now know how to generate n-gram models containing higher order n-grams. Notice that ng2 has over 37,000 features 
 whereas ng3 has over 76,000 features. This is much greater than the 6,000 dimensions obtained for ng1. As the n-gram 
 range increases, so does the number of features, leading to increased computational costs and a problem known as the 
 curse of dimensionality.
"""

In [None]:
# Higher order n-grams for sentiment analysis

# Define an instance of MultinomialNB
clf_ng = MultinomialNB()

# Fit the classifier
# NOTE(review): X_train_ng, X_test_ng and ng_vectorizer are not created in the
# visible notebook — they must already exist in the namespace.
clf_ng.fit(X_train_ng, y_train)

# Measure the accuracy
accuracy = clf_ng.score(X_test_ng, y_test)
print("The accuracy of the classifier on the test set is %.3f" % accuracy)

# Predict the sentiment of a negative review (single-document list in, first prediction out)
review = "The movie was not good. The plot had several holes and the acting lacked panache."
prediction = clf_ng.predict(ng_vectorizer.transform([review]))[0]
print("The sentiment predicted by the classifier is %i" % (prediction))

"""
The accuracy of the classifier on the test set is 0.758
The sentiment predicted by the classifier is 0

You're now adept at performing sentiment analysis using text. Notice how this classifier performs slightly 
better than the BoW version. Also, it succeeds at correctly identifying the sentiment of the mini-review as 
negative. In the next chapter, we will learn more complex methods of vectorizing textual data.
"""

In [None]:
# Comparing performance of n-gram models

# Start the timer before the split, so splitting, vectorizing and fitting are all timed
start_time = time.time()
# Splitting the data into training and test sets (50/50, stratified on sentiment, fixed seed)
train_X, test_X, train_y, test_y = train_test_split(df['review'], df['sentiment'], test_size=0.5, random_state=42, stratify=df['sentiment'])

# Generating ngrams (unigrams only); the names train_X/test_X are reused for the matrices
vectorizer = CountVectorizer(ngram_range=(1,1))
train_X = vectorizer.fit_transform(train_X)
test_X = vectorizer.transform(test_X)

# Fit classifier
clf = MultinomialNB()
clf.fit(train_X, train_y)

# Print accuracy, time and number of dimensions.
# The elapsed time is the first tuple element, so it is computed before
# clf.score() runs and does not include scoring.
print("The program took %.3f seconds to complete. The accuracy on the test set is %.2f. The ngram representation had %i features." % (time.time() - start_time, clf.score(test_X, test_y), train_X.shape[1]))

"""
The program took 0.186 seconds to complete. The accuracy on the test set is 0.75. The ngram representation had 12347 features.

"""

##################################################################################

# Same experiment as the unigram run, changing only ngram_range to (1,3).
# Start the timer before the split, so splitting, vectorizing and fitting are all timed
start_time = time.time()
# Splitting the data into training and test sets (50/50, stratified on sentiment, fixed seed)
train_X, test_X, train_y, test_y = train_test_split(df['review'], df['sentiment'], test_size=0.5, random_state=42, stratify=df['sentiment'])

# Generating ngrams (unigrams, bigrams and trigrams)
vectorizer = CountVectorizer(ngram_range=(1,3))
train_X = vectorizer.fit_transform(train_X)
test_X = vectorizer.transform(test_X)

# Fit classifier
clf = MultinomialNB()
clf.fit(train_X, train_y)

# Print accuracy, time and number of dimensions.
# The elapsed time is the first tuple element, so it is computed before
# clf.score() runs and does not include scoring.
print("The program took %.3f seconds to complete. The accuracy on the test set is %.2f. The ngram representation had %i features." % (time.time() - start_time, clf.score(test_X, test_y), train_X.shape[1]))

"""
The program took 2.320 seconds to complete. The accuracy on the test set is 0.77. The ngram representation had 178240 features.

The program took around 0.2 seconds in the case of the unigram model and more than 10 times longer for the higher order
n-gram model. The unigram model had over 12,000 features whereas the n-gram model for up to n=3 had over 178,000! 
Despite taking higher computation time and generating more features, the classifier only performs marginally better 
in the latter case, producing an accuracy of 77% in comparison to the 75% for the unigram model.

"""

## TF-IDF and similarity scores


In [None]:
"""
tf-idf weight of commonly occurring words
The word bottle occurs 5 times in a particular document D and also occurs in every document of the corpus. What is the tf-idf weight of bottle in D?

0
"""

In [None]:
# tf-idf vectors for TED talks

# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TfidfVectorizer object (default settings)
vectorizer = TfidfVectorizer()

# Generate matrix of word vectors
# NOTE(review): other cells in this notebook treat `ted` as a DataFrame
# (ted['transcript']). Iterating a DataFrame yields its column names, so if `ted`
# is that same object this vectorizes the column labels rather than the talks —
# confirm `ted` is the sequence of transcript strings here.
tfidf_matrix = vectorizer.fit_transform(ted)

# Print the shape of tfidf_matrix
print(tfidf_matrix.shape)

In [None]:
"""
Cosine similarity: https://www.machinelearningplus.com/nlp/cosine-similarity/

Range of cosine scores
Which of the following is a possible cosine score for a pair of document vectors?

0.86
"""

In [None]:
# Computing dot product

# Initialize numpy vectors
A = np.array([1,3])
B = np.array([-2,2])

# Compute dot product: 1*(-2) + 3*2 = 4
dot_prod = np.dot(A, B)

# Print dot product
print(dot_prod)

In [2]:
# Cosine similarity matrix of a corpus

# Initialize an instance of tf-idf Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Generate the tf-idf vectors for the corpus
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Compute and print the cosine similarity matrix: every document against every
# other document, so entry [i, j] compares documents i and j
# NOTE(review): cosine_similarity is not imported in the visible notebook.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(cosine_sim)

In [None]:
# Comparing linear_kernel and cosine_similarity

# Record start time
start = time.time()

# Compute cosine similarity matrix with cosine_similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Print cosine similarity matrix
print(cosine_sim)

# Print time taken (note: the measured interval includes the print above)
print("Time taken: %s seconds" %(time.time() - start))

"""
[[1.         0.         0.         ... 0.         0.         0.        ]
     [0.         1.         0.         ... 0.         0.         0.        ]
     [0.         0.         1.         ... 0.         0.01418221 0.        ]
     ...
     [0.         0.         0.         ... 1.         0.01589009 0.        ]
     [0.         0.         0.01418221 ... 0.01589009 1.         0.        ]
     [0.         0.         0.         ... 0.         0.         1.        ]]
    Time taken: 0.32955265045166016 seconds
"""

##################################################################################

# Record start time
start = time.time()

# Compute cosine similarity matrix with linear_kernel (plain dot products).
# NOTE(review): this equals cosine similarity only if the rows of tfidf_matrix
# have unit length — presumably true because TfidfVectorizer L2-normalises by
# default; confirm if the vectorizer settings change.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Print cosine similarity matrix
print(cosine_sim)

# Print time taken (note: the measured interval includes the print above)
print("Time taken: %s seconds" %(time.time() - start))

"""
[[1.         0.         0.         ... 0.         0.         0.        ]
     [0.         1.         0.         ... 0.         0.         0.        ]
     [0.         0.         1.         ... 0.         0.01418221 0.        ]
     ...
     [0.         0.         0.         ... 1.         0.01589009 0.        ]
     [0.         0.         0.01418221 ... 0.01589009 1.         0.        ]
     [0.         0.         0.         ... 0.         0.         1.        ]]
    Time taken: 0.32399821281433105 seconds
"""

"""
Notice how both linear_kernel and cosine_similarity produced the same result. However, linear_kernel took a 
smaller amount of time to execute. When you're working with a very large amount of data and your vectors are 
in the tf-idf representation, it is good practice to default to linear_kernel to improve performance. 
(NOTE: In case you see linear_kernel taking more time, it's because the dataset we're dealing with is extremely 
small and Python's time module is incapable of capturing such minute time differences accurately)
"""

In [None]:
# Plot recommendation engine

# Initialize the TfidfVectorizer, dropping English stop words
tfidf = TfidfVectorizer(stop_words='english')

# Construct the TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(movie_plots)

# Generate the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Generate recommendations
# NOTE(review): get_recommendations and `indices` are defined in a cell that
# appears later in this notebook ("The recommender function"); that cell must
# be run before this one.
print(get_recommendations('The Dark Knight Rises', cosine_sim, indices))

"""
1                              Batman Forever
    2                                      Batman
    3                              Batman Returns
    8                  Batman: Under the Red Hood
    9                            Batman: Year One
    10    Batman: The Dark Knight Returns, Part 1
    11    Batman: The Dark Knight Returns, Part 2
    5                Batman: Mask of the Phantasm
    7                               Batman Begins
    4                              Batman & Robin
    Name: title, dtype: object
    
You've just built your very first recommendation system. Notice how the recommender correctly identifies 'The Dark Knight Rises'
as a Batman movie and recommends other Batman movies as a result. This system is, of course, very primitive and there are a 
host of ways in which it could be improved. One method would be to look at the cast, crew and genre in addition to the plot
to generate recommendations. We will not be covering this in this course but you have all the tools necessary 
to accomplish this. Do give it a try!
"""

In [None]:
# The recommender function

# Generate mapping between titles and index.
# drop_duplicates() only removes repeated *values* (the row labels); a title that
# occurs more than once survives it and would make indices[title] return a
# Series instead of a single position. De-duplicate on the title index as well,
# keeping the first occurrence of each title.
indices = pd.Series(metadata.index, index=metadata['title']).drop_duplicates()
indices = indices[~indices.index.duplicated(keep='first')]

def get_recommendations(title, cosine_sim, indices, titles=None, top_n=10):
    """Return the titles most similar to ``title``, most similar first.

    Args:
        title: Title to look up in ``indices``.
        cosine_sim: 2-D similarity scores; row ``i`` holds the similarity of
            item ``i`` to every item, in the same positional order as ``titles``.
        indices: Mapping (pandas Series) from title to position in ``cosine_sim``.
        titles: Series of titles aligned positionally with ``cosine_sim``.
            Defaults to the notebook-level ``metadata['title']``, which is
            what the function always used before this parameter existed.
        top_n: Maximum number of recommendations to return (default 10).

    Returns:
        A pandas Series with up to ``top_n`` titles. The queried item itself
        is never part of the result.

    Raises:
        KeyError: if ``title`` is not present in ``indices``.
    """
    if titles is None:
        titles = metadata['title']

    # Get index of the movie that matches title
    idx = indices[title]
    # A title that occurs several times yields a Series of positions; use the first
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every *other* movie with its similarity score. The query is excluded
    # by position rather than by assuming it sorts first: another item with an
    # equal score (e.g. an identical plot) could otherwise push the query into
    # the results.
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
    # Sort the movies based on the similarity scores, highest first
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Get the positions of the top_n most similar movies
    movie_indices = [i for i, _ in sim_scores[:top_n]]
    # Return the corresponding titles
    return titles.iloc[movie_indices]

In [None]:
# TED talk recommender

# Initialize the TfidfVectorizer, dropping English stop words
tfidf = TfidfVectorizer(stop_words='english')

# Construct the TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(transcripts)

# Generate the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Generate recommendations
# NOTE(review): get_recommendations reads titles from the global `metadata`, and
# `indices` is built from it in "The recommender function" cell. Presumably both
# were rebuilt for the TED data before this cell — verify that they line up
# with `transcripts`.
print(get_recommendations('5 ways to kill your dreams', cosine_sim, indices))

In [None]:
# Generating word vectors

# Create the doc object
# NOTE(review): the model behind `nlp` is not visible in this cell; token
# similarity presumably relies on a model that ships word vectors — confirm.
doc = nlp(sent)

# Compute pairwise similarity scores for every ordered pair of tokens
# (each token is also compared with itself)
for token1 in doc:
  for token2 in doc:
    print(token1.text, token2.text, token1.similarity(token2))
    
"""
I I 1.0
    I like 0.023032807
    I apples 0.10175116
    I and 0.047492094
    I oranges 0.10894456
    like I 0.023032807
    like like 1.0
    like apples 0.015370452
    like and 0.189293
    like oranges 0.021943133
    apples I 0.10175116
    apples like 0.015370452
    apples apples 1.0
    apples and -0.17736834
    apples oranges 0.6315578
    and I 0.047492094
    and like 0.189293
    and apples -0.17736834
    and and 1.0
    and oranges 0.018627528
    oranges I 0.10894456
    oranges like 0.021943133
    oranges apples 0.6315578
    oranges and 0.018627528
    oranges oranges 1.0
    
Notice how the words 'apples' and 'oranges' have the highest pairwise similarity score. This is expected as 
they are both fruits and are more related to each other than any other pair of words.
"""

In [3]:
# Computing similarity of Pink Floyd songs

# Create Doc objects
# NOTE(review): the saved output of this notebook ends with
# "NameError: name 'nlp' is not defined" — `nlp` must be loaded (as in the
# earlier spacy.load cells) before this cell is run.
mother_doc = nlp(mother)
hopes_doc = nlp(hopes)
hey_doc = nlp(hey)

# Print similarity between mother and hopes
print(mother_doc.similarity(hopes_doc))

# Print similarity between mother and hey
print(mother_doc.similarity(hey_doc))

"""
<script.py> output:
    0.6006234924640204
    0.9135920924498578
    
Notice that 'Mother' and 'Hey You' have a similarity score of 0.9 whereas 'Mother' and 'High Hopes' 
have a score of only 0.6. This is probably because 'Mother' and 'Hey You' were both songs from the same album 
'The Wall' and were penned by Roger Waters. On the other hand, 'High Hopes' was a part of the album 'The Division Bell'
with lyrics by David Gilmour and his wife, Polly Samson. Treat yourself by listening to these songs. 
They're some of the best!
"""

NameError: name 'nlp' is not defined