In [2]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Path of the file on Google Drive
file_path = '/content/drive/MyDrive/NLP_Data/anat19.txt'

In [4]:
# TBD - SYSTEM ARGUMENT

# Open the file and read its contents
with open(file_path, 'r') as file:
    raw_text = file.read()

# Print the first 1000 characters of raw_text
# print(raw_text[:1000])

In [5]:
# Importing NLTK
import nltk
import random
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [6]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

# Tokenize the raw text
tokenized_raw_text = word_tokenize(raw_text)

# Display the first 10 tokens
print(tokenized_raw_text[:10])

['Heart', 'Anatomy', 'The', 'vital', 'importance', 'of', 'the', 'heart', 'is', 'obvious']


In [7]:
# Lexical diversity code adapted from https://github.com/kjmazidi/NLP/blob/master/Part_2-Words/Chapter_05_words/5.1_Words1.ipynb

# Finding the total number of tokens
print("\nThe number of tokens in anat19.txt: ", len(tokenized_raw_text))

# Finding the total number of unique tokens
unique_tokens = set(tokenized_raw_text)
print("\nThe number of unique tokens in anat19.txt:", len(unique_tokens))

# Printing the first 5 unique tokens
print("\nThe first 5 unique tokens in anat19.txt:", list(unique_tokens)[:5])


The number of tokens in anat19.txt:  20218

The number of unique tokens in anat19.txt: 3096

The first 5 unique tokens in anat19.txt: ['primitive', 'leaflets', 'inexpensive', 'prognosis', 'makes']


In [8]:
# Calculating lexical diversity
print("\nLexical diversity of anat19.txt: %.2f" % (len(unique_tokens) / len(tokenized_raw_text)))


Lexical diversity of anat19.txt: 0.15


In [9]:
def preprocess_text(raw_text):

  # Tokenize the raw text and lowercase
  tokenized_raw_text = word_tokenize(raw_text.lower())
  # print(tokenized_raw_text)

  # Filter out tokens to tokens that are alpha
  alpha_tokens = [token for token in tokenized_raw_text if token.isalpha()]
  # print(alpha_tokens)

  # Filter out tokens to tokens that not in the in the stopword list
  stopwords_tokens = [token for token in alpha_tokens if token not in stopwords.words('english')]
  # print(stopwords_tokens)

  # Filter out tokens to tokens with a length > 5
  length_greater_than_5_tokens = [token for token in stopwords_tokens if len(token) > 5]
  # print(length_greater_than_5_tokens)

  # Lemmatize the tokens
  lemmatizer = WordNetLemmatizer()
  lemmatized_tokens = [lemmatizer.lemmatize(token) for token in length_greater_than_5_tokens]
  # print(lemmatized_tokens)

  # Create a set to get unique lemmas
  unique_lemmatized_tokens = set(lemmatized_tokens)
  # print(unique_lemmatized_tokens)
  # print(len(unique_lemmatized_tokens))

  # Do POS tagging on the unique lemmas
  tags = pos_tag(unique_lemmatized_tokens)
  print("The first 20 words and their tag:", tags[:20])

  # Filter the unique lemmas to only nouns
  nouns = [word for word, tag in tags if tag == 'NN']
  #print(nouns)

  # Printing the number of non unique tokens from step a.
  print("Number of tokens that are alpha, not in the NLTK stopword list, and have length > 5: ", len(length_greater_than_5_tokens), "\n")

  # Printing the number of nouns.
  print("Number of nouns: ", len(nouns), "\n")

  # Misunderstanding of the instructions - Scrapped this code
  # --------------------------------------------------------------
  # Get a random non unique token
  # random_non_unique_token = random.choice(length_greater_than_5_tokens)
  # print(random_non_unique_token)

  # Get the first random noun
  # random_noun_1 = random.choice(nouns)
  # print(random_noun_1)

  # Get the second random noun
  # random_noun_2 = random.choice(nouns)
  # print(random_noun_2)
  # --------------------------------------------------------------

  # Return list of non unique tokens, and list of nouns
  return length_greater_than_5_tokens, nouns

tokens, nouns = preprocess_text(raw_text)

# Creating a dictionary of {noun: count of noun}
noun_count_dictionary = {noun: tokens.count(noun) for noun in nouns}
# print(noun_count_dictionary)

# Adapted this code to sort https://www.geeksforgeeks.org/different-ways-of-sorting-dictionary-by-values-and-reverse-sorting-by-values/#
# Sorting the noun
noun_count_sorted = sorted(noun_count_dictionary.items(), key = lambda kv: kv[1], reverse=True)
# print(noun_count_sorted)

# Initialized the list of nouns
nouns_list = []

# Printing the 50 most common words and their counts
for key_value in noun_count_sorted[:50]:

  # Extract the noun, count from the tuple (key_value)
  common_noun, common_noun_count = key_value

  # Print the sorted nouns and their counts
  print(common_noun, ":", common_noun_count)
  nouns_list.append(common_noun)

# print(nouns_list)

The first 20 words and their tag: [('primitive', 'JJ'), ('prognosis', 'NN'), ('inexpensive', 'JJ'), ('training', 'NN'), ('tensing', 'VBG'), ('report', 'NN'), ('administering', 'VBG'), ('stimulus', 'JJ'), ('forceful', 'JJ'), ('technique', 'NN'), ('serf', 'NN'), ('elapsed', 'VBD'), ('drastically', 'RB'), ('mitosis', 'JJ'), ('formation', 'NN'), ('bypass', 'NN'), ('leading', 'VBG'), ('revealed', 'JJ'), ('delicate', 'JJ'), ('thyroid', 'NN')]
Number of tokens that are alpha, not in the NLTK stopword list, and have length > 5:  7020 

Number of nouns:  665 

muscle : 75
contraction : 70
pressure : 43
increase : 33
stimulation : 33
septum : 32
include : 27
calcium : 26
depolarization : 24
oxygen : 24
percent : 22
system : 22
conduction : 20
semilunar : 20
diastole : 19
tissue : 18
opening : 17
activity : 17
contract : 16
chordae : 16
function : 16
pericardium : 16
myocardium : 16
minute : 16
period : 15
patient : 14
disease : 13
exercise : 13
supply : 13
treatment : 13
pattern : 13
potential :

In [10]:
# Definition: Print word with spaces in between
def print_word_with_spaces(word):
  for letter in word:
    print(letter, "", end='')

In [11]:
def word_game(nouns_list):
  # Choose a random noun in the list
  random_noun = random.choice(nouns_list)

  # Uncomment to see the random noun
  print("Random Noun: ", random_noun)

  # Initial Points
  points = 5

  # Create the current guessed words (which is _ * length of the word)
  current_guessed_word = []
  for i in range(len(random_noun)):
    current_guessed_word.append("_")

  print_word_with_spaces(current_guessed_word)

  while(1):
    if points < 0:
      print("\nGAME OVER, you ran out of points.")
      break

    # Check to see if the user has won
    elif '_' not in current_guessed_word:
      print("\nCongrats. You've won with", points, "points left.")
      break
    else:
      # Ask the user for a letter
      user_letter = input("\nGuess a letter (enter \'!\' to exit): ")

      # Checking if the input is "!" and exiting if it is
      if user_letter == "!":
        print("Exiting now.")
        break

      # Checking if the input is an alpha char with size 1 (one letter)
      elif not user_letter.isalpha() or len(user_letter)!=1:
        print("\nPlease input one letter character.\nScore is", points)

      # Checking if user has already guessed that letter
      elif user_letter in current_guessed_word:
        print("\nYou already guessed that letter. Try again.\nScore is", points)

      # Checking if letter is in the random noun
      elif user_letter in random_noun:
        points += 1
        print("\nRight!\nScore is", points)

        # Replacing current_guessed_word with the current guess letter
        for letter_index in range(len(random_noun)):
          if random_noun[letter_index] == user_letter:
            current_guessed_word[letter_index] = random_noun[letter_index]

        # print(current_guessed_word)

      else:
        points -= 1
        print("Sorry, guess again. Score is", points)

      # Print the current word
      print_word_with_spaces(current_guessed_word)

In [None]:
# Driver of game
# One instance of the game
print("Let's play a word guessing game!")
word_game(nouns_list)


# Prompt
while(1):
  user_prompt = input("\nDo you want to play again? Enter \'@\' if No. Enter anything else if Yes: ")
  if user_prompt == "@":
    print("Exiting now.")
    break
  else:
    print("Guess another noun.")
    word_game(nouns_list)

Let's play a word guessing game!
Random Noun:  semilunar
_ _ _ _ _ _ _ _ _ 
Guess a letter (enter '!' to exit): s

Right!
Score is 6
s _ _ _ _ _ _ _ _ 
Guess a letter (enter '!' to exit): e

Right!
Score is 7
s e _ _ _ _ _ _ _ 
Guess a letter (enter '!' to exit): m

Right!
Score is 8
s e m _ _ _ _ _ _ 
Guess a letter (enter '!' to exit): i

Right!
Score is 9
s e m i _ _ _ _ _ 
Guess a letter (enter '!' to exit): l

Right!
Score is 10
s e m i l _ _ _ _ 
Guess a letter (enter '!' to exit): u

Right!
Score is 11
s e m i l u _ _ _ 