# Project 4

In [None]:
# Author: Jemil Patel

!pip install readability # installing readability package

# required imports used in the program
import operator
import readability
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

def strip_punctuations(words):
  """Trim leading and trailing punctuation from every token in *words*.

  A character counts as part of the word when ``isalpha()`` or ``isdigit()``
  is true for it. Everything before the first such character and after the
  last one is removed; characters in between (e.g. the apostrophe in
  ``don't``) are kept.

  Args:
    words: iterable of string tokens.

  Returns:
    A new list of trimmed tokens, in the original order. Tokens that contain
    no letter or digit at all (``"--"``, ``"..."``, ``""``) are omitted.
  """

  new_words = []

  for word in words:
    # Positions of every letter/digit; the first and last bound the real word.
    positions = [
      index for index, char in enumerate(word)
      if char.isalpha() or char.isdigit()
    ]

    # Without this guard a pure-punctuation token would fall back to the
    # slice [0:1] and leak its first punctuation mark into the result.
    if not positions:
      continue

    new_words.append(word[positions[0]:positions[-1] + 1])

  return new_words

def average_word_length(words):
  """Return the mean number of characters per word in *words*.

  Args:
    words: list of strings.

  Returns:
    The average length as a float, or 0 when *words* is empty (there is
    nothing to average, and dividing by zero would raise).
  """

  if not words:
    return 0

  # Builtin sum() over a generator; avoids shadowing ``sum`` with a local.
  total_length = sum(len(word) for word in words)

  return total_length / len(words)

def word_frequency(words):
  """Count case-insensitive occurrences of each word in *words*.

  The words 'a', 'an', 'the' and 'and' are ignored. Keys of the result are
  lower-cased words.

  Args:
    words: iterable of string tokens.

  Returns:
    A dict mapping word -> count, ordered from most to least frequent.
    Words with equal counts keep the order in which they first appeared.
  """

  excluded = ('a', 'an', 'the', 'and')
  counts = {}

  for raw in words:
    token = raw.lower()
    if token in excluded:
      continue
    counts[token] = counts.get(token, 0) + 1

  # Sorting on the negated count gives descending order while the stable
  # sort preserves first-seen order among ties.
  ranked = sorted(counts.items(), key=lambda pair: -pair[1])

  return dict(ranked)

def word_length(words):
  """Map each distinct lower-cased word in *words* to its length.

  Args:
    words: iterable of string tokens.

  Returns:
    A dict mapping word -> number of characters, ordered from longest to
    shortest. Words of equal length keep the order in which they first
    appeared.
  """

  sizes = {}

  for token in (raw.lower() for raw in words):
    # Only the first sighting of a word records its length.
    sizes.setdefault(token, len(token))

  # Negated length as key: descending order, ties stay in first-seen order.
  ranked = sorted(sizes.items(), key=lambda pair: -pair[1])

  return dict(ranked)

def number_of_sentences(words):
  """Estimate the number of sentences in *words*.

  Every element that contains at least one of '.', '?' or '!' counts as one
  sentence ending. An element is counted once no matter how many of these
  marks it holds.

  Args:
    words: a list of tokens, or a string (which is then examined character
      by character).

  Returns:
    The number of elements containing a sentence-ending mark.
  """

  terminators = ('.', '?', '!')

  return sum(
    1 for item in words
    if any(mark in item for mark in terminators)
  )

print()

# Keep prompting until the user names a file that can be opened. The whole
# file is read once here and reused for both the word statistics and the
# readability measures, so it never has to be reopened.
while True:
  name = input('Enter a valid filename (with extension): ')
  try:
    with open(name, 'r') as speech:
      text = speech.read()
  except FileNotFoundError:
    print('No such file found! Please try again')
    continue
  break

# extracting words from the text (split on any run of whitespace)
words = text.split()

# Declaring and initializing necessary variables

# Word and sentence counts use the raw tokens, because the sentence
# terminators are removed by strip_punctuations() below.
word_count = len(words)
sentence_count = number_of_sentences(words)

words = strip_punctuations(words)

# Guard both averages so an empty file, or a text without any '.', '?' or
# '!', reports 0 instead of raising ZeroDivisionError.
avg_word = average_word_length(words) if words else 0
avg_sentence = word_count / sentence_count if sentence_count else 0

word_freq = word_frequency(words)
word_len = word_length(words)

print('\n-------------------------MY CALCULATIONS-------------------------')

print(f'Total word count : {word_count}')
print(f'Total sentence count : {sentence_count}\n')

print(f'Average letters per word : {avg_word}')
print(f'Average words per sentence : {avg_sentence}\n')

print('15 most frequently used words are (in descending order)')
# word_freq is already ordered by descending frequency.
for k, v in list(word_freq.items())[:15]:
  print(f'{k} : {v} occurences')

print('\n10 longest words are (in ascending order)')
# word_len is ordered longest-first: keep the first 10 purely alphabetic
# words (skipping anything with a special character), then reverse them so
# they print shortest-to-longest.
keys = [k for k in word_len if k.isalpha()][:10]
keys.reverse()
for k in keys:
  print(f'{k} : {word_len[k]} letters')

print('\n---------------------------READABILITY---------------------------')

if text.strip():
  results = readability.getmeasures(text) # storing the readability results in a variable

  for x in results:
    print(f'{x} :')
    for y in results[x]:
      print(f'    {y} : {results[x][y]}')
else:
  print('The file contains no text to analyse')

print('\n----------------------------WORD CLOUD---------------------------')

# generate_from_frequencies() needs at least one word, so skip the cloud
# when nothing countable was found.
if word_freq:
  wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate_from_frequencies(word_freq) # creating a word cloud based on the frequency of each word
  plt.figure()
  plt.imshow(wordcloud, interpolation="bilinear")
  plt.axis("off")
  plt.show()

  wordcloud.to_file("patel_wordcloud.png") # saving the word cloud as a png file
else:
  print('No words available to build a word cloud')