# Dataset

In [4]:
! pip install kaggle



In [5]:
! mkdir ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [6]:
! cp kaggle.json ~/.kaggle/

In [7]:
! chmod 600 ~/.kaggle/kaggle.json

In [10]:
! kaggle datasets download septa97/100k-courseras-course-reviews-dataset

Downloading 100k-courseras-course-reviews-dataset.zip to /content
 74% 9.00M/12.2M [00:00<00:00, 45.6MB/s]
100% 12.2M/12.2M [00:00<00:00, 57.7MB/s]


In [11]:
! unzip 100k-courseras-course-reviews-dataset.zip

Archive:  100k-courseras-course-reviews-dataset.zip
  inflating: reviews.csv             
  inflating: reviews_by_course.csv   


In [12]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
df = pd.read_csv('/content/reviews.csv')
df.drop(columns="Id", inplace=True)
df

Unnamed: 0,Review,Label
0,good and interesting,5
1,"This class is very helpful to me. Currently, I...",5
2,like!Prof and TAs are helpful and the discussi...,5
3,Easy to follow and includes a lot basic and im...,5
4,Really nice teacher!I could got the point eazl...,4
...,...,...
107013,Trendy topic with talks from expertises in the...,4
107014,"Wonderful! Simple and clear language, good ins...",5
107015,an interesting and fun course. thanks. dr quincy,5
107016,"very broad perspective, up to date information...",4


In [136]:
#The dataset we're using has over 100k entries (that's a bit too big) so we'll take just a sample of it
def reduce_size(x):
    if len(x.index) > 30000:
        return x.sample(n=int(len(x.index)/3))
    elif len(x.index) > 15000:
        return x.sample(n=int(len(x.index)/1.5))
    else:
        return x

df = df.groupby('Label').apply(reduce_size).reset_index(drop=True)
df['Label'].value_counts()

5    17594
4    12036
3     5071
1     2469
2     2251
Name: Label, dtype: int64

In [32]:
txt_list=df['Review'].to_list()

In [33]:
print(len(txt_list))

48218


# Text Summarizing

In [145]:
#To ensure the scrapped textual data is as noise-free as possible, 
#we’ll perform some basic text cleaning.  To assist us to do the processing, 
#we’ll import a list of stopwords from the nltk library
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [146]:
# Sample sentences tokenizer
from nltk.tokenize import word_tokenize, sent_tokenize

sentences = sent_tokenize(txt_list[15])
print('sentences',sentences)

sentences ["is a very very boring curse, don't recommend anyone, please for your good at everyboduy else kill this course, don't embarrass yourself and the universities that represents"]


In [147]:
#create a dictionary table having the frequency of occurrence of each of the 
#words in the text. We’ll loop through the text and the corresponding words 
#to eliminate any stop words.

# Removing stop words
def _create_dictionary_table(text_string) -> dict:
    stop_words = set(stopwords.words("english"))
      
    words = word_tokenize(text_string)
      
      # Reducing words to their root form
    stem = PorterStemmer()
      
      # Creating dictionary for the word frequency table
    frequency_table = dict()
    for wd in words:
      wd = stem.stem(wd)
      if wd in stop_words:
        continue
      if wd in frequency_table:
        frequency_table[wd] += 1
      else:
        frequency_table[wd] = 1
    return(frequency_table)

To evaluate the score for every sentence in the text, we’ll be analyzing the frequency of occurrence of each term. In this case, we’ll be scoring each sentence by its words; that is, adding the frequency of each important word found in the sentence.

In [148]:
# Finding the weighted frequencies of the sentences
def _calculate_sentence_scores(sentences, frequency_table) -> dict:   

    # Algorithm for scoring a sentence by its words
    sentence_weight = dict()

    for sentence in sentences:
        sentence_wordcount = (len(word_tokenize(sentence)))
        sentence_wordcount_without_stop_words = 0
        for word_weight in frequency_table:
            if word_weight in sentence.lower():
                sentence_wordcount_without_stop_words += 1
                if sentence[:7] in sentence_weight:
                    sentence_weight[sentence[:7]] += frequency_table[word_weight]
                else:
                    sentence_weight[sentence[:7]] = frequency_table[word_weight]

        sentence_weight[sentence[:7]] = sentence_weight[sentence[:7]] /        sentence_wordcount_without_stop_words
      
    return sentence_weight

#Importantly, to ensure long sentences do not have unnecessarily high scores 
#over short sentences, we divided each score of a sentence by the number of words 
#found in that sentence. Also, to optimize the dictionary’s memory, we 
#arbitrarily added sentence[:7], which refers to the first 7 characters in each sentence.

To further tweak the kind of sentences eligible for summarization, we’ll create the average score for the sentences. With this threshold, we can avoid selecting the sentences with a lower score than the average score.

In [149]:
#Calculating the threshold of the sentences
def _calculate_average_score(sentence_weight) -> int:
   
    #calculating the average score for the sentences
    sum_values = 0
    for entry in sentence_weight:
        sum_values += sentence_weight[entry]

    #getting sentence average value from source text
    average_score = (sum_values / len(sentence_weight))

    return average_score

In [150]:
# Getting the summary
def _get_article_summary(sentences, sentence_weight, threshold):
    sentence_counter = 0
    article_summary = ''
    # generate summary 
    for sentence in sentences:
        if sentence[:7] in sentence_weight and sentence_weight[sentence[:7]] >= (threshold):
            article_summary += " " + sentence
            sentence_counter += 1

    return article_summary

In [151]:
def _run_article_summary(article):
    
    #creating a dictionary for the word frequency table
    frequency_table = _create_dictionary_table(article)

    #tokenizing the sentences
    sentences = sent_tokenize(article)

    #algorithm for scoring a sentence by its words
    sentence_scores = _calculate_sentence_scores(sentences, frequency_table)

    #getting the threshold
    threshold = _calculate_average_score(sentence_scores)

    #producing the summary
    article_summary = _get_article_summary(sentences, sentence_scores, 1.00 * threshold)

    return article_summary

In [152]:
# limit the length of content so that the text summarization is more obvious.
content=[]
for i in range(len(txt_list)):
  if len(txt_list[i])>88:
    content.append(txt_list[i])


In [153]:
# Since we added sentence[:7] for optimizing dict memory, the larger dataset will
# cause error.
result=[]
for i in range(0,2073):
  summary_results = _run_article_summary(content[i])
  #print(summary_results)
  result.append(summary_results)

In [154]:
# Form a dataframe for a better view of effect of text summarization.
content_new=content[0:2073]
new_df=pd.DataFrame({'before': content_new,
     'after': result
    })

In [155]:
new_df

Unnamed: 0,before,after
0,This course doesn't contain any new informatio...,This course doesn't contain any new information.
1,I do not find very interesting this course. to...,too many interviews.
2,This course doesn't contain any new informatio...,This course doesn't contain any new information.
3,"First of all, I really enjoyed Professor Ittne...","First of all, I really enjoyed Professor Ittn..."
4,No practical knowledge learnd and no clear exa...,No practical knowledge learnd and no clear ex...
...,...,...
2068,It is too small and shallow. Whole specializat...,It is too small and shallow.
2069,"The content is OK, but that's about it. Near z...","The content is OK, but that's about it. Stude..."
2070,The quizzes are not set up correctly and you g...,The content is good and that is why I gave it...
2071,the course really covered information I knew b...,the course really covered information I knew ...
