In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import io

In [None]:
# Importing the dataset
# files.upload() is specific to Google Colab. The next cell indexes its result by
# filename (uploaded['comments.csv']) and wraps the value in io.BytesIO, so the
# result is presumably a mapping of filename -> raw file bytes.
from google.colab import files
uploaded = files.upload()

In [None]:
# Load the uploaded CSV into a DataFrame and strip backslashes from the comment text.
# `uploaded` is indexed by filename and wrapped in BytesIO so pandas can read it like a file.
texts = pd.read_csv(io.BytesIO(uploaded['comments.csv']))
# Remove every backslash. The replacement is spelled as a literal (regex=False) single
# backslash so it behaves the same on every pandas version: the default of `regex`
# changed between releases, and with a literal match the old pattern r"\\" would
# only remove *pairs* of backslashes.
texts["comments"] = texts["comments"].str.replace("\\", "", regex=False)
texts

In [None]:
class LemmaTokenizer:
    """Callable tokenizer that converts a text into a list of spaCy lemmas.

    Instances are passed as ``tokenizer=`` to ``CountVectorizer`` further down,
    which calls them once per document.
    """

    def __init__(self):
        # Load the small English spaCy pipeline once per tokenizer instance.
        self.nlp = spacy.load("en_core_web_sm")

    def __call__(self, doc):
        """Return the lemmas of ``doc`` that are worth counting.

        Args:
            doc: The text to tokenize.

        Returns:
            list: ``lemma_`` of every token that is not punctuation, not
            whitespace, not number-like, and whose lemma is longer than two
            characters, in document order.
        """
        # Whitespace tokens (runs of spaces / newlines) are neither punctuation
        # nor numbers and can be longer than two characters, so they must be
        # excluded explicitly or they end up in the vocabulary.
        return [
            token.lemma_
            for token in self.nlp(doc)
            if not (token.is_punct or token.is_space or token.like_num)
            and len(token.lemma_) > 2
        ]

In [None]:
# NOTE(review): this cell defines LemmaTokenizer a second time; the definition in
# this cell is the one in effect afterwards. Consider deleting one of the two cells.
class LemmaTokenizer:
    """Lemmatizing tokenizer for use as ``CountVectorizer(tokenizer=...)``."""

    def __init__(self):
        # The spaCy English pipeline is loaded once, when the tokenizer is created.
        self.nlp = spacy.load("en_core_web_sm")

    def __call__(self, doc):
        """Tokenize ``doc`` and return the list of kept lemmas.

        A token is kept when it is not punctuation, not whitespace, not
        number-like, and its lemma has more than two characters.

        Args:
            doc: The text to tokenize.

        Returns:
            list: The kept lemmas, in document order.
        """
        res = []
        for token in self.nlp(doc):
            # Whitespace-only tokens (e.g. several consecutive newlines) would
            # otherwise pass the punctuation/number/length checks.
            if token.is_punct or token.is_space or token.like_num:
                continue
            if len(token.lemma_) > 2:
                res.append(token.lemma_)  # lemmatize
        return res

In [None]:
# Sanity check: run the lemmatizer on a single comment before vectorizing everything.
lt = LemmaTokenizer()
text = texts.loc[10, "comments"]  # the comment stored under index label 10
doc = lt(text)
doc

In [None]:
# Unigram word-count matrix: lemmatize each comment, drop English stop words,
# and count how often every remaining term occurs.
from sklearn.feature_extraction.text import CountVectorizer

ngram_range = (1, 1)  # single words only
vectorizer = CountVectorizer(
    tokenizer=LemmaTokenizer(),
    stop_words="english",
    ngram_range=ngram_range,
    lowercase=True,
)
X = vectorizer.fit_transform(texts["comments"])
# Expand the count matrix into a DataFrame whose columns are the vocabulary terms.
count_vect_df1 = pd.DataFrame(
    X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
count_vect_df1

In [None]:
# Total count of every word across all comments, most frequent first.
word_frequency = (
    count_vect_df1
    .sum(axis=0)
    .sort_values(ascending=False)
    .rename_axis("word")
    .reset_index(name="frequency")
)
word_frequency

In [None]:
# Bar chart of the 10 most frequent words.
# matplotlib renamed its bundled "seaborn" style sheet to "seaborn-v0_8" (3.6) and
# later removed the old name, so use whichever name this installation provides.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
fig, ax = plt.subplots(1, 1, figsize=(10, 6))
data = word_frequency.head(10)
g = sns.barplot(data=data,
                x="word",
                y="frequency",
                ax=ax
                )
# Tilt the labels so neighbouring words do not overlap.
ax.set_xticklabels(data["word"], ha='right', rotation=45)
plt.show()

In [None]:
# Bar chart showing the frequency of the top 20 words.

# matplotlib renamed its bundled "seaborn" style sheet to "seaborn-v0_8" (3.6) and
# later removed the old name, so use whichever name this installation provides.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
fig, ax = plt.subplots(1, 1, figsize=(23, 13))
data = word_frequency.head(20)
g = sns.barplot(data=data,
                x="word",
                y="frequency",
                ax=ax
                )

# Label the axes object directly instead of relying on pyplot's "current axes".
ax.set_xlabel('Word', fontsize=25)
ax.set_ylabel('Word Frequency', fontsize=25)
ax.set_title('20 Most Common Words in Document', fontsize=30)
ax.tick_params(axis='both', which='major', labelsize=16)

plt.show()

In [None]:

from wordcloud import WordCloud

# Word cloud of the current frequency table: build a {word: frequency} mapping
# and let WordCloud size each word by its count.
freq = word_frequency.set_index("word")["frequency"].to_dict()

fig, ax = plt.subplots(1, 1, figsize=(10, 10))
wc = WordCloud(
    width=800,
    height=700,
    max_words=50,
    background_color="white",
    colormap="BuPu",
    prefer_horizontal=1,
    relative_scaling=0.5,
    random_state=2022,  # fixed seed so the layout is reproducible
)
wc.generate_from_frequencies(freq)
ax.imshow(wc)
ax.axis("off")
plt.show()

In [None]:


import matplotlib.pyplot as plt

# Same word cloud on a black background, limited to 40 words and using a custom font.
freq = word_frequency.set_index("word")["frequency"].to_dict()

fig, ax = plt.subplots(1, 1, figsize=(10, 10))
wc = WordCloud(
    width=800,
    height=700,
    max_words=40,
    background_color="black",
    colormap="Wistia",
    # NOTE(review): assumes a font file named "sans-serif.ttf" is available in the
    # working directory -- confirm it is uploaded before running this cell.
    font_path="sans-serif.ttf",
    prefer_horizontal=1,
    relative_scaling=0.5,
    random_state=2022,  # fixed seed so the layout is reproducible
)
wc.generate_from_frequencies(freq)
ax.imshow(wc)
ax.axis("off")
plt.show()

In [None]:
# Bigram count matrix: same pipeline as the unigram one, but counting 2-word phrases.
ngram_range = (2, 2)
vectorizer = CountVectorizer(
    tokenizer=LemmaTokenizer(),
    stop_words="english",
    ngram_range=ngram_range,
    lowercase=True,
)
X = vectorizer.fit_transform(texts["comments"])
# Expand the count matrix into a DataFrame whose columns are the bigrams.
count_vect_df2 = pd.DataFrame(
    X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
count_vect_df2

In [None]:
# Total count of every 2-word phrase across all comments, most frequent first.
# The "word" column holds the phrase text here.
word_frequency = (
    count_vect_df2
    .sum(axis=0)
    .sort_values(ascending=False)
    .rename_axis("word")
    .reset_index(name="frequency")
)
word_frequency

In [None]:
# Bar chart of the 30 most frequent phrases in the current frequency table.
# matplotlib renamed its bundled "seaborn" style sheet to "seaborn-v0_8" (3.6) and
# later removed the old name, so use whichever name this installation provides.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
fig, ax = plt.subplots(1, 1, figsize=(10, 6))
data = word_frequency.head(30)
g = sns.barplot(data=data,
                x="word",
                y="frequency",
                ax=ax
                )
# Tilt the labels so neighbouring phrases do not overlap.
ax.set_xticklabels(data["word"], ha='right', rotation=45)
plt.show()

In [None]:
# word cloud based on the bi-gram, i.e., 2-word phrase cloud

# The frequency table's columns are "word" and "frequency"; index by "word"
# (there is no "word_frequency" column, so using that name raises KeyError).
freq = word_frequency.set_index("word").to_dict()['frequency']

fig, ax = plt.subplots(1, 1, figsize = (10,10))
wc = WordCloud(background_color="white",
                prefer_horizontal = 1,
                width=800,
                height=700,
                max_words=20,
                relative_scaling=0.5,
                # NOTE(review): assumes a font file named "sans-serif.ttf" is available
                # in the working directory -- confirm before running.
                font_path = "sans-serif.ttf",
                colormap = "Wistia",
                random_state= 2022)  # fixed seed so the layout is reproducible
wc.generate_from_frequencies(freq)
ax.imshow(wc)
ax.axis("off")
plt.show()

In [None]:
# Trigram count matrix: same pipeline again, counting 3-word phrases.
ngram_range = (3, 3)
vectorizer = CountVectorizer(
    tokenizer=LemmaTokenizer(),
    stop_words="english",
    ngram_range=ngram_range,
    lowercase=True,
)
X = vectorizer.fit_transform(texts["comments"])
# Expand the count matrix into a DataFrame whose columns are the trigrams.
count_vect_df3 = pd.DataFrame(
    X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
count_vect_df3

In [None]:
# Total count of every 3-word phrase across all comments, most frequent first.
# The "word" column holds the phrase text here.
word_frequency = (
    count_vect_df3
    .sum(axis=0)
    .sort_values(ascending=False)
    .rename_axis("word")
    .reset_index(name="frequency")
)
word_frequency


In [None]:
# Bar chart of the 30 most frequent phrases in the current frequency table.
# matplotlib renamed its bundled "seaborn" style sheet to "seaborn-v0_8" (3.6) and
# later removed the old name, so use whichever name this installation provides.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
fig, ax = plt.subplots(1, 1, figsize=(10, 6))
data = word_frequency.head(30)
g = sns.barplot(data=data,
                x="word",
                y="frequency",
                ax=ax
                )
# Tilt the labels so neighbouring phrases do not overlap.
ax.set_xticklabels(data["word"], ha='right', rotation=45)
plt.show()

In [None]:
# word cloud based on the tri-gram, i.e., 3-word phrase cloud

# Convert the frequency table to a {phrase: frequency} dictionary. The table's
# columns are "word" and "frequency"; index by "word" (there is no
# "word_frequency" column, so using that name raises KeyError).
freq = word_frequency.set_index("word").to_dict()['frequency']

fig, ax = plt.subplots(1, 1, figsize = (10,10))
wc = WordCloud(background_color="white",
                prefer_horizontal = 1,
                width=800,
                height=700,
                max_words=40,
                relative_scaling=0.5,
                # NOTE(review): assumes a font file named "sans-serif.ttf" is available
                # in the working directory -- confirm before running.
                font_path = "sans-serif.ttf",
                colormap = "Wistia",
                random_state= 2022)  # fixed seed so the layout is reproducible
wc.generate_from_frequencies(freq)
ax.imshow(wc)
ax.axis("off")
plt.show()