In [None]:

import time #timer between comment extractions
import os #accessing, saving and storing files
import csv #spreadsheet file manipulation

# Clean text

The comments are structured, but they are currently too "dirty" for analysis, so let's tidy them up.

In [None]:
import pandas as pd #pandas is the main python module for analysing data structured in spreadsheet format
import matplotlib.pyplot as plt #matplotlib is good for generating graphs in your notebook
import re #re allows you to manipulate strings with regrex style manipulation
import string #string manipulation

Copy the path of the _comments version of the data and read it in through the pandas.read_csv function.

Then we check if all is in order.

In [None]:
# Load the raw feedback export into a DataFrame, decoding the file as UTF-8.
feedback_csv_path = "/content/feedback.csv"
reviews = pd.read_csv(feedback_csv_path, encoding="utf-8")

# Preview the first rows to check that the CSV parsed into the expected columns.
reviews.head()

Unnamed: 0,Name,Email,Carcolour,wheeltype,customised,experience,feedback
0,khushi,khushi@gmail.com,black,type2,5,Good,Nice experience. I liked the black colour of c...
1,kj,kj@gmail.com,red,type1,4,Good,Good user experience. I would like to have a r...
2,khushi,khushi@gmail.com,black,default,5,good,New and exciting user experience. I liked the ...
3,Ameya,Amey@gmail.com,blue,default,5,good,Excellent user experience. I would like to hav...
4,abc,abc@gmail.com,black,type3,3,bad,Nice experience. I liked the black colour of c...


In [None]:
# Keep only the free-text 'feedback' column as a one-column DataFrame;
# the cells below work with this column alone.
data = reviews.filter(items=['feedback'])
data.head()

Unnamed: 0,feedback
0,Nice experience. I liked the black colour of c...
1,Good user experience. I would like to have a r...
2,New and exciting user experience. I liked the ...
3,Excellent user experience. I would like to hav...
4,Nice experience. I liked the black colour of c...


In [None]:
# Report the frame's dimensions and column labels as a quick sanity check.
row_count, column_count = data.shape
print("File has {0} rows, {1} columns".format(row_count, column_count))
print("Columns within file " + str(data.columns))

File has 59 rows, 1 columns
Columns within file Index(['feedback'], dtype='object')


In [None]:
#Create a function that removes punctuations and turns all text to lowercase then apply it to every row
def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.

    Parameters
    ----------
    text : str
        Raw comment. Any non-string value (for example the float NaN that
        pandas produces for an empty CSV cell, or None) is treated as an
        empty comment.

    Returns
    -------
    str
        The cleaned text. Removed spans are deleted in place, so runs of
        consecutive spaces can remain where something was stripped out.
    '''
    if not isinstance(text, str):
        # Guard: .lower() below would raise AttributeError on NaN/None.
        return ''
    text = text.lower()
    # Raw strings stop the regex escapes (\[, \w, \d) from being parsed as
    # invalid Python string escapes, which newer interpreters warn about.
    text = re.sub(r'\[.*?\]', '', text)  # non-greedy: drop each [bracketed] span separately
    # string.punctuation only covers ASCII punctuation, so typographic
    # characters such as the curly apostrophe are left in the text.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)  # drop any token that contains a digit
    return text


# Per-value callable handed to Series.apply, which invokes it once for every
# entry of the column.
def round1(x):
    """Run the first cleaning pass on a single comment."""
    return clean_text_round1(x)

# Store the cleaned version of each comment next to the original text.
cleaned_feedback = data['feedback'].apply(round1)
data['clean_text'] = cleaned_feedback
data.head()

Unnamed: 0,feedback,clean_text
0,Nice experience. I liked the black colour of c...,nice experience i liked the black colour of ca...
1,Good user experience. I would like to have a r...,good user experience i would like to have a re...
2,New and exciting user experience. I liked the ...,new and exciting user experience i liked the b...
3,Excellent user experience. I would like to hav...,excellent user experience i would like to have...
4,Nice experience. I liked the black colour of c...,nice experience i liked the black colour of ca...


**Apply stopwords**

The second part of cleaning involves removing stopwords (a.k.a filler words that hold little meaning to computers). 

We will do this by uploading a list of stopwords, applying a function to each row that splits the text into individual words and filtering out the stopwords.

We will then be able to pull out the most common words within the feedback.

In [None]:
from collections import Counter #counts most common words

#Set up stopwords
import nltk
from nltk.corpus import stopwords
# Make sure NLTK's stopword corpus is available locally before reading it.
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
# Held as a set so membership tests are cheap and extra words can be added later.
stop_words = set(english_stopwords)


# Flat, shared list that collects every token from every comment.
words = []


def append(comment):
    """Split one comment on whitespace and add its tokens to the module-level ``words`` list.

    Returns None; the only effect is that ``words`` grows by the comment's tokens,
    in their original order.
    """
    words.extend(comment.split())

def add_words(x):
    """Feed a single cleaned comment into the shared word list."""
    return append(x)

# Walk the cleaned comments in row order; only the side effect on `words` matters.
for cleaned_comment in data['clean_text']:
    add_words(cleaned_comment)

print("Total number of words across all rows:")
print(len(words))

# Keep only the tokens that are not in the stopword set.
filtered = [token for token in words if token not in stop_words]

print("Total number of words AFTER stopwords removal:")
print(len(filtered))


Total number of words across all rows:
1831
Total number of words AFTER stopwords removal:
1041


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Rank the 20 most frequent tokens before and after stopword removal.
top_n = 20
unfiltered_counts = Counter(words)
filtered_counts = Counter(filtered)
count_unfiltered = unfiltered_counts.most_common(top_n)  # stopwords still included
count_words = filtered_counts.most_common(top_n)  # stopwords removed
# Only the filtered ranking is displayed; evaluate count_unfiltered instead to compare.
count_words

[('experience', 74),
 ('would', 64),
 ('colour', 58),
 ('wheels', 49),
 ('car', 46),
 ('dashboard', 44),
 ('kept', 43),
 ('good', 41),
 ('type', 41),
 ('like', 40),
 ('overall', 31),
 ('really', 25),
 ('black', 24),
 ('blue', 24),
 ('default', 24),
 ('user', 23),
 ('liked', 22),
 ('apart', 22),
 ('enjoyed', 22),
 ('definitely', 22)]

**Adjust your stopwords filter for added context**

We can see that removing stopwords makes a big difference, but depending on the context of the body of text, most stopword lists fail to capture additional filler words that hold little meaning.

Now we will look at the most common top words and add any that might hinder our analysis.

In [None]:
# Standard stopword lists rarely catch every uninformative word, so we extend them.
# Collect up to 30 of the most common words as candidates. count_words was built
# with most_common(20), so in practice the cap is not reached.
max_candidates = 30
new_stops = [word for word, _count in count_words[:max_candidates]]
counter = len(new_stops)

print("Counter stopped at:")
print(str(counter))

print("Most common words:")
print(str(new_stops))

# Watch out for alternative apostrophes and unidentifiable symbols.
# Feel free to edit this list by adding your own or copying over from the common words.
# NOTE: the earlier cleaning step strips ASCII punctuation, so the straight-apostrophe
# forms below cannot occur in the cleaned text; the curly-apostrophe forms can.
more_stops = ["it's",'im','lol',"i'm",'got','yeah', "it’s","i’m","its", "also", "etc"]

Counter stopped at:
20
Most common words:
['experience', 'would', 'colour', 'wheels', 'car', 'dashboard', 'kept', 'good', 'type', 'like', 'overall', 'really', 'black', 'blue', 'default', 'user', 'liked', 'apart', 'enjoyed', 'definitely']


In [None]:
print("Current no. of stop words")
print(len(stop_words))
# stop_words is a set, so the extra terms are merged in with set.update();
# words that are already present are simply ignored.
stop_words.update(more_stops)

print("Current no. of stop words after additions")
print(len(stop_words))

print("Total number of words in filtered list:")
print(len(filtered))

# Re-filter the already filtered tokens against the enlarged stopword set.
filtered = [token for token in filtered if token not in stop_words]

print("Total number of words AFTER stopwords removal with additionals added:")
print(len(filtered))

# Spot check: one of the newly added stopwords should no longer be present.
if "it’s" in filtered:
  print("It seems stop words hasn't been updated")

# Refresh the top-20 ranking now that the extra stopwords are gone.
count_words = Counter(filtered).most_common(20)
count_words

Current no. of stop words
179
Current no. of stop words after additions
188
Total number of words in filtered list:
1041
Total number of words AFTER stopwords removal with additionals added:
1003


[('experience', 74),
 ('would', 64),
 ('colour', 58),
 ('wheels', 49),
 ('car', 46),
 ('dashboard', 44),
 ('kept', 43),
 ('good', 41),
 ('type', 41),
 ('like', 40),
 ('overall', 31),
 ('really', 25),
 ('black', 24),
 ('blue', 24),
 ('default', 24),
 ('user', 23),
 ('liked', 22),
 ('apart', 22),
 ('enjoyed', 22),
 ('definitely', 22)]

# Common Words Analysis

The popularity of single words is a useful start, but word pairs are often far more informative about what is being discussed.

Let's look at most popular pairs of words.


In [None]:
import itertools
import collections
#https://stackoverflow.com/questions/54308997/efficient-python-for-word-pair-co-occurrence-counting

def pairwise(iterable):
    """Return a generator over neighbouring items, each pair put in ascending order.

    For s0, s1, s2, ... the pairs (s0, s1), (s1, s2), ... are produced, but the two
    members of every pair are swapped when needed so the smaller one comes first.
    That makes ('a', 'b') and ('b', 'a') count as the same pair. An input with
    fewer than two items produces nothing.
    """
    lead, follow = itertools.tee(iterable)
    next(follow, None)  # shift the second copy one step ahead; no-op on empty input
    neighbours = zip(lead, follow)
    return ((x, y) if x < y else (y, x) for x, y in neighbours)

# Count every neighbouring word pair and show the 30 most frequent.
# NOTE: `filtered` is one flat token list across all comments with stopwords already
# removed, so some pairs span two comments or join words that were not adjacent
# in the original text.
pair_counts = collections.Counter(pairwise(filtered))
pair_counts.most_common(30)

[(('dashboard', 'kept'), 43),
 (('experience', 'would'), 39),
 (('car', 'colour'), 27),
 (('blue', 'colour'), 24),
 (('experience', 'overall'), 24),
 (('type', 'wheels'), 24),
 (('dashboard', 'wheels'), 23),
 (('experience', 'user'), 23),
 (('default', 'wheels'), 23),
 (('experience', 'liked'), 22),
 (('apart', 'really'), 22),
 (('enjoyed', 'really'), 22),
 (('enjoyed', 'overall'), 22),
 (('definitely', 'would'), 22),
 (('car', 'default'), 22),
 (('black', 'colour'), 21),
 (('blue', 'kept'), 21),
 (('apart', 'colour'), 21),
 (('definitely', 'reccommend'), 21),
 (('black', 'liked'), 20),
 (('like', 'would'), 19),
 (('prefer', 'would'), 19),
 (('added', 'features'), 19),
 (('added', 'like'), 19),
 (('like', 'sunroof'), 19),
 (('car', 'type'), 19),
 (('good', 'wheels'), 18),
 (('experience', 'nice'), 17),
 (('dashboard', 'type'), 17),
 (('good', 'would'), 16)]