# 3.0 Clean review text data

## 0. Import libraries

In [1]:
# Basic
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time

import warnings;
warnings.filterwarnings('ignore')

In [2]:
# nlp tools
import string
import nltk
#nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
#nltk.download('wordnet')
#nltk.download('omw-1.4')
from nltk.corpus import stopwords
import re

from wordcloud import WordCloud

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


## 1. Import data and sampling

In [3]:
# Load the scraped reviews from a CSV in the current working directory.
df = pd.read_csv('scraped_data_559k.csv')

In [4]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,recommendationid,review,timestamp_created,voted_up
0,132354840,"Loe, love, LOVE this game.\n\nI purchased it w...",1675747403,True
1,132354525,shouldve been more,1675746707,False
2,132354176,It's fun,1675745970,True
3,132353890,full gen z game.,1675745336,False
4,132353500,it good that is all,1675744599,True


In [5]:
# Inspect dtypes and per-column non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 559123 entries, 0 to 559122
Data columns (total 4 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   recommendationid   559123 non-null  int64 
 1   review             557447 non-null  object
 2   timestamp_created  559123 non-null  int64 
 3   voted_up           559123 non-null  bool  
dtypes: bool(1), int64(2), object(1)
memory usage: 13.3+ MB


## 3. Data Cleaning

### 3.1 Remove nulls and duplicates

**Drop nulls**

In [6]:
# Count missing (True) versus present (False) review texts.
df['review'].isna().value_counts()

False    557447
True       1676
Name: review, dtype: int64

In [7]:
# Remove every row that has no review text; the frame is modified in place.
df.dropna(subset=['review'], inplace=True)

In [8]:
# Re-run the null count to confirm no missing reviews remain.
df['review'].isna().value_counts()

False    557447
Name: review, dtype: int64

**Drop duplicates**

In [9]:
# Count review texts that repeat an earlier row (True) versus first occurrences (False).
df['review'].duplicated().value_counts()

False    465791
True      91656
Name: review, dtype: int64

In [10]:
# Keep only the first occurrence of each distinct review text (in place).
df.drop_duplicates(subset=['review'], keep='first', inplace=True)

In [11]:
# Re-run the duplicate count to confirm every remaining review text is unique.
df['review'].duplicated().value_counts()

False    465791
Name: review, dtype: int64

**Convert timestamp_created to datetime**

In [12]:
# Interpret the integer column as Unix epoch seconds, then truncate each value
# to midnight so only the calendar date remains. `.dt.normalize()` keeps the
# column as datetime64 and avoids a round trip through Python `date` objects.
df['timestamp_created'] = pd.to_datetime(df['timestamp_created'], unit='s').dt.normalize()
df.head()

Unnamed: 0,recommendationid,review,timestamp_created,voted_up
0,132354840,"Loe, love, LOVE this game.\n\nI purchased it w...",2023-02-07,True
1,132354525,shouldve been more,2023-02-07,False
2,132354176,It's fun,2023-02-07,True
3,132353890,full gen z game.,2023-02-07,False
4,132353500,it good that is all,2023-02-07,True


### 3.2 Pipeline to remove stopwords, punctuation and formatting

**Remove formatting**

In [13]:
def remove_markdown(input_string):
    """Strip square-bracketed tags (e.g. ``[b]``, ``[/h1]``, ``[*]``) from text.

    Parameters
    ----------
    input_string : object
        Value to clean; it is coerced with ``str()`` first, so non-string
        inputs are accepted.

    Returns
    -------
    str
        The text with every ``[...]`` span removed and all whitespace runs
        (including newlines) collapsed to single spaces, with no leading or
        trailing whitespace.
    """
    text = str(input_string)
    # Replace each tag with a space rather than nothing, so that the words on
    # either side of a tag are not fused together ("[*]one[*]two" -> "one two").
    # Any surplus spaces this introduces are collapsed by the split/join below.
    without_tags = re.sub(r'\[.*?\]', ' ', text)
    return ' '.join(without_tags.split())

**Remove punctuation**

In [14]:
def remove_puns(input_string):
    """Delete ASCII punctuation from a string and normalise its whitespace.

    Every character listed in ``string.punctuation`` is removed outright (it is
    not replaced by a space), after which runs of whitespace are collapsed to
    single spaces and leading/trailing whitespace is dropped.
    """
    deletion_table = str.maketrans('', '', string.punctuation)
    stripped = input_string.translate(deletion_table)
    return ' '.join(stripped.split())

**Remove stopwords**

In [15]:
# Build the stopword set used by the cleaning pipeline: NLTK's English list,
# plus the same words with apostrophes removed (e.g. "dont"), plus a few
# hand-picked apostrophe-free contractions that the NLTK list does not cover.
stopwords_default = stopwords.words('english')
stopwords_custom = [word.replace("'", '') for word in stopwords_default] + [
    'theyll', 'theyd', 'theyre', 'woulda', 'wouldve', 'ive', 'imma', 'itll', 'youll',
]
stopwords_merged = set(stopwords_default) | set(stopwords_custom)

In [16]:
def remove_stopwords(input_string):
    """Return ``input_string`` with stopwords filtered out.

    The text is split on single spaces; a word is dropped when its lower-cased
    form is in ``stopwords_merged``. Surviving words keep their original casing
    and are re-joined with single spaces.
    """
    kept = []
    for token in input_string.split(' '):
        if token.lower() in stopwords_merged:
            continue
        kept.append(token)
    return ' '.join(kept)  # return `kept` instead if a list is needed

**Remove non-Latin characters**

In [17]:
from nltk.tokenize import RegexpTokenizer
# Compiled once so the pattern is not rebuilt for every review.
_ALNUM_RUN = re.compile(r'[a-zA-Z0-9]+')


def remove_non_latin(input_string):
    """Lower-case the text and split it into runs of ASCII letters/digits.

    Parameters
    ----------
    input_string : str
        Text to tokenize.

    Returns
    -------
    list of str
        Each maximal run of ``a-z`` / ``0-9`` characters in the lower-cased
        input, in order. Every other character acts as a separator and is
        discarded.

    Notes
    -----
    Accented letters are not in the accepted range, so they split words rather
    than being transliterated (e.g. ``"ótimo"`` yields ``["timo"]``).
    """
    return _ALNUM_RUN.findall(input_string.lower())

**Lemmatizing**

In [18]:
def lemmatize(input_list):
    """Lemmatize each token with NLTK's ``WordNetLemmatizer``.

    No part-of-speech argument is passed, so the lemmatizer's default is used
    for every token. Returns a new list of the same length and order as
    ``input_list``.
    """
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token) for token in input_list]

**Combine together**

In [24]:
def review_cleaning(input_string):
    """Run one raw review through the whole text-cleaning pipeline.

    Steps, in order: strip bracketed tags, delete ASCII punctuation, drop
    stopwords, tokenize into lower-case ASCII alphanumeric runs, lemmatize.
    The resulting tokens are joined with single spaces, so a review with no
    surviving tokens comes back as an empty string.
    """
    text = remove_stopwords(remove_puns(remove_markdown(input_string)))
    tokens = lemmatize(remove_non_latin(text))
    return ' '.join(tokens)

In [25]:
%%time
# Clean every review with the pipeline above; the raw text in `review` is overwritten.
df['review'] = df['review'].apply(review_cleaning)

CPU times: total: 48.9 s
Wall time: 1min 8s


In [26]:
# Spot-check the first ten cleaned reviews.
df.head(10)

Unnamed: 0,recommendationid,review,timestamp_created,voted_up
0,132354840,loe love love game purchased sale would recomm...,2023-02-07,True
1,132354525,,2023-02-07,False
2,132354176,fun,2023-02-07,True
3,132353890,full gen z game,2023-02-07,False
4,132353500,good,2023-02-07,True
5,132352818,captivating like movie,2023-02-07,True
6,132352029,adorei timo jogo uma pena que n o teve um lan ...,2023-02-07,True
7,132351754,fun pretty good,2023-02-07,True
8,132351266,got flaw game week played 40 hour case gta 5 g...,2023-02-07,True
9,132349876,favourite part keanu reef said cybering time c...,2023-02-07,True


In [30]:
# Reviews that were cleaned down to an empty string are marked as missing.
# A single `.loc` assignment is used instead of chained indexing
# (`df.review[mask] = ...`), which is not guaranteed to write into `df` and is
# a no-op under pandas copy-on-write. `.str.len()` also tolerates NaN, so the
# cell can be re-run safely, and `np.nan` replaces the `np.NaN` alias that was
# removed in NumPy 2.0.
df.loc[df['review'].str.len() == 0, 'review'] = np.nan

In [31]:
# Number of reviews currently marked as missing.
df.review.isna().sum()

3209

In [33]:
# Drop the rows whose review is missing. This has to be done on the DataFrame:
# `df.review.dropna(inplace=True)` only shrinks the extracted Series and leaves
# the rows in `df`, so they would still be written out by the export.
df.dropna(subset=['review'], inplace=True)

**Export**

In [34]:
# Write the cleaned frame to a CSV in the working directory, omitting the index.
df.to_csv('reviews_cleaned.csv', index=False)