### 1. Import library

In [1]:
import emoji
from underthesea import word_tokenize, sent_tokenize
from viet_text_tools import normalize_diacritics
import re 
import pandas as pd

import string
import os

### 2. Data Loading 

In [2]:
file_names = ["banphim_comments.csv", "banvedientu_comments.csv", "chuot_comments.csv", "danamthanh_comments.csv", "dienthoai_phukien_comments.csv", "gameconsole_comments.csv",
              "laptop_comments.csv", "linhkienmaytinh_comments.csv", "pc_comments.csv", "maynghenhac_comments.csv", "micro_comments.csv", "tainghecoday_comments.csv",
              "tivi_comments.csv", "sac_comments.csv", "maytinhbang_comments.csv","machdientu_comments.csv","tainghekhongday_comments.csv"]

dir = 'E:\MY CAREER\Internship Project\EDA\dataset\shopee\comment'
dataframes = []
for file in file_names:
    path = os.path.join(dir, file)
    try:
        df = pd.read_csv(path)
        dataframes.append(df)
    except pd.errors.ParserError as e:
        print(f"Error reading file")

combined_df = pd.concat(dataframes, ignore_index=True)
combined_df.head()


Unnamed: 0.1,Unnamed: 0,orderId,itemId,shopId,cmtId,rating,comment,ship_oversea,region,userId,likes
0,0,122029931276914,1546899006,93922606,10092536990,5,"Sản phẩm đẹp, đóng gói cẩn thận, tuy chưa kết ...",False,vn,792910715,25.0
1,1,126678434282902,1546899006,93922606,10515236390,5,Chất lượng sản phẩm:tốt\nTính năng nổi bật:bàn...,False,vn,36121190,32.0
2,2,117545382204991,1546899006,93922606,9644909661,5,Chất lượng sản phẩm:cũng được\nTính năng nổi b...,False,vn,729345506,41.0
3,3,130741239293049,1546899006,93922606,10851288939,5,Tính năng nổi bật:có đèn siêu đẹp\nChất lượng ...,False,vn,343048455,8.0
4,4,130771099281440,1546899006,93922606,10875021785,5,Chất lượng sản phẩm:tốt\nTính năng nổi bật:khô...,False,vn,748343507,4.0


In [3]:
combined_df.columns

Index(['Unnamed: 0', 'orderId', 'itemId', 'shopId', 'cmtId', 'rating',
       'comment', 'ship_oversea', 'region', 'userId', 'likes'],
      dtype='object')

In [4]:
combine = combined_df[['orderId', 'itemId', 'shopId', 'cmtId', 'rating',
       'comment', 'ship_oversea', 'region', 'userId', 'likes']]
combine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234379 entries, 0 to 234378
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   orderId       234379 non-null  int64  
 1   itemId        234379 non-null  int64  
 2   shopId        234379 non-null  int64  
 3   cmtId         234379 non-null  int64  
 4   rating        234379 non-null  int64  
 5   comment       168903 non-null  object 
 6   ship_oversea  234379 non-null  bool   
 7   region        234379 non-null  object 
 8   userId        234379 non-null  int64  
 9   likes         53499 non-null   float64
dtypes: bool(1), float64(1), int64(6), object(2)
memory usage: 16.3+ MB


### 3. Data Cleaning

Xóa các dòng dữ liệu không có comment

In [5]:
combine = combine.dropna(subset='comment')
combine.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168903 entries, 0 to 234369
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   orderId       168903 non-null  int64  
 1   itemId        168903 non-null  int64  
 2   shopId        168903 non-null  int64  
 3   cmtId         168903 non-null  int64  
 4   rating        168903 non-null  int64  
 5   comment       168903 non-null  object 
 6   ship_oversea  168903 non-null  bool   
 7   region        168903 non-null  object 
 8   userId        168903 non-null  int64  
 9   likes         50898 non-null   float64
dtypes: bool(1), float64(1), int64(6), object(2)
memory usage: 13.0+ MB


Xóa các comment bị trùng lặp

In [6]:
combine.duplicated(subset='cmtId').sum()

13030

In [7]:
combine.drop_duplicates(subset='cmtId', inplace=True, keep='last')

In [8]:
combine.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 155873 entries, 390 to 234369
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   orderId       155873 non-null  int64  
 1   itemId        155873 non-null  int64  
 2   shopId        155873 non-null  int64  
 3   cmtId         155873 non-null  int64  
 4   rating        155873 non-null  int64  
 5   comment       155873 non-null  object 
 6   ship_oversea  155873 non-null  bool   
 7   region        155873 non-null  object 
 8   userId        155873 non-null  int64  
 9   likes         47864 non-null   float64
dtypes: bool(1), float64(1), int64(6), object(2)
memory usage: 12.0+ MB


Kiểm tra các comment có emoji, nếu comment có emoji thì xóa 

In [9]:
filtered_data = combine

def check_emoji(text):
    text_without_emoji = emoji.demojize(text)
    return text != text_without_emoji

filtered_data["has_emoji"] = filtered_data["comment"].apply(check_emoji)
filtered_data = filtered_data[filtered_data['has_emoji'] == False]

filtered_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 143367 entries, 390 to 234369
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   orderId       143367 non-null  int64  
 1   itemId        143367 non-null  int64  
 2   shopId        143367 non-null  int64  
 3   cmtId         143367 non-null  int64  
 4   rating        143367 non-null  int64  
 5   comment       143367 non-null  object 
 6   ship_oversea  143367 non-null  bool   
 7   region        143367 non-null  object 
 8   userId        143367 non-null  int64  
 9   likes         42933 non-null   float64
 10  has_emoji     143367 non-null  bool   
dtypes: bool(2), float64(1), int64(6), object(2)
memory usage: 11.2+ MB


Xóa các comment có teencode

In [10]:
with open('teencode.txt',"r", encoding="utf-8") as file:
    teencode_words = [line.strip() for line in file]


def get_lower(text):
        return text.lower().strip()

filtered_data['comment'] = filtered_data["comment"].apply(get_lower)
filtered_data["word_count"] = filtered_data["comment"].apply(lambda x: len(x.split()))
filtered_data["teencode_count"] = filtered_data["comment"].apply(lambda x: sum(1 for word in x.split() if word in teencode_words))

filtered_data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['comment'] = filtered_data["comment"].apply(get_lower)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data["word_count"] = filtered_data["comment"].apply(lambda x: len(x.split()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data["teencode_count"] = filtered_data["

Unnamed: 0,orderId,itemId,shopId,cmtId,rating,comment,ship_oversea,region,userId,likes,has_emoji,word_count,teencode_count
390,113205959156318,3066084859,45654584,9168285316,5,lúc mà xem giá á không nghĩ là hàng lại xịn nh...,False,vn,442214168,17.0,False,71,0
391,115547853262706,3066084859,45654584,9473157284,5,"bàn phím nhạy, phím bấm êm, đèn led lên màu đẹ...",False,vn,844212025,8.0,False,27,0
392,115564971234828,3066084859,45654584,9475329530,5,"bàn phím nhạy khi gõ không gây tiếng ồn, gõ ch...",False,vn,718486275,18.0,False,63,0
393,110991500100515,3066084859,45654584,8984854294,5,"hàng đẹp xịn sò giống hình, chuột cầm nhẹ ta...",False,vn,636299992,2.0,False,34,0
395,89659632262758,3066084859,45654584,6522412723,5,bàn phím và chuột nhấn rất nhẹ nhàng. cảm ơn ...,False,vn,480477564,7.0,False,12,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
234365,132412034223284,15090822609,69687900,11039210979,5,shop phục vụ nhiệt tình chu đáo xử lý đơn hàng...,False,vn,252157545,,False,17,0
234366,117980255227608,15090822609,69687900,9757454532,5,"giao hàng nhanh, đóng gói chỉn chu, cẩn thận, ...",False,vn,440977936,,False,13,0
234367,141364883246851,15090822609,69687900,12034166975,5,"chất lượng sản phẩm:ok\n\nđeo hơi đau tai, gam...",False,vn,437423908,,False,19,1
234368,116948237227127,15090822609,69687900,9619993014,5,sản phẩm này rất tốt cảm ơn shop rất nhiều cho...,False,vn,731941560,,False,22,0


In [11]:
filtered_data['teencode_count'].value_counts()

0     114110
1      20075
2       5783
3       2019
4        736
5        322
6        166
7         71
8         33
10        18
9         16
11         5
13         3
14         2
12         2
17         2
16         1
15         1
37         1
18         1
Name: teencode_count, dtype: int64

In [12]:
# Xóa nếu chất lượng của comment kém (ít hơn 7 từ) và comment không chất lượng (> 50 từ), teencode
filtered_data = filtered_data[(filtered_data["word_count"] > 7)& (filtered_data["word_count"] < 50) & (filtered_data["teencode_count"] < 1)]

In [13]:
filtered_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 91164 entries, 391 to 234369
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   orderId         91164 non-null  int64  
 1   itemId          91164 non-null  int64  
 2   shopId          91164 non-null  int64  
 3   cmtId           91164 non-null  int64  
 4   rating          91164 non-null  int64  
 5   comment         91164 non-null  object 
 6   ship_oversea    91164 non-null  bool   
 7   region          91164 non-null  object 
 8   userId          91164 non-null  int64  
 9   likes           27884 non-null  float64
 10  has_emoji       91164 non-null  bool   
 11  word_count      91164 non-null  int64  
 12  teencode_count  91164 non-null  int64  
dtypes: bool(2), float64(1), int64(8), object(2)
memory usage: 8.5+ MB


In [14]:
# Xóa comment có tồn tại badwords

with open('badwords.txt',"r", encoding="utf-8") as file:
    badwords = [line.strip() for line in file]

filtered_data["badwords_count"] = filtered_data["comment"].apply(lambda x: sum(1 for word in x.split() if word in badwords))
filtered_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data["badwords_count"] = filtered_data["comment"].apply(lambda x: sum(1 for word in x.split() if word in badwords))


Unnamed: 0,orderId,itemId,shopId,cmtId,rating,comment,ship_oversea,region,userId,likes,has_emoji,word_count,teencode_count,badwords_count
391,115547853262706,3066084859,45654584,9473157284,5,"bàn phím nhạy, phím bấm êm, đèn led lên màu đẹ...",False,vn,844212025,8.0,False,27,0,0
393,110991500100515,3066084859,45654584,8984854294,5,"hàng đẹp xịn sò giống hình, chuột cầm nhẹ ta...",False,vn,636299992,2.0,False,34,0,0
395,89659632262758,3066084859,45654584,6522412723,5,bàn phím và chuột nhấn rất nhẹ nhàng. cảm ơn ...,False,vn,480477564,7.0,False,12,0,0
396,83580099086876,3066084859,45654584,5825350201,5,"sản phẩm được đóng gói kỹ càng, giao hàng nhan...",False,vn,435954539,2.0,False,19,0,0
397,87383887377285,3066084859,45654584,6358299273,5,"hàng tốt rất đẹp , hợp túi tiền , đóng gói rât...",False,vn,481142917,0.0,False,17,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234362,114445519203334,15090822609,69687900,9397069957,5,tính năng nổi bật:đèn led rất phù với giới trẻ...,False,vn,687834883,1.0,False,32,0,2
234365,132412034223284,15090822609,69687900,11039210979,5,shop phục vụ nhiệt tình chu đáo xử lý đơn hàng...,False,vn,252157545,,False,17,0,0
234366,117980255227608,15090822609,69687900,9757454532,5,"giao hàng nhanh, đóng gói chỉn chu, cẩn thận, ...",False,vn,440977936,,False,13,0,0
234368,116948237227127,15090822609,69687900,9619993014,5,sản phẩm này rất tốt cảm ơn shop rất nhiều cho...,False,vn,731941560,,False,22,0,0


In [15]:
filtered_data['badwords_count'].value_counts()

0     75291
1     13843
2      1725
3       247
4        35
5        13
6         3
14        2
10        2
20        1
7         1
12        1
Name: badwords_count, dtype: int64

In [16]:
filtered_data = filtered_data[(filtered_data["badwords_count"] < 1)]
filtered_data.head(5)

Unnamed: 0,orderId,itemId,shopId,cmtId,rating,comment,ship_oversea,region,userId,likes,has_emoji,word_count,teencode_count,badwords_count
391,115547853262706,3066084859,45654584,9473157284,5,"bàn phím nhạy, phím bấm êm, đèn led lên màu đẹ...",False,vn,844212025,8.0,False,27,0,0
393,110991500100515,3066084859,45654584,8984854294,5,"hàng đẹp xịn sò giống hình, chuột cầm nhẹ ta...",False,vn,636299992,2.0,False,34,0,0
395,89659632262758,3066084859,45654584,6522412723,5,bàn phím và chuột nhấn rất nhẹ nhàng. cảm ơn ...,False,vn,480477564,7.0,False,12,0,0
396,83580099086876,3066084859,45654584,5825350201,5,"sản phẩm được đóng gói kỹ càng, giao hàng nhan...",False,vn,435954539,2.0,False,19,0,0
397,87383887377285,3066084859,45654584,6358299273,5,"hàng tốt rất đẹp , hợp túi tiền , đóng gói rât...",False,vn,481142917,0.0,False,17,0,0


In [18]:
# Xóa các comment không có likes ( để rút gọn các comment chất lượng")
result = filtered_data.dropna(subset='likes')

### 4. Data Exploring

In [19]:
def convert_unicode(text):
    char1252 = 'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ'
    charutf8 = 'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ'
    char1252 = char1252.split('|')
    charutf8 = charutf8.split('|')
    
    dic = {}
    for i in range(len(char1252)): dic[char1252[i]] = charutf8[i]
    return re.sub(
        r'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ',
        lambda x: dic[x.group()], text
    )



In [20]:
result['comment'] = result['comment'].apply(convert_unicode)

result.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['comment'] = result['comment'].apply(convert_unicode)


Unnamed: 0,orderId,itemId,shopId,cmtId,rating,comment,ship_oversea,region,userId,likes,has_emoji,word_count,teencode_count,badwords_count
391,115547853262706,3066084859,45654584,9473157284,5,"bàn phím nhạy, phím bấm êm, đèn led lên màu đẹ...",False,vn,844212025,8.0,False,27,0,0
393,110991500100515,3066084859,45654584,8984854294,5,"hàng đẹp xịn sò giống hình, chuột cầm nhẹ tay ...",False,vn,636299992,2.0,False,34,0,0
395,89659632262758,3066084859,45654584,6522412723,5,bàn phím và chuột nhấn rất nhẹ nhàng. cảm ơn ...,False,vn,480477564,7.0,False,12,0,0
396,83580099086876,3066084859,45654584,5825350201,5,"sản phẩm được đóng gói kỹ càng, giao hàng nhan...",False,vn,435954539,2.0,False,19,0,0
397,87383887377285,3066084859,45654584,6358299273,5,"hàng tốt rất đẹp , hợp túi tiền , đóng gói rât...",False,vn,481142917,0.0,False,17,0,0


In [21]:
vowels_to_ids = {}
vowels_table = [
    ['a', 'à', 'á', 'ả', 'ã', 'ạ', 'a' ],
    ['ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ', 'aw'],
    ['â', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ', 'aa'],
    ['e', 'è', 'é', 'ẻ', 'ẽ', 'ẹ', 'e' ],
    ['ê', 'ề', 'ế', 'ể', 'ễ', 'ệ', 'ee'],
    ['i', 'ì', 'í', 'ỉ', 'ĩ', 'ị', 'i' ],
    ['o', 'ò', 'ó', 'ỏ', 'õ', 'ọ', 'o' ],
    ['ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ', 'oo'],
    ['ơ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ', 'ow'],
    ['u', 'ù', 'ú', 'ủ', 'ũ', 'ụ', 'u' ],
    ['ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự', 'uw'],
    ['y', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ', 'y' ]
]

for i in range(len(vowels_table)):
    for j in range(len(vowels_table[i]) - 1):
        vowels_to_ids[vowels_table[i][j]] = (i, j)


def is_valid_vietnamese_word(word):
    chars = list(word)
    vowel_indexes = -1
    for index, char in enumerate(chars):
        x, y = vowels_to_ids.get(char, (-1, -1))
        if x != -1:
            if vowel_indexes == -1: vowel_indexes = index
            else:
                if index - vowel_indexes != 1: return False
                vowel_indexes = index
    return True


def standardize_word_typing(word):
    if not is_valid_vietnamese_word(word): return word
    chars = list(word)
    dau_cau = 0
    vowel_indexes = []
    qu_or_gi = False

    for index, char in enumerate(chars):
        x, y = vowels_to_ids.get(char, (-1, -1))
        if x == -1: continue
        elif x == 9:  # check qu
            if index != 0 and chars[index - 1] == 'q':
                chars[index] = 'u'
                qu_or_gi = True
        elif x == 5:  # check gi
            if index != 0 and chars[index - 1] == 'g':
                chars[index] = 'i'
                qu_or_gi = True

        if y != 0:
            dau_cau = y
            chars[index] = vowels_table[x][0]

        if not qu_or_gi or index != 1:
            vowel_indexes.append(index)

    if len(vowel_indexes) < 2:
        if qu_or_gi:
            if len(chars) == 2:
                x, y = vowels_to_ids.get(chars[1])
                chars[1] = vowels_table[x][dau_cau]
            else:
                x, y = vowels_to_ids.get(chars[2], (-1, -1))
                if x != -1: chars[2] = vowels_table[x][dau_cau]
                else: chars[1] = vowels_table[5][dau_cau] if chars[1] == 'i' else vowels_table[9][dau_cau]
            return ''.join(chars)
        return word

    for index in vowel_indexes:
        x, y = vowels_to_ids[chars[index]]
        if x == 4 or x == 8:  # ê, ơ
            chars[index] = vowels_table[x][dau_cau]
            return ''.join(chars)

    if len(vowel_indexes) == 2:
        if vowel_indexes[-1] == len(chars) - 1:
            x, y = vowels_to_ids[chars[vowel_indexes[0]]]
            chars[vowel_indexes[0]] = vowels_table[x][dau_cau]
        else:
            x, y = vowels_to_ids[chars[vowel_indexes[1]]]
            chars[vowel_indexes[1]] = vowels_table[x][dau_cau]
    else:
        x, y = vowels_to_ids[chars[vowel_indexes[1]]]
        chars[vowel_indexes[1]] = vowels_table[x][dau_cau]
    return ''.join(chars)


def standardize_sentence_typing(text):
    words = text.lower().split()
    for index, word in enumerate(words):
        cw = word.split('/')
        cw = [standardize_word_typing(w) for w in cw]

        if len(cw) == 3: cw[1] = standardize_word_typing(cw[1])
        words[index] = ''.join(cw)
    return ' '.join(words)

def remove_punctuation(text):
    punctuation = '!"#$%&\'()*+-/:;<=>?@[\\]^`{|}~'
    translator = str.maketrans('', '', punctuation)
    return text.translate(translator)

In [22]:
def preprocess(text):
    text = text.strip()
    text = remove_punctuation(text)
    text = standardize_sentence_typing(text)
    return text 

result['comment'] = result['comment'].apply(preprocess)
result.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['comment'] = result['comment'].apply(preprocess)


Unnamed: 0,orderId,itemId,shopId,cmtId,rating,comment,ship_oversea,region,userId,likes,has_emoji,word_count,teencode_count,badwords_count
391,115547853262706,3066084859,45654584,9473157284,5,"bàn phím nhaỵ, phím bấm êm, đèn led lên màu đẹ...",False,vn,844212025,8.0,False,27,0,0
393,110991500100515,3066084859,45654584,8984854294,5,"hàng đẹp xịn sò giống hình, chuột cầm nhẹ tay ...",False,vn,636299992,2.0,False,34,0,0
395,89659632262758,3066084859,45654584,6522412723,5,bàn phím và chuột nhấn rất nhẹ nhàng. cảm ơn s...,False,vn,480477564,7.0,False,12,0,0
396,83580099086876,3066084859,45654584,5825350201,5,"sản phẩm được đóng gói kỹ càng, giao hàng nhan...",False,vn,435954539,2.0,False,19,0,0
397,87383887377285,3066084859,45654584,6358299273,5,"hàng tốt rất đẹp , hợp túi tiền , đóng gói rât...",False,vn,481142917,0.0,False,17,0,0


In [23]:
def has_numbers(text):
    return bool(re.search(r'\d', text))

def remove_extra_dots(text):
    return re.sub(r'\.{2,}', '.', text)

with open('stopwords.txt', 'r', encoding='utf-8') as file:
        stopwords = file.read().splitlines()

def remove_stopword(text):
    words = text.split()
    new_words = [word for word in words if word not in stopwords]
    return ' '.join(new_words)


def tokenize(text):
    return re.sub(r'\._+',' . ',word_tokenize(text, format="text"))

def start_processing(text):
    text = remove_extra_dots(text)
    text = tokenize(text)
    text = remove_stopword(text)



In [24]:
result['comment'] = result['comment'].apply(remove_extra_dots)
result.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['comment'] = result['comment'].apply(remove_extra_dots)


Unnamed: 0,orderId,itemId,shopId,cmtId,rating,comment,ship_oversea,region,userId,likes,has_emoji,word_count,teencode_count,badwords_count
391,115547853262706,3066084859,45654584,9473157284,5,"bàn phím nhaỵ, phím bấm êm, đèn led lên màu đẹ...",False,vn,844212025,8.0,False,27,0,0
393,110991500100515,3066084859,45654584,8984854294,5,"hàng đẹp xịn sò giống hình, chuột cầm nhẹ tay ...",False,vn,636299992,2.0,False,34,0,0
395,89659632262758,3066084859,45654584,6522412723,5,bàn phím và chuột nhấn rất nhẹ nhàng. cảm ơn s...,False,vn,480477564,7.0,False,12,0,0


In [25]:
result['comment'] = result['comment'].apply(tokenize)
result.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['comment'] = result['comment'].apply(tokenize)


Unnamed: 0,orderId,itemId,shopId,cmtId,rating,comment,ship_oversea,region,userId,likes,has_emoji,word_count,teencode_count,badwords_count
391,115547853262706,3066084859,45654584,9473157284,5,"bàn phím nhaỵ , phím bấm êm , đèn_led lên màu ...",False,vn,844212025,8.0,False,27,0,0
393,110991500100515,3066084859,45654584,8984854294,5,"hàng đẹp xịn sò giống_hình , chuột cầm nhẹ_tay...",False,vn,636299992,2.0,False,34,0,0
395,89659632262758,3066084859,45654584,6522412723,5,bàn_phím và chuột nhấn rất nhẹ_nhàng . cảm_ơn ...,False,vn,480477564,7.0,False,12,0,0


In [26]:
result['comment'] = result['comment'].apply(remove_stopword)
result.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['comment'] = result['comment'].apply(remove_stopword)


Unnamed: 0,orderId,itemId,shopId,cmtId,rating,comment,ship_oversea,region,userId,likes,has_emoji,word_count,teencode_count,badwords_count
391,115547853262706,3066084859,45654584,9473157284,5,"bàn phím nhaỵ , phím bấm êm , đèn_led màu đẹp ...",False,vn,844212025,8.0,False,27,0,0
393,110991500100515,3066084859,45654584,8984854294,5,"hàng đẹp xịn sò giống_hình , chuột cầm nhẹ_tay...",False,vn,636299992,2.0,False,34,0,0
395,89659632262758,3066084859,45654584,6522412723,5,bàn_phím chuột nhấn nhẹ_nhàng . cảm_ơn shop,False,vn,480477564,7.0,False,12,0,0


In [27]:
result.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22647 entries, 391 to 234369
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   orderId         22647 non-null  int64  
 1   itemId          22647 non-null  int64  
 2   shopId          22647 non-null  int64  
 3   cmtId           22647 non-null  int64  
 4   rating          22647 non-null  int64  
 5   comment         22647 non-null  object 
 6   ship_oversea    22647 non-null  bool   
 7   region          22647 non-null  object 
 8   userId          22647 non-null  int64  
 9   likes           22647 non-null  float64
 10  has_emoji       22647 non-null  bool   
 11  word_count      22647 non-null  int64  
 12  teencode_count  22647 non-null  int64  
 13  badwords_count  22647 non-null  int64  
dtypes: bool(2), float64(1), int64(9), object(2)
memory usage: 2.3+ MB


In [28]:
result = result[~result['comment'].str.contains(r'\d')]

In [29]:
result.columns

Index(['orderId', 'itemId', 'shopId', 'cmtId', 'rating', 'comment',
       'ship_oversea', 'region', 'userId', 'likes', 'has_emoji', 'word_count',
       'teencode_count', 'badwords_count'],
      dtype='object')

In [31]:
result_df = result[['orderId', 'itemId', 'shopId', 'cmtId', 'rating', 'comment',
       'ship_oversea', 'region', 'userId', 'likes']]

result_df.to_csv('cleandata/shopee_comment.csv')

In [30]:
result['likes'] = result['likes'].astype('int')
result = result[result['likes'] != 0]