# **3. Pre-Processing**

In [None]:
!pip install word2number
!pip install emoji
!pip install pyspellchecker
!pip install wordcloud
!pip install wordninja
!pip install langdetect
!pip install nltk
!pip install -U deep-translator
!pip install emojis
!pip install -U sentence-transformers
!pip install zeugma
!pip install --upgrade category_encoders

## **Imports**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# text pre-processing 
import re, string #library that contains punctuation
from word2number import w2n #Convert numeric words to numbers
import wordninja # Split attached words
from langdetect import detect # Language detection
from langdetect import DetectorFactory # enforce consistent results for lang detection
DetectorFactory.seed = 0 # For consistent language detection
from deep_translator import GoogleTranslator # translator
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords # To remove stopwords
from nltk.tokenize import word_tokenize # Tokenize
nltk.download('punkt')
from nltk.stem.porter import PorterStemmer #Stemming 
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer # Lemmatization
nltk.download('omw-1.4')
from sklearn.feature_extraction.text import TfidfVectorizer # for calculating similarity
from sklearn.metrics.pairwise import cosine_similarity # Cosine similarity
from category_encoders import TargetEncoder

# # To create word cloud
from wordcloud import WordCloud 
from wordcloud import STOPWORDS
from PIL import Image
from wordcloud import ImageColorGenerator

# To check difference between similarity
import scipy.stats as stats
from scipy.stats import skew # Skewness for normality
from scipy.stats import kurtosis # Kurtosis value of the normal distribution
# For word/sentence embeddings
from sentence_transformers import SentenceTransformer
from zeugma.embeddings import EmbeddingTransformer 

## **3 Load Data**

In [None]:
from google.colab import files
uploaded = files.upload()

In [None]:
df=pd.read_csv('data_preprocess.csv')

In [None]:
df.shape

In [None]:
df.iloc[:2, 100:]

## **3.1 Merge Title and Review Body**




**Note:** There are two features that can be used in NLP pre-processing.
* Text: Title and body of review
* Description: Explanation of application

In [None]:
# Build the combined review field 'text': title, a single space, then body
df['text'] = df['title'].add(' ').add(df['body'])
df['text'].head(10)

## **3.2 Create Additional Features Related to Text**

In [None]:
# Surface-level statistics computed the same way for the review text and the
# app description. Each entry maps a column-name prefix to a per-value function;
# values are passed through str() first, so a missing value is measured as 'nan'.
text_feature_builders = [
    # Number of hashtags
    ('num_hashtags', lambda x: str(x).count('#')),
    # Number of mentions
    ('num_ment', lambda x: str(x).count('@')),
    # Number of words
    ('num_word', lambda x: len(str(x).split())),
    # Number of stopwords (wordcloud's STOPWORDS set)
    ('num_s_word', lambda x: sum(w in STOPWORDS for w in str(x).lower().split())),
    # Number of URLs (any token containing 'http', which also covers 'https')
    ('num_url', lambda x: sum('http' in w for w in str(x).lower().split())),
    # Average word length
    ('ave_word', lambda x: np.mean([len(w) for w in str(x).split()])),
    # Number of punctuation characters
    ('num_punc', lambda x: sum(c in string.punctuation for c in str(x))),
]

# Outer loop over features, inner loop over sources, so the new columns are
# appended in the order <feature>_text, <feature>_desc for each feature in turn.
for prefix, builder in text_feature_builders:
    for source_col, suffix in (('text', 'text'), ('description', 'desc')):
        df[f'{prefix}_{suffix}'] = df[source_col].apply(builder)

In [None]:
df.iloc[:2, 114:]

##  **3.3 The Length of Text**

**Note:** The dataframe already has a column named 'length'. I want to check whether it matches the character length of the **text** column.

In [None]:
df['len'] = df['text'].str.len()

In [None]:
((df['length'] - df['len']) != 0).sum()

**Note:** When I merged the 'title' and 'body' columns I added a space, so df['len'] is one character longer than df['length'] in all 16000 cases. Apart from that extra space the two columns are the same, so I can drop the df['len'] column.

In [None]:
# The temporary 'len' column was only needed for the comparison above
df = df.drop(columns='len')

## **3.4 Detect Language**

I will detect the language of both 'review text' and 'app description' features.

### **3.4.1 Language Detection of Review Text**


In [None]:
def detect_language(text):
    """Return the language code langdetect reports for ``text``, or 'Unknown'.

    Some reviews contain only punctuation, emoji, numbers etc. (or are not
    strings at all), in which case ``detect`` raises and no language can be
    assigned. Those rows are labelled 'Unknown'.

    ``Exception`` is caught rather than using a bare ``except`` so that
    interrupting the kernel stops the run instead of silently writing
    'Unknown' and carrying on.
    """
    try:
        return detect(text)
    except Exception:
        return 'Unknown'


# Detect the language of every review text. Building the column in one pass
# works for any index and avoids writing strings into a float NaN column.
df['lang_text'] = df['text'].apply(detect_language)

In [None]:
# Trial / sanity check
# Let's see the 15 most frequent languages detected in the review text
df['lang_text'].value_counts().head(15)

**Note:** A total of 15068 reviews were written in English. I will create a new feature that shows the language of review is in English or not.


In [None]:
# Binary indicator: 1 when the detected review language is English, else 0
is_english_review = df['lang_text'] == 'en'
df['english'] = is_english_review.astype(int)

In [None]:
# Check values of new column
df['english'].value_counts()[1]

In [None]:
# Calculate the percentage of reviews written in English.
# Derived from the data (mean of the 0/1 flag) instead of hard-coded counts,
# so the figure stays correct if the dataset or the detection result changes.
df['english'].mean() * 100

In [None]:
df[['text', 'lang_text']].loc[865]

### **3.4.2 Language Detection of App Description**

In [None]:
# Detect the language of each app description.
# Values are collected in a list and assigned once, which works for any index
# and avoids writing strings into a float NaN column row by row.
lang_desc = []
for description_text in df['description']:
    try:
        lang_desc.append(detect(description_text))
    except Exception:
        # Some descriptions are missing or contain only punctuation, emoji,
        # numbers etc., so the language cannot be detected; label these
        # 'Unknown'. Catching Exception (not a bare except) lets a kernel
        # interrupt stop the loop.
        lang_desc.append('Unknown')

df['lang_desc'] = lang_desc

In [None]:
# Let's see the most frequent languages detected in the app descriptions
df['lang_desc'].value_counts().head()

In [None]:
# Calculate the percentage of app descriptions detected as English.
# Computed from the column instead of hard-coded counts so it cannot go stale.
(df['lang_desc'] == 'en').mean() * 100

## **3.5 Text Cleaning**

In [None]:
# Check these columns
df[['text', 'description']].head()

In [None]:
# Check the types
df[['text', 'description']].dtypes

In [None]:
# Check NaNs
df[['text', 'description']].isna().sum()

**Note:** Converting the object column to string turns NaNs into the string 'nan', so I keep a copy of the original column first.


In [None]:
# Make a copy of df['description'] column
df['orig_description'] = df['description']

In [None]:
# Convert 'description' column to string
df['description'] = df['description'].astype(str)

***Remove Noisy Text***

In [None]:
# Patterns used by text_cleaning, compiled once instead of on every call.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<.*?>')
_BRACKETED_RE = re.compile(r'\[[^]]*\]')
_DIGIT_TOKEN_RE = re.compile(r'\w*\d\w*')
_WHITESPACE_RE = re.compile(r'\s+')
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
# Emoji / pictograph blocks only. A single range from U+24C2 to U+1F251 would
# also delete every CJK, kana and Hangul character, i.e. the non-English text
# that still has to be translated.
_EMOJI_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F000-\U0001F2FF"  # tiles, cards, enclosed characters, flags
    "\U0001F900-\U0001FAFF"  # supplemental symbols & pictographs
    "\U00002600-\U000027BF"  # miscellaneous symbols and dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"             # circled letter M
    "\U0000FE0F\U0000200D"   # variation selector and zero-width joiner
    "]+"
)


# Define a function for removing noisy text
def text_cleaning(text):
    """Return a lower-cased copy of ``text`` with noisy content removed.

    Steps, in order:
      1. lower-case
      2. remove URLs, HTML tags and text in square brackets
      3. remove ASCII punctuation
      4. remove emoji
      5. remove numbers and words that contain digits
      6. collapse all whitespace (including newlines) to single spaces and trim

    Step 2 runs before punctuation stripping because those patterns rely on
    characters such as '<', '/', '.' and '[' that step 3 deletes.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. NaN) yields ''.

    Returns
    -------
    str
        The cleaned text.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Replace with a space rather than '' so neighbouring words are not glued
    # together; the extra spaces are collapsed in the final step.
    text = _URL_RE.sub(' ', text)
    text = _HTML_TAG_RE.sub(' ', text)
    text = _BRACKETED_RE.sub(' ', text)
    text = text.translate(_PUNCT_TABLE)
    text = _EMOJI_RE.sub(' ', text)
    # Also removes plain numbers, since every digit belongs to such a token
    text = _DIGIT_TOKEN_RE.sub('', text)
    # Done last so gaps left by the removals above (and newlines) are tidied up
    return _WHITESPACE_RE.sub(' ', text).strip()

In [None]:
# Clean both text fields with the function defined above
for raw_col, clean_col in (('text', 'text_clean'), ('description', 'description_clean')):
    df[clean_col] = df[raw_col].apply(text_cleaning)

In [None]:
# Check one of the case not in English
df[['text', 'text_clean']].loc[669]

**Note:** The rate of noisy text has been created as a new feature for models.

In [None]:
# Share of characters removed by cleaning, as a percentage of the original 'length'
removed_chars = df['text'].str.len() - df['text_clean'].str.len()
df['remove_rate'] = (removed_chars / df['length'] * 100).round(2)

## **3.6 Translation**

As the previous section shows, some text is not written in English. I will translate the review text and app description into English.

### **3.6.1 Translation of Reviews into English**

In [None]:
# Let's see the languages
df['lang_text'].value_counts().head()

In [None]:
# One translator client is reused for every row instead of creating one per
# review; source='auto' asks the service to infer each review's language.
translator = GoogleTranslator(source='auto', target='en')


def translate_review(cleaned_text):
    """Translate one cleaned review into English.

    Blank strings are returned unchanged so no request is sent for reviews
    whose content was entirely removed by cleaning.
    """
    if not cleaned_text.strip():
        return cleaned_text
    return translator.translate(cleaned_text)


# English reviews keep their cleaned text; every other row is translated.
# Boolean-mask assignment works for any index, not just 0..n-1.
needs_translation = df['lang_text'] != 'en'
df['text_trans'] = df['text_clean']
df.loc[needs_translation, 'text_trans'] = (
    df.loc[needs_translation, 'text_clean'].apply(translate_review)
)

In [None]:
# See some examples from text which are not English.
df[df['lang_text'] != 'en'][['text_clean', 'text_trans']].sample(5)

**Note:** Check whether there is NaNs in translated review.

In [None]:
# Is there any NANs in translated review text.
df[df['text_trans'].isna()][['text',	'text_clean', 'text_trans']]

# There is one case

**Note:** The cleaning process deleted all the thumbs-up emoji in this review but not the kiwi emoji. Let's correct this case (index 12374).

In [None]:
# Reviews whose translation came back missing (the 🥝 emoji-only case at
# index 12374) carry no text, so store an empty string. Selecting by isna()
# rather than a hard-coded row label keeps this correct if the data changes.
df.loc[df['text_trans'].isna(), 'text_trans'] = ''

In [None]:
# Let's check
df[df['text_trans'].isna()][['text',	'text_clean', 'text_trans']]

**Note:** There are no NaNs left in the translated review text.

### **3.6.2 Translation of App Descriptions into English**

In [None]:
# One translator client is reused for every row instead of creating one per
# description; source='auto' asks the service to infer the language.
translator = GoogleTranslator(source='auto', target='en')


def translate_description(cleaned_text):
    """Translate one cleaned app description into English.

    Blank strings are returned unchanged so no request is sent for them.
    """
    # NOTE(review): the translation service may limit the length of a single
    # request - confirm that very long descriptions are handled as expected.
    if not cleaned_text.strip():
        return cleaned_text
    return translator.translate(cleaned_text)


# English descriptions keep their cleaned text; every other row is translated.
# Boolean-mask assignment works for any index, not just 0..n-1.
needs_translation = df['lang_desc'] != 'en'
df['desc_trans'] = df['description_clean']
df.loc[needs_translation, 'desc_trans'] = (
    df.loc[needs_translation, 'description_clean'].apply(translate_description)
)

In [None]:
# Get some examples for checking translation
df[df['lang_desc'] != 'en'][['description_clean', 'desc_trans']].sample(5)

**Note:** There are 148 NaNs (stored as the string 'nan') in the application descriptions. The translation process converts these 'nan' strings to 'in'. I have to replace these 'in' values with '', where '' means there is no description.

In [None]:
# Check the number of 'in' in df
df[df['desc_trans'] =='in'].shape[0]

In [None]:
# Replace 'in' with '' (rows whose translated description is exactly 'in')
is_placeholder = df['desc_trans'] == 'in'
df.loc[is_placeholder, 'desc_trans'] = ''

In [None]:
# Check the number of 'in' in df
print(df[df['desc_trans'] =='in'].shape[0])

# Check NANs in df['desc_trans'] column
df[df['desc_trans']=='']['desc_trans'].shape[0]

## **3.7 Prepare Data to Find Similarity**


### **3.7.1 Remove Stopwords and Short Words**

In [None]:
# Cache for the NLTK stop-word list so the corpus is read once, not per row.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Load NLTK's English stop words on first use and reuse them afterwards."""
    if 'english' not in _STOP_WORD_CACHE:
        # Imported under a private alias so the lookup does not depend on the
        # notebook-level name `stopwords`, which other cells may rebind.
        from nltk.corpus import stopwords as _nltk_stopwords
        _STOP_WORD_CACHE['english'] = frozenset(_nltk_stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def stopwords_shortwords(text, stop_words=None):
    """Remove stop words and one-character tokens from whitespace-separated text.

    Parameters
    ----------
    text : str
        Text to filter. Any non-string value (e.g. NaN or None) yields ''.
    stop_words : collection of str, optional
        Words to drop. Defaults to NLTK's English stop-word list.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''
    if stop_words is None:
        stop_words = _english_stop_words()
    # Short tokens are always dropped, whether or not the text contains any
    # purely alphabetic token.
    kept = [w for w in text.split() if w not in stop_words and len(w) > 1]
    return " ".join(kept)

In [None]:
# Drop stop words and one-character tokens from the translated reviews
df['text_trans'] = df['text_trans'].apply(stopwords_shortwords)

In [None]:
# Drop stop words and one-character tokens from the translated descriptions
df['desc_trans'] = df['desc_trans'].apply(stopwords_shortwords)

### **3.7.2 Tokenization**

In [None]:
# Tokenize each translated column into a '<column>_token' column of token lists
for source_col in ('text_trans', 'desc_trans'):
    df[f'{source_col}_token'] = df[source_col].apply(word_tokenize)

### **3.7.3 Stemming**

In [None]:
# Single Porter stemmer instance shared by every call to stemming()
porter_stemmer = PorterStemmer()


def stemming(text):
    """Return a new list holding the Porter stem of each token in ``text``.

    ``text`` is an iterable of tokens (here, the token lists produced by the
    tokenization step).
    """
    return list(map(porter_stemmer.stem, text))

In [None]:
# Stem the tokens of both columns in place with the function defined above
for token_col in ('text_trans_token', 'desc_trans_token'):
    df[token_col] = df[token_col].apply(stemming)

## **3.8 Calculating Similarity Between Reviews and Descriptions**

In [None]:
def identity_tokenizer(text):
    """Return ``text`` unchanged.

    Passed to TfidfVectorizer as its ``tokenizer`` so that input which is
    already a list of tokens is used as-is instead of being split again.
    """
    return text
# Placeholder stored when TF-IDF cannot build a vocabulary for a pair (e.g.
# both token lists are empty). Cosine similarity of TF-IDF vectors is a number
# between 0 and 1, so these rows can be found later by filtering on 0.001.
EMPTY_PAIR_SIMILARITY = 0.001

# Define TfidfVectorizer. The columns already hold stemmed token lists, so the
# vectorizer must neither tokenize nor lower-case them again.
tfidf = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False)


def pair_similarity(review_tokens, desc_tokens):
    """Return the TF-IDF cosine similarity between one review and its description.

    The vectorizer is re-fitted on just this pair of token lists, so the
    weights are relative to the two documents only.

    Returns ``EMPTY_PAIR_SIMILARITY`` when TfidfVectorizer raises ValueError
    because there is nothing to vectorize. Other errors are not swallowed.
    """
    try:
        tfidf_vectors = tfidf.fit_transform([review_tokens, desc_tokens])
    except ValueError:
        return EMPTY_PAIR_SIMILARITY
    # cosine_similarity returns a 1x1 matrix; store the plain float so the
    # column is numeric from the start.
    return float(cosine_similarity(tfidf_vectors[0], tfidf_vectors[1])[0, 0])


# One similarity per row, pairing each review with its own app description
similarity_list = [
    pair_similarity(review_tokens, desc_tokens)
    for review_tokens, desc_tokens in zip(df['text_trans_token'], df['desc_trans_token'])
]

In [None]:
# Append list as a new column to df
df['similarity'] = similarity_list

In [None]:
# Let's see the cases whose similarity is 0.001
df[df['similarity']==0.001][['text_trans_token', 'desc_trans_token', 'similarity']]

**Note:** In these two cases both the review text and the description are blank.

In [None]:
# See the column
df['similarity'].head()

In [None]:
# Each entry is either a 1x1 array or a plain float. Unwrap it numerically
# instead of going through str(): the printed form of an array is rounded, so
# parsing that text back would lose precision.
df['similarity'] = df['similarity'].apply(lambda x: np.asarray(x, dtype=float).item())
df['similarity'] = df['similarity'].astype(float)

In [None]:
# See the column
df['similarity'].dtype

**Note:** Let's explore the similarity difference between fake and real reviews.

In [None]:
# Descriptive statistics of total reviews
df.groupby(["label"])["similarity"].describe()

In [None]:
# Skewness and kurtosis describe the shape of the similarity distribution
similarity_values = df['similarity']
shape_options = dict(axis=0, bias=True)
print('kurtosis:', kurtosis(similarity_values, **shape_options))
print('skewness:', skew(similarity_values, **shape_options))

In [None]:
# Carry out a two-sample t-test on the similarity score between the two label
# groups. This section analyses 'similarity', so that is the column compared
# here (not 'length').
stats.ttest_ind(df.loc[df['label'] == 0, 'similarity'],
                df.loc[df['label'] == 1, 'similarity'])

## **3.9 Word Cloud**

In [None]:
# Use in google.colab to upload .png 
uploaded = files.upload()

In [None]:
# Generate a word cloud image.
# The stop-word set gets its own name so it does not overwrite the NLTK
# `stopwords` corpus imported at the top, which stopwords_shortwords() needs
# if that cell is run again.
wordcloud_stopwords = set(STOPWORDS)
mask = np.array(Image.open('appstore.png'))
# Skip missing bodies and force str so ' '.join cannot fail on NaN values
review_bodies = ' '.join(df['body'].dropna().astype(str))
wordcloud = WordCloud(stopwords=wordcloud_stopwords, background_color = 'white', mode="RGBA", max_words=1000, mask=mask).generate(review_bodies)

# create coloring from image
image_colors = ImageColorGenerator(mask)
plt.figure(figsize=[10,10])
plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation="bilinear")
plt.axis("off");

In [None]:
# download word cloud image in colab
wordcloud.to_file("word_cloud.png")
files.download('word_cloud.png')

In [None]:
# Saving from a local Jupyter session instead of Colab (kept for reference,
# not executed). Use a path relative to the notebook rather than an absolute,
# machine-specific one:
#
# from IPython.display import FileLink, FileLinks
# df.to_csv('data.csv', index=False)

## **3.10 Word to Vectors**

**Note:** In this part, GloVe word embeddings are used to convert words to vectors. The Zeugma package provides a transformer that performs the conversion.

In [None]:
# define transformer(zeugma)
glove_review = EmbeddingTransformer('glove')

***Transformation of review text***

In [None]:
df['text_trans'].sample(5)

In [None]:
# Convert review text column
X_text_glove = glove_review.transform(df['text_trans']) 

In [None]:
# Keep it into df
df_text_glove = pd.DataFrame(X_text_glove)
df_text_glove.head()

***Transformation of app description***

In [None]:
# Convert description columns
X_desc_glove = glove_review.transform(df['desc_trans'])

In [None]:
# Keep it into df
df_desc_glove = pd.DataFrame(X_desc_glove)
df_desc_glove.head()

In [None]:
# Download both df: the review embeddings and the description embeddings
df_text_glove.to_csv('df_text_glove.csv', encoding = 'utf-8-sig')
df_desc_glove.to_csv('df_desc_glove.csv', encoding = 'utf-8-sig')
files.download('df_text_glove.csv')
files.download('df_desc_glove.csv')

## **3.11 Sentence to Vectors**

**Note:** We use a pre-trained BERT model, which captures the context of a word within a sentence. This is a good choice because BERT's embeddings have been trained on a huge amount of text, far beyond what we could accomplish with this small dataset of reviews. It is therefore much more efficient and accurate.

**Note:** Bricken ([2021](https://bricken.co/nlp_disaster_tweets_2/)) has found that heavy text data cleaning works worse when input into a BERT model because this contextual information is lost. Therefore, we have used the raw form of textual data (‘text’ and ‘description’).

In [None]:
df['text'].sample(5)

In [None]:
# Define Transformer
model = SentenceTransformer('bert-base-nli-mean-tokens')

***Convert review text to vectors***

In [None]:
# Raw review texts as a NumPy array, ready for the sentence encoder
reviews = df['text'].to_numpy()
print(len(reviews))

In [None]:
# Convert sentence to vectors
review_embeddings_base = model.encode(reviews)

In [None]:
# create a df for vectors 
df_text_bert = pd.DataFrame(review_embeddings_base)

***Convert app descriptions to vectors***

In [None]:
# Count missing app descriptions. df['description'] was cast to str earlier,
# which turns NaN into the string 'nan', so the check has to use the untouched
# copy kept in 'orig_description'.
df['orig_description'].isna().sum()

In [None]:
# Get all app descriptions as an array for the sentence encoder.
# Missing descriptions are passed as '' (the notebook's convention for "no
# description") instead of the literal word 'nan' that astype(str) produced.
description = df['orig_description'].fillna('').astype(str).to_numpy()
print(len(description))

In [None]:
# Convert sentence to vectors
description_embeddings_base = model.encode(description)

In [None]:
# create a df for vectors 
df_desc_bert = pd.DataFrame(description_embeddings_base)

In [None]:
# Write both BERT embedding frames to CSV first, then download them from Colab
bert_exports = {
    'df_text_bert.csv': df_text_bert,
    'df_desc_bert.csv': df_desc_bert,
}
for csv_name, frame in bert_exports.items():
    frame.to_csv(csv_name, encoding = 'utf-8-sig')
for csv_name in bert_exports:
    files.download(csv_name)

## **3.12 Data Final Check Before Modelling**

**Note:** This [document](https://docs.google.com/document/d/17Xe9_NeUw4Xr1bXgHiDy1-zQT4eQ-BwDaPqQkAKby-4/edit?usp=sharing) shows the latest version of the features in the dataframe. I checked all dtypes and NaNs, identified the categorical features to encode, and decided which ones to use. I will encode them after the train-test split.

***Edit some columns to use in modellings***

In [None]:
# df['time_diff_release_post'] holds duration strings; keep only the leading
# whitespace-separated token, which is the number of days
print(df['time_diff_release_post'].sample(1))


def _leading_token(duration):
    """Return the first whitespace-separated token of ``duration``."""
    return duration.split()[0]


df['time_diff_release_post'] = df['time_diff_release_post'].apply(_leading_token)

In [None]:
# Cast the day counts to integer, printing the dtype before and after
day_col = 'time_diff_release_post'
print('Before', df[day_col].dtype)
df[day_col] = df[day_col].astype('int')
print('After', df[day_col].dtype)

In [None]:
# df['user_account_usage'] is in seconds
print(df['user_account_usage'].sample(1))
# Convert seconds into whole days (floor division)
seconds_per_day = 86400  # 24 * 60 * 60
df['user_account_usage_days'] = df['user_account_usage'].floordiv(seconds_per_day)

In [None]:
# Check the type
df['user_account_usage'].dtype

In [None]:
# Inspect df['diff_init_curr_2'] before reducing it to its leading token.
# NOTE(review): presumably duration strings whose first token is the day
# count - confirm against the printed sample.
print(df['diff_init_curr_2'].sample(1))
print('Number of NaNs:', df['diff_init_curr_2'].isna().sum())


def _first_token(value):
    """Return the first whitespace-separated token of ``str(value)``.

    A value whose string form has no tokens is returned unchanged. NaN becomes
    the string 'nan' here and is converted back to np.nan below.
    """
    tokens = str(value).split()
    return tokens[0] if tokens else value


# Column-wise apply replaces the element-by-element chained assignment
# (df[col].iloc[r] = ...), which pandas does not guarantee writes back to df.
df['diff_init_curr_2'] = df['diff_init_curr_2'].apply(_first_token)
# Check the type
print(df['diff_init_curr_2'].dtype)
# Convert nan string to np.nan
df['diff_init_curr_2'] = df['diff_init_curr_2'].replace('nan', np.nan)
# Check the number of NaNs
print('Number of NaNs:', df['diff_init_curr_2'].isna().sum())

## Download data for Models Notebook

In [None]:
df.shape

In [None]:
#Download the clean tokenized data
df.to_csv('df_models.csv', encoding = 'utf-8-sig') 
files.download('df_models.csv')