### 1. Import the packages needed for the cleaning process

In [7]:
import re
import pandas as pd
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
import spacy

### 2. Read exported CSV file from scraping process

In [8]:
# Load the CSV exported by the scraping step; the path is relative to the working directory.
REVIEWS_CSV = 'reviews.csv'
df = pd.read_csv(REVIEWS_CSV)

### 3. Keep only English reviews from the "most relevant" source and drop duplicate reviews


In [9]:
# Keep only rows whose language is 'en' and whose source is 'most_relevant',
# order them by app, and retain the first row seen for each reviewId.
is_english = df['lang'] == 'en'
is_most_relevant = df['source'] == 'most_relevant'
df = (
    df[is_english & is_most_relevant]
    .sort_values(['app_name', 'source'], ascending=True)
    .drop_duplicates(subset=['reviewId'], keep='first')
)

In [10]:
# Parse the 'at' column into datetime values so the reviews can be ordered by date.
# `infer_datetime_format` is left out on purpose: pandas 2.0 deprecated it (a strict
# form of format inference is now the default) and passing it only raises a warning.
df['at'] = pd.to_datetime(df['at'])

In [11]:
#app_grouped_df = df.groupby(['app_name','source']).size().reset_index().to_csv('first_check.csv')
    #comment: first check of how many English reviews there are for each app


### 4. Keep 200 newest reviews for each app

In [12]:
# Keep only the newest reviews of each app.
# Sorting once and taking GroupBy.head() avoids GroupBy.apply, whose handling of the
# grouping column is deprecated since pandas 2.2: newer versions leave 'app_name' out
# of the applied frame, so it would be lost after reset_index(drop=True).
# The result keeps the same layout as before: apps in ascending order, and within
# each app the reviews from newest to oldest.
REVIEWS_PER_APP = 200
df = (
    df.sort_values(['app_name', 'at'], ascending=[True, False])
    .groupby('app_name')
    .head(REVIEWS_PER_APP)
    .reset_index(drop=True)
)


In [13]:
#app_grouped_df = df.groupby(['app_name','source']).size().reset_index().to_csv('second_check.csv')
    #comment: second check of how many English reviews there are for each app

In [14]:
#df.to_csv('reviews_en_only.csv')
    #comment: export whole data file to CSV

In [63]:
#small_reviews_df = df[['app_name','at','content','score']].to_csv('small_reviews_en.csv')
    #comment: reduce data frame to relevant columns: 'app_name', 'at', 'content' and 'score' and export as CSV file

### 5. Load the spaCy English model used for lemmatization

In [64]:
# Load spaCy's small English pipeline; `nlp` is what the lemmatization step calls.
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

### 6. Data cleaning for the column 'content'

In [65]:
# Lower-case the review text and collapse every run of whitespace to a single space.
# Missing values (NaN) are turned into '' first: the previous per-row lambda called
# .split() on each value and raised AttributeError as soon as one review was NaN.
df['content_lowercase'] = (
    df['content']
    .fillna('')
    .astype(str)
    .str.lower()
    .str.split()
    .str.join(' ')
)

In [66]:
# Compiled once when the cell runs instead of on every call to remove_emoji().
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs (previously missed)
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A (previously missed)
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # wide catch-all kept unchanged; it also spans non-emoji code points (e.g. CJK)
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):         #remove emojis from column content
    """Return ``text`` with every run of emoji characters deleted.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` without the characters matched by ``_EMOJI_PATTERN``. Nothing is
        inserted in place of a removed run, so the whitespace around it is kept.
    """
    return _EMOJI_PATTERN.sub('', text)
# Strip emojis from the lower-cased text and keep the result in its own column.
df['content_emoji'] = df['content_lowercase'].apply(remove_emoji)

In [67]:
def space(comment):         #do lemmatization in column content
    """Lemmatize ``comment`` with the module-level ``nlp`` pipeline.

    Returns the lemma of every token of the parsed text, joined by single spaces.
    """
    lemmas = (tok.lemma_ for tok in nlp(comment))
    return " ".join(lemmas)
# Lemmatize each emoji-free review; space() is called once per row.
df['content_lemma'] = df['content_emoji'].map(space)

In [68]:
# Remove punctuation: delete every character that is neither a word character nor
# whitespace. `regex=True` is explicit because pandas 2.0 changed the default of
# Series.str.replace to a literal match, which would leave the text untouched.
# The raw string avoids the invalid '\w' / '\s' escape-sequence warnings.
df['content_punct'] = df['content_lemma'].str.replace(r'[^\w\s]', '', regex=True)

  df['content_punct'] = df['content_lemma'].str.replace('[^\w\s]','') #remove punctuation from column content


In [69]:
def _lowercase_words(text):
    """Lower-case each whitespace-separated word and re-join them with single spaces."""
    return " ".join(word.lower() for word in text.split())


# Second lower-casing pass, this time over the punctuation-free text.
df['content_punct'] = df['content_punct'].apply(_lowercase_words)

In [76]:
# Extra stopwords added on top of NLTK's English stopword list.
aditional_stopwords = ["app", "food", "use", "weight", "track", "calorie", "diet", "fitness", "get", "make", "really", "keep", "try", "even", "meal"]
stop = stopwords.words('english')
stop.extend(aditional_stopwords)

# Membership is tested once per token of every review, so look words up in a set
# instead of scanning the whole list each time. `stop` itself stays a list.
stop_lookup = set(stop)

# Drop every stopword from the cleaned text; the remaining words are re-joined with spaces.
df['content_nonstop'] = df['content_punct'].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_lookup)
)

### 7. Export full and smaller data file as CSV 

In [77]:
# Write the whole frame, including every intermediate cleaning column, to disk.
FULL_CLEAN_CSV = 'full_data_clean.csv'
df.to_csv(FULL_CLEAN_CSV)

In [78]:
# Reduced export: app name, date, original text, fully cleaned text and score.
# DataFrame.to_csv returns None when it is given a path, so the column selection is
# bound to the variable first; chaining the two left `small_reviews_clean_df` as None.
small_reviews_clean_df = df[['app_name', 'at', 'content', 'content_nonstop', 'score']]
small_reviews_clean_df.to_csv('small_data_clean.csv')